diff --git a/.gitignore b/.gitignore
index 95d0f4204..d251400b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,6 @@ __pycache__
*.pth
UCF-101/
results/
-vae
build/
opensora.egg-info/
wandb/
@@ -22,10 +21,69 @@ wandb/
cache_dir/
wandb/
test*
-sample_videos/
+sample_video*/
512*
720*
1024*
debug*
private*
-caption*
\ No newline at end of file
+caption*
+.deepspeed_env
+256*
+sample_image*/
+taming*
+*test*
+sft*
+flash*
+65x256*
+alpha_vae
+*node*
+cache/
+Open-Sora-Plan_models/
+sample_image*cfg*
+*tmp*
+*pymp*
+check.py
+bucket.py
+whileinf.py
+validation_dir/
+runs/
+samples/
+inpaint*/
+bs32x8x1*
+bs4x8x16_*
+*.zip
+*validation/
+bs1x8x32*
+bs16x8x1*
+bs8x8x2*
+bs8x8x1*
+bs8x8x8*
+bs1x8x16*
+checklora.py
+dim4todim8.py
+*vae8_any*320x320*
+training_log*txt
+filter_motion*
+json2*.py
+motionfun*
+res_dist*
+filter_json_aes_m*
+stage2*.json
+kernel_meta
+ge_check_op.json
+WFVAE_DISTILL_FORMAL
+read_video*
+bs32x8x2*
+json2json*
+makenpu_json*
+
diff --git a/README.md b/README.md
index 2b5635b8d..2e23ad8b9 100644
--- a/README.md
+++ b/README.md
@@ -3,13 +3,12 @@
[[Project Page]](https://pku-yuangroup.github.io/Open-Sora-Plan/) [[中文主页]](https://pku-yuangroup.github.io/Open-Sora-Plan/blog_cn.html)
-->
-[![slack badge](https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&)](https://discord.gg/vqGmpjkSaz)
+
+[![slack badge](https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&)](https://discord.gg/YtsBNg7n)
[![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/issues/53#issuecomment-1987226516)
-[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1763476690385424554?s=20)
-[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0)
-[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0)
-[![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512)
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb)
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0)
+[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1795018003345510687)
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0)
[![License](https://img.shields.io/badge/License-MIT-yellow)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/LICENSE)
[![GitHub repo contributors](https://img.shields.io/github/contributors-anon/PKU-YuanGroup/Open-Sora-Plan?style=flat&label=Contributors)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/graphs/contributors)
[![GitHub Commit](https://img.shields.io/github/commit-activity/m/PKU-YuanGroup/Open-Sora-Plan?label=Commit)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/commits/main/)
@@ -21,28 +20,154 @@
[![GitHub repo watchers](https://img.shields.io/github/watchers/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Watchers)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/watchers)
[![GitHub repo size](https://img.shields.io/github/repo-size/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Repo%20Size)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/archive/refs/heads/main.zip)
-We are thrilled to present **Open-Sora-Plan v1.0.0**, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.0.0.md). We are training for higher resolution (>1024) as well as longer duration (>10s) videos, here is a preview of the next release. We show compressed .gif on GitHub, which loses some quality.
+
+v1.0.0 badge
+[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1763476690385424554?s=20)
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0)
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0)
+[![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb)
+
+
+We are thrilled to present **Open-Sora-Plan v1.1.0**, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.1.0.md). We show compressed .gifs on GitHub, which lose some quality.
+
+Thanks to the **HUAWEI Ascend Team** for supporting us. In the second stage, we used Huawei Ascend compute for training, and all training and inference in this stage were fully supported by Huawei. Models trained on Huawei Ascend can also be loaded onto GPUs and generate videos of the same quality.
+
+Full training and inference are now supported on domestic AI computing systems (Huawei Ascend; we look forward to more domestic compute chips). In the second stage of the project, all training and inference tasks were fully supported by the Huawei Ascend computing system. Moreover, models trained on a 512-card Huawei Ascend cluster can run seamlessly on GPUs while maintaining the same video quality. For details, please refer to our [hw branch](https://github.com/PKU-YuanGroup/Open-Sora-Plan/tree/hw).
+
+
+### 221×512×512 Text-to-Video Generation
+
+| 3D animation of a small, round, fluffy creature with big, expressive eyes explores ... | A single drop of liquid metal falls from a floating orb, landing on a mirror-like ... | The video presents an abstract composition centered around a hexagonal shape adorned ... |
+| --- | --- | --- |
+| | | |
+| A drone camera circles around a beautiful historic church built on a rocky outcropping ... | Aerial view of Santorini during the blue hour, showcasing the stunning architecture ... | An aerial shot of a lighthouse standing tall on a rocky cliff, its beacon cutting ... |
+| | | |
+| A snowy forest landscape with a dirt road running through it. The road is flanked by ... | Drone shot along the Hawaii jungle coastline, sunny day. Kayaks in the water. | The camera rotates around a large stack of vintage televisions all showing different ... |
+| | | |
+
+
+### 65×512×512 Text-to-Video Generation
+
+| In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two ... | A Shiba Inu dog wearing a beret and black turtleneck. | A painting of a boat on water comes to life, with waves crashing and the boat becoming ... |
+| --- | --- | --- |
+| | | |
+| A person clad in a space suit with a helmet and equipped with a chest light and arm ... | 3D animation of a small, round, fluffy creature with big, expressive eyes explores a ... | In a studio, there is a painting depicting a ship sailing through the rough sea. |
+| | | |
+| A robot dog trots down a deserted alley at night, its metallic paws clinking softly ... | A lone surfer rides a massive wave, skillfully maneuvering through the surf. The water ... | A solitary cheetah sprints across the savannah, its powerful muscles propelling it ... |
+| | | |
+
+### 65×512×512 Video Editing
+
+| Generated | | | |
+| --- | --- | --- | --- |
+| Edited | | | |
+
+### 512×512 Text-to-Image Generation
+
+## 📰 News
+
+**[2024.05.27]** 🚀🚀🚀 We are launching Open-Sora Plan v1.1.0, which significantly improves video quality and length, and is fully open source! Please check out our latest [report](docs/Report-v1.1.0.md). Thanks to [ShareGPT4Video](https://sharegpt4video.github.io/) for its capability to annotate long videos.
+
+**[2024.04.09]** 🚀 Excited to share our latest exploration on metamorphic time-lapse video generation: [MagicTime](https://github.com/PKU-YuanGroup/MagicTime), which learns real-world physics knowledge from time-lapse videos. Here is the training dataset (continuously updated): [Open-Sora-Dataset](https://github.com/PKU-YuanGroup/Open-Sora-Dataset).
+
+**[2024.04.07]** 🔥🔥🔥 Today, we are thrilled to present Open-Sora-Plan v1.0.0, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.0.0.md). Thanks to HUAWEI NPU for supporting us.
-Thanks to **HUAWEI Ascend NPU Team** for supporting us.
+**[2024.03.27]** 🚀🚀🚀 We release the report of [VideoCausalVAE](docs/CausalVideoVAE.md), which supports both images and videos. We present our reconstructed video in this demonstration as follows. The text-to-video model is on the way.
-目前已支持国产AI芯片(华为昇腾,期待更多国产算力芯片)进行推理,下一步将支持国产算力训练,具体可参考昇腾分支[hw branch](https://github.com/PKU-YuanGroup/Open-Sora-Plan/tree/hw).
+
+View more
+
+**[2024.03.10]** 🚀🚀🚀 This repo supports training a latent size of 225×90×90 (t×h×w), which means we are able to **train 1 minute of 1080P video with 30FPS** (2× interpolated frames and 2× super resolution) under class-condition.
-| 257×512×512 (10s) | 65×1024×1024 (2.7s) | 65×1024×1024 (2.7s) |
-| --- | --- | --- |
-| | | |
-| Time-lapse of a coastal landscape transitioning from sunrise to nightfall... | A quiet beach at dawn, the waves gently lapping at the shore and the sky painted in pastel hues....|Sunset over the sea. |
+**[2024.03.08]** We support the training code for text conditioning with 16 frames of 512x512. The code is mainly borrowed from [Latte](https://github.com/Vchitect/Latte).
+**[2024.03.07]** We support training with 128 frames (when sample rate = 3, which is about 13 seconds) of 256x256, or 64 frames (which is about 6 seconds) of 512x512.
-| 65×512×512 (2.7s) | 65×512×512 (2.7s) | 65×512×512 (2.7s) |
-| --- | --- | --- |
-| | | |
-| A serene underwater scene featuring a sea turtle swimming... | Yellow and black tropical fish dart through the sea. | a dynamic interaction between the ocean and a large rock... |
-| | | |
-| The dynamic movement of tall, wispy grasses swaying in the wind... | Slow pan upward of blazing oak fire in an indoor fireplace. | A serene waterfall cascading down moss-covered rocks... |
+**[2024.03.05]** See our latest [todo](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#todo), pull requests are welcome.
+**[2024.03.04]** We re-organize and modularize our code to make it easy to [contribute](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#how-to-contribute-to-the-open-sora-plan-community) to the project; to contribute, please see the [Repo structure](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#repo-structure).
+**[2024.03.03]** We open some [discussions](https://github.com/PKU-YuanGroup/Open-Sora-Plan/discussions) to clarify several issues.
+**[2024.03.01]** Training code is available now! Learn more on our [project page](https://pku-yuangroup.github.io/Open-Sora-Plan/). Please feel free to watch 👀 this repository for the latest updates.
+
## 💪 Goal
This project aims to create a simple and scalable repo, to reproduce [Sora](https://openai.com/sora) (OpenAI, but we prefer to call it "ClosedAI"). We hope the open-source community will contribute to this project. Pull requests are welcome!!!
@@ -66,30 +191,8 @@ Project stages:
-## 📰 News
-
-**[2024.04.09]** 🚀 Excited to share our latest exploration on metamorphic time-lapse video generation: [MagicTime](https://github.com/PKU-YuanGroup/MagicTime), which learns real-world physics knowledge from time-lapse videos. Here is the dataset for train (updating): [Open-Sora-Dataset](https://github.com/PKU-YuanGroup/Open-Sora-Dataset).
-
-**[2024.04.07]** 🔥🔥🔥 Today, we are thrilled to present Open-Sora-Plan v1.0.0, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.0.0.md). Thanks to HUAWEI NPU for supporting us.
-
-**[2024.03.27]** 🚀🚀🚀 We release the report of [VideoCausalVAE](docs/CausalVideoVAE.md), which supports both images and videos. We present our reconstructed video in this demonstration as follows. The text-to-video model is on the way.
-
-**[2024.03.10]** 🚀🚀🚀 This repo supports training a latent size of 225×90×90 (t×h×w), which means we are able to **train 1 minute of 1080P video with 30FPS** (2× interpolated frames and 2× super resolution) under class-condition.
-
-**[2024.03.08]** We support the training code of text condition with 16 frames of 512x512. The code is mainly borrowed from [Latte](https://github.com/Vchitect/Latte).
-
-**[2024.03.07]** We support training with 128 frames (when sample rate = 3, which is about 13 seconds) of 256x256, or 64 frames (which is about 6 seconds) of 512x512.
-
-**[2024.03.05]** See our latest [todo](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#todo), pull requests are welcome.
-
-**[2024.03.04]** We re-organize and modulize our code to make it easy to [contribute](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#how-to-contribute-to-the-open-sora-plan-community) to the project, to contribute please see the [Repo structure](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#repo-structure).
-
-**[2024.03.03]** We open some [discussions](https://github.com/PKU-YuanGroup/Open-Sora-Plan/discussions) to clarify several issues.
-
-**[2024.03.01]** Training code is available now! Learn more on our [project page](https://pku-yuangroup.github.io/Open-Sora-Plan/). Please feel free to watch 👀 this repository for the latest updates.
-
-
-## ✊ Todo
+
+✊ Todo
#### Setup the codebase and train an unconditional model on landscape dataset
- [x] Fix typos & Update readme. 🤝 Thanks to [@mio2333](https://github.com/mio2333), [@CreamyLong](https://github.com/CreamyLong), [@chg0901](https://github.com/chg0901), [@Nyx-177](https://github.com/Nyx-177), [@HowardLi1984](https://github.com/HowardLi1984), [@sennnnn](https://github.com/sennnnn), [@Jason-fan20](https://github.com/Jason-fan20)
@@ -159,10 +262,14 @@ Project stages:
- [x] Train with T5 conditioning.
- [ ] Train with CLIP conditioning.
- [ ] Train with CLIP + T5 conditioning (probably costly during training and experiments).
+- [ ] Support Chinese. ⌛ [WIP]
#### Control model with more condition
- [ ] Incorporating [ControlNet](https://github.com/lllyasviel/ControlNet). ⌛ [WIP] 🙏 **[Need your contribution]**
+- [ ] Incorporating [ReVideo](https://github.com/MC-E/ReVideo). ⌛ [WIP]
+
+
## 📂 Repo structure (WIP)
```
├── README.md
@@ -224,12 +331,26 @@ pip install -e '.[dev]'
#### Gradio Web UI
+We highly recommend trying out our web demo with the following command. We also provide an [online demo](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0).
+
+
+v1.0.0
+
Highly recommend trying out our web demo by the following command. We also provide [online demo](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) and [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0) in Huggingface Spaces.
🤝 Enjoying the [![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512) and [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb), created by [@camenduru](https://github.com/camenduru), who generously supports our research!
+
+
+For 65-frame generation:
+
```bash
-python -m opensora.serve.gradio_web_server
+python -m opensora.serve.gradio_web_server --version 65x512x512
+```
+
+For 221-frame generation:
+```bash
+python -m opensora.serve.gradio_web_server --version 221x512x512
```
#### CLI Inference
@@ -244,7 +365,7 @@ Refer to [Data.md](docs/Data.md)
### Evaluation
Refer to the document [EVAL.md](docs/EVAL.md).
-### Causal Video VAE
+### CausalVideoVAE
#### Reconstructing
@@ -258,8 +379,6 @@ Parameter explanation:
- `--enable_tiling`: This flag enables tiled convolution.
-- `--enable_time_chunk`: This parameter is a flag to enable a time chunking. This will block the video in the temporal dimension and reconstruct the long video. This is only an operation performed in the video space, not the latent space, and cannot be used for training.
-
#### Training and Eval
Please refer to the document [CausalVideoVAE](docs/Train_And_Eval_CausalVideoVAE.md).
@@ -268,20 +387,62 @@ Please refer to the document [CausalVideoVAE](docs/Train_And_Eval_CausalVideoVAE
Please refer to the document [VQVAE](docs/VQVAE.md).
-### Video Diffusion Transformer
-#### Training
+### Text-to-Video training
+
```
-sh scripts/text_condition/train_videoae_17x256x256.sh
+bash scripts/text_condition/gpu/train_t2v.sh
```
+
+We introduce some key parameters below so you can customize your training process.
+
+#### Training size
+To train videos of different resolutions and durations, adjust `--num_frames xx`, `--max_height xxx` and `--max_width xxx`.
+
+#### Data processing
+You specify your training data using `--data /path/to/data.txt`. For more information, please refer to the [documentation]().
+
+If motion in the training data is slow, you can specify `--speed_factor 1.25` to speed the videos up by 1.25×.
+
+If you do not want to train on videos of dynamic durations, set `--drop_short_ratio 1.0` to discard all video data with frame counts not equal to `--num_frames`.
+
+If you want to train with videos of dynamic durations, we highly recommend specifying `--group_frame` as well. It improves computational efficiency during training.
+
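+As a rough illustration of what frame grouping does, here is a minimal, hypothetical sketch (function and field names are ours, not the repository's): clips with the same frame count are bucketed together so that no batch mixes lengths, which avoids padding waste.
+
+```python
+from collections import defaultdict
+
+def group_by_frames(samples, batch_size):
+    """Bucket clips by frame count so no batch mixes lengths."""
+    buckets = defaultdict(list)
+    for s in samples:
+        buckets[s["num_frames"]].append(s)
+    batches = []
+    for clips in buckets.values():
+        for i in range(0, len(clips), batch_size):
+            batches.append(clips[i:i + batch_size])
+    return batches
+
+samples = [{"num_frames": n} for n in (65, 65, 93, 65, 93)]
+print([len(b) for b in group_by_frames(samples, 2)])  # [2, 1, 2]
+```
+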
+#### Multi-stage transfer learning
+Suppose you have trained a 240p base model (`--max_height 240`, `--max_width 320`, `--interpolation_scale_h 1.0`, `--interpolation_scale_w 1.0`) and want to initialize a higher-resolution model such as 480p (640×480) from its weights. Adjust `--max_height 480`, `--max_width 640`, `--interpolation_scale_h 2.0`, `--interpolation_scale_w 2.0`, and set `--pretrained` to your 240p weights path (path/to/240p/xxx.safetensors).
+
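+Following the example above, the interpolation scale tracks the ratio of the target resolution to the base training resolution; a quick worked check:
+
+```python
+# Base stage: 240x320; target stage: 480p (640x480).
+base_h, base_w = 240, 320
+target_h, target_w = 480, 640
+print(target_h / base_h)  # 2.0 -> --interpolation_scale_h 2.0
+print(target_w / base_w)  # 2.0 -> --interpolation_scale_w 2.0
+```
+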
+#### Load weights
+We have two ways to load weights: `--pretrained path/to/240p/xxx.safetensors` and `--resume_from_checkpoint /path/to/output_dir`. If both are specified, the latter will override the former.
+
+**For `--pretrained`**, this is typically used for loading pretrained weights across stages, such as using 240p weights to initialize 480p training, or when switching datasets and you do not want to carry over the previous optimizer state.
+
+**For `--resume_from_checkpoint`**, it will resume the training process from the latest checkpoint in `--output_dir`. Typically, we set `--resume_from_checkpoint="latest"`, which is useful in cases of unexpected interruptions during training.
+
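+A minimal sketch of the precedence described above (a hypothetical helper; the checkpoint directory naming is an assumption):
+
+```python
+import os
+
+def resolve_init(pretrained=None, resume_from_checkpoint=None, output_dir="."):
+    """--resume_from_checkpoint overrides --pretrained if both are set."""
+    if resume_from_checkpoint:
+        if resume_from_checkpoint == "latest":
+            ckpts = sorted(d for d in os.listdir(output_dir)
+                           if d.startswith("checkpoint"))
+            if not ckpts:
+                return ("scratch", None)
+            return ("resume", os.path.join(output_dir, ckpts[-1]))
+        return ("resume", resume_from_checkpoint)
+    if pretrained:
+        return ("pretrained", pretrained)  # weights only, fresh optimizer state
+    return ("scratch", None)
+```
+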
+#### Sequence Parallelism
+`--sp_size 8 --train_sp_batch_size 2` means running a batch size of 2 across 8 GPUs (on the same node).
+
+### Text-to-Video inference
+
+#### 1 GPU
+If you only have one GPU, it will perform inference on each sample sequentially, one at a time.
+```
+bash scripts/text_condition/gpu/sample_t2v.sh
+```
+
+#### Multi-GPUs
+If you want to batch-infer a large number of samples, each GPU infers one sample at a time.
```
-sh scripts/text_condition/train_videoae_65x256x256.sh
+bash scripts/text_condition/gpu/sample_t2v_ddp.sh
```
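+
+Under the hood, this kind of data-parallel sampling amounts to sharding the prompt list across ranks; a minimal sketch of the idea (illustrative only, not the script's actual code):
+
+```python
+def shard_prompts(prompts, rank, world_size):
+    """Rank r takes every world_size-th prompt, so each GPU
+    infers its own samples independently."""
+    return prompts[rank::world_size]
+
+# e.g. 8 GPUs, 100 prompts: rank 0 gets prompts 0, 8, 16, ...
+print(shard_prompts(list(range(100)), rank=0, world_size=8)[:3])  # [0, 8, 16]
+```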
+
+#### Multi-GPUs & Sequence Parallelism
+If you want to quickly infer one sample, it will utilize all GPUs simultaneously to infer that sample.
```
-sh scripts/text_condition/train_videoae_65x512x512.sh
+bash scripts/text_condition/gpu/sample_t2v_sp.sh
```
+
+
## 💡 How to Contribute to the Open-Sora Plan Community
We greatly appreciate your contributions to the Open-Sora Plan open-source community and helping us make it even better than it is now!
@@ -317,6 +480,7 @@ For more details, please refer to the [Contribution Guidelines](docs/Contributio
## 👍 Acknowledgement
* [Latte](https://github.com/Vchitect/Latte): The **main codebase** we built upon; a wonderful video generation model.
* [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha): Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis.
+* [ShareGPT4Video](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4Video): Improving Video Understanding and Generation with Better Captions.
* [VideoGPT](https://github.com/wilson1yan/VideoGPT): Video Generation using VQ-VAE and Transformers.
* [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
* [FiT](https://github.com/whlzy/FiT): Flexible Vision Transformer for Diffusion Model.
@@ -356,5 +520,4 @@ For more details, please refer to the [Contribution Guidelines](docs/Contributio
-
-
+
\ No newline at end of file
diff --git a/docs/Report-v1.1.0.md b/docs/Report-v1.1.0.md
new file mode 100644
index 000000000..ca3ec2ba5
--- /dev/null
+++ b/docs/Report-v1.1.0.md
@@ -0,0 +1,256 @@
+# Report v1.1.0
+
+In April 2024, we launched Open-Sora-Plan v1.0.0, featuring a simple and efficient design along with remarkable performance in text-to-video generation. It has already been adopted as a foundational model in numerous research projects, and both its data and its models have been widely used.
+
+**Today, we are excited to present Open-Sora-Plan v1.1.0, which significantly improves video generation quality and duration.**
+
+Compared to the previous version, Open-Sora-Plan v1.1.0 brings the following improvements:
+
+1. **Better compressed visual representations**. We optimized the CausalVideoVAE architecture, which now has stronger performance and higher inference efficiency.
+2. **Generate higher quality, longer videos**. We used higher-quality visual data and captions generated by [ShareGPT4Video](https://sharegpt4video.github.io/), enabling the model to better understand the workings of the world.
+
+Along with performance improvements, Open-Sora-Plan v1.1.0 maintains the minimalist design and data efficiency of v1.0.0. Remarkably, we found that v1.1.0 exhibits similar performance to the Sora base model, indicating that our version's evolution aligns with the scaling law demonstrated by Sora.
+
+### Open-Source Release
+We open-source the Open-Sora-Plan to facilitate future development of video generation in the community. Code, data, and models will be made publicly available.
+- Demo: Hugging Face demo [here](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0).
+- Code: All training scripts and sample scripts.
+- Model: Both Diffusion Model and CausalVideoVAE [here](https://huggingface.co/LanguageBind/Open-Sora-Plan-v1.1.0).
+- Data: Both raw videos and captions [here](https://huggingface.co/datasets/LanguageBind/Open-Sora-Plan-v1.1.0).
+
+## Gallery
+
+
+### 221×512×512 Text-to-Video Generation
+
+| 221×512×512 (9.2s) | 221×512×512 (9.2s) | 221×512×512 (9.2s) | 221×512×512 (9.2s) |
+| --- | --- | --- | --- |
+| | | | |
+| This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage ... | a cat wearing sunglasses and working as a lifeguard at pool. | Photorealistic closeup video of two pirate ships battling each other as they sail ... | A movie trailer featuring the adventures of the 30 year old spaceman wearing a red wool ... |
+| | | | |
+| A snowy forest landscape with a dirt road running through it. The road is flanked by ... | Drone shot along the Hawaii jungle coastline, sunny day. Kayaks in the water. | Alpacas wearing knit wool sweaters, graffiti background, sunglasses. | The camera rotates around a large stack of vintage televisions all showing different ... |
+| | | | |
+| A drone camera circles around a beautiful historic church built on a rocky outcropping ... | Aerial view of Santorini during the blue hour, showcasing the stunning architecture ... | A robot dog explores the surface of Mars, kicking up red dust as it investigates ... | An aerial shot of a lighthouse standing tall on a rocky cliff, its beacon cutting ... |
+| | | | |
+| 3D animation of a small, round, fluffy creature with big, expressive eyes explores ... | A corgi vlogging itself in tropical Maui. | A single drop of liquid metal falls from a floating orb, landing on a mirror-like ... | The video presents an abstract composition centered around a hexagonal shape adorned ... |
+
+### 65×512×512 Text-to-Video Generation
+
+| 65×512×512 (2.7s) | 65×512×512 (2.7s) | 65×512×512 (2.7s) | 65×512×512 (2.7s) |
+| --- | --- | --- | --- |
+| | | | |
+| Extreme close-up of chicken and green pepper kebabs grilling on a barbeque with flames. | 3D animation of a small, round, fluffy creature with big, expressive eyes explores a ... | A corgi vlogging itself in tropical Maui. | In a studio, there is a painting depicting a ship sailing through the rough sea. |
+| | | | |
+| A robot dog trots down a deserted alley at night, its metallic paws clinking softly ... | A solitary spider weaves its web in a quiet corner. The web shimmers and glows with ... | A lone surfer rides a massive wave, skillfully maneuvering through the surf. The water ... | A solitary cheetah sprints across the savannah, its powerful muscles propelling it ... |
+| | | | |
+| A solitary astronaut plants a flag on an alien planet covered in crystal formations ... | At dawn's first light, a spaceship slowly exits the edge of the galaxy against a ...| A dapper puppy in a miniature suit, basking in the afternoon sun, adjusting his tie ... | A wise old elephant painting abstract art with its trunk, each stroke a burst of color ... |
+| | | | |
+| In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two ... | A Shiba Inu dog wearing a beret and black turtleneck. | A painting of a boat on water comes to life, with waves crashing and the boat becoming ... | Many spotted jellyfish pulsating under water. Their bodies are transparent and glowing ... |
+| | | | |
+| An animated hedgehog with distinctive spiky hair and large eyes is seen exploring a ... | An animated rabbit in a playful pink snowboarding outfit is carving its way down a ... | A person clad in a space suit with a helmet and equipped with a chest light and arm ... | |
+
+### 65×512×512 Video Editing
+
+| generated 65×512×512 (2.7s) | edited 65×512×512 (2.7s) |
+| --- | --- |
+| | |
+| | |
+| | |
+
+### 512×512 Text-to-Image Generation
+
+
+
+## Detailed Technical Report
+
+### CausalVideoVAE
+
+#### Model Structure
+
+As the number of frames increases, the encoder overhead of CausalVideoVAE gradually rises. When training with 257 frames, 80GB of VRAM is insufficient for the VAE to encode the video. Therefore, we reduced the number of CausalConv3D layers, retaining only the last two stages of CausalConv3D in the encoder. This change significantly lowers the overhead while maintaining nearly the same performance. Note that we only modified the encoder; the decoder still retains all CausalConv3D layers, as training the Diffusion Model does not require the decoder.
+
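+To make the building block concrete, here is a minimal, self-contained sketch of a causal 3D convolution of the kind used in the retained stages (an illustration under our own naming, not the repository's exact implementation): all temporal padding goes to the past side, so no frame ever attends to future frames.
+
+```python
+import torch
+import torch.nn as nn
+
+class CausalConv3d(nn.Module):
+    """3D convolution that is causal in time: pad only toward the past."""
+    def __init__(self, in_ch, out_ch, kernel=(3, 3, 3)):
+        super().__init__()
+        kt, kh, kw = kernel
+        self.time_pad = kt - 1  # all temporal padding on the left (past)
+        self.conv = nn.Conv3d(in_ch, out_ch, kernel,
+                              padding=(0, kh // 2, kw // 2))
+
+    def forward(self, x):  # x: (B, C, T, H, W)
+        # Replicate the first frame into the past, so frame 0 behaves
+        # like an image and no frame sees the future.
+        past = x[:, :, :1].repeat(1, 1, self.time_pad, 1, 1)
+        return self.conv(torch.cat([past, x], dim=2))
+
+y = CausalConv3d(3, 8)(torch.randn(1, 3, 9, 64, 64))
+print(y.shape)  # torch.Size([1, 8, 9, 64, 64])
+```
+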
+
+
+We compare the computational overhead of the two versions by testing the forward inference of the encoder on the H100.
+
+| Version | 129×256×256 Peak Mem. | 129×256×256 Speed | 257×256×256 Peak Mem. | 257×256×256 Speed | 513×256×256 Peak Mem. | 513×256×256 Speed |
+|---|---|---|---|---|---|---|
+| v1.0.0 | 22G | 2.9 it/s | OOM | - | OOM | - |
+| v1.1.0 | 18G | 4.9 it/s | 34G | 2.5 it/s | 61G | 1.2 it/s |
+
+
+#### Temporal Module
+
+
+
+In v1.0.0, our temporal module had only TemporalAvgPool, which loses high-frequency information in the video, such as details and edges. To address this issue, we improved this module in v1.1.0: we introduced convolution and added learnable weights, allowing the different branches to decouple different features. When we omit CausalConv3D, the reconstructed video is very blurry; conversely, when we omit TemporalAvgPool, the video becomes overly sharp.
+
+| | SSIM↑ | LPIPS↓ | PSNR↑ |
+|---|---|---|---|
+| Base | 0.850 | 0.091 | 28.047 |
+| + Frames | 0.868 | 0.070 | 28.829 |
+| + Reset mixed factor | 0.873 | 0.070 | 29.140 |
+
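+A minimal, hypothetical sketch of such a two-branch module (our own naming; the repository's `TimeDownsampleRes2x` differs in detail): the learnable `mixed_factor` is the weight whose sigmoid, the α reported below in the training details, balances the low-frequency pooling branch against the detail-preserving conv branch.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MixedTemporalDownsample(nn.Module):
+    """alpha * TemporalAvgPool(x) + (1 - alpha) * causal conv(x)."""
+    def __init__(self, channels, init=0.5):
+        super().__init__()
+        self.mixed_factor = nn.Parameter(torch.tensor(init))
+        self.conv = nn.Conv3d(channels, channels, (3, 3, 3),
+                              stride=(2, 1, 1), padding=(0, 1, 1))
+
+    def forward(self, x):  # x: (B, C, T, H, W)
+        alpha = torch.sigmoid(self.mixed_factor)  # init 0.5 -> ~0.62
+        pooled = F.avg_pool3d(x, (2, 1, 1))       # low-frequency branch
+        conv = self.conv(F.pad(x, (0, 0, 0, 0, 1, 0)))  # causal pad in time
+        return alpha * pooled + (1 - alpha) * conv
+```
+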
+#### Training Details
+
+Similar to v1.0.0, we initialized from the Latent Diffusion VAE and used tail initialization. For CausalVideoVAE, we trained for 100k steps in the first stage with a video shape of 9×256×256. Subsequently, we increased the frame count from 9 to 25 and found that this significantly improved the model's performance. It is important to clarify that we enabled the mixed factor during both the first and second stages, with α = sigmoid(mixed_factor) reaching 0.88 by the end of training, indicating the model's tendency to retain low-frequency information. In the third stage, we reinitialized the mixed factor to 0.5 (sigmoid(0.5) ≈ 0.622), which further enhanced the model's capabilities.
+
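+For reference, here is a sketch of what we understand tail initialization to mean (an assumption about the scheme, not the repository's exact code): the causal 3D kernel is zeroed and the pretrained 2D kernel is copied into its last temporal slice, so at initialization the causal conv reproduces the 2D VAE frame by frame.
+
+```python
+import torch
+import torch.nn as nn
+
+def tail_init(conv3d: nn.Conv3d, conv2d: nn.Conv2d):
+    """Copy a pretrained 2D kernel into the tail (last temporal slice)
+    of a zeroed 3D kernel."""
+    with torch.no_grad():
+        conv3d.weight.zero_()
+        conv3d.weight[:, :, -1] = conv2d.weight  # (O, I, kH, kW) -> last t
+        if conv2d.bias is not None:
+            conv3d.bias.copy_(conv2d.bias)
+
+tail_init(nn.Conv3d(3, 8, (3, 3, 3)), nn.Conv2d(3, 8, 3))
+```
+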
+#### Loss Function
+
+We found that using GAN loss helps retain high-frequency information and alleviates grid artifacts. Additionally, we observed that switching from 2D GAN to 3D GAN provides further improvements.
+
+| GAN Loss/Step | SSIM↑ | LPIPS↓ | PSNR↑ |
+|---|---|---|---|
+| 2D/80k | 0.879 | 0.068 | 29.480 |
+| 3D/80k | 0.882 | 0.067 | 29.890 |
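+
+The 2D→3D switch itself is mechanical; a minimal sketch of a PatchGAN-style discriminator parameterized over the two cases (illustrative only; the repository's `LPIPSWithDiscriminator3D` has its own architecture):
+
+```python
+import torch.nn as nn
+
+def patch_discriminator(dims=3, in_ch=3, width=64, n_layers=3):
+    """PatchGAN layout with Conv2d swapped for Conv3d, so real/fake is
+    judged per spatio-temporal patch instead of per spatial patch."""
+    Conv = nn.Conv3d if dims == 3 else nn.Conv2d
+    layers = [Conv(in_ch, width, 4, stride=2, padding=1), nn.LeakyReLU(0.2)]
+    ch = width
+    for _ in range(n_layers - 1):
+        layers += [Conv(ch, ch * 2, 4, stride=2, padding=1), nn.LeakyReLU(0.2)]
+        ch *= 2
+    layers += [Conv(ch, 1, 4, padding=1)]  # patch logits
+    return nn.Sequential(*layers)
+```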
+
+#### Inference Tricks
+Encoding a long video in a single pass exceeds available memory, so inference must be tiled. However, CausalVideoVAE treats the first frame of a clip as an image, so naive temporal tiling would treat the first frame of every window as an image. Therefore, we introduced **temporal rollback tiled convolution**, a tiling approach designed specifically for CausalVideoVAE: all windows except the first discard their first frame, because the first frame in a window is treated as an image, while the remaining frames should be treated as video frames.
+
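+A minimal sketch of the rollback rule at the frame level (window size chosen for illustration; the actual implementation also tiles spatially and operates on latents):
+
+```python
+import torch
+
+def rollback_windows(x, window=65):  # x: (B, C, T, H, W)
+    """Windows overlap by one frame: each new window starts on the last
+    frame of the previous one, whose output is later discarded."""
+    tiles, start, T = [], 0, x.shape[2]
+    while True:
+        end = min(start + window, T)
+        tiles.append(x[:, :, start:end])
+        if end == T:
+            break
+        start = end - 1  # roll back one frame for temporal context
+    return tiles
+
+def merge(outputs):
+    # Every window except the first drops its first frame, which the
+    # VAE treated as an image rather than a video frame.
+    return torch.cat([outputs[0]] + [o[:, :, 1:] for o in outputs[1:]], dim=2)
+
+x = torch.randn(1, 3, 129, 32, 32)
+print(merge(rollback_windows(x)).shape)  # torch.Size([1, 3, 129, 32, 32])
+```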
+
+
+We tested the speed on the H100 with a window size of 65×256×256.
+
+| Version | 129×256×256 Peak Mem. | 129×256×256 Speed | 257×256×256 Peak Mem. | 257×256×256 Speed | 513×256×256 Peak Mem. | 513×256×256 Speed |
+|---|---|---|---|---|---|---|
+| 4×8×8 | 10G | 1.3 s/it | 10G | 2.6 s/it | 10G | 5.3 s/it |
+
+### Data Construction
+Since Open-Sora-Plan supports joint training of images and videos, our data collection is divided into two parts: images and videos. Images need not originate from videos; they are independent datasets. We spent approximately 32×240 (≈7,700) H100 hours generating image and video captions, and all of it is **open source**!
+
+#### Image-Text Collection Pipeline
+We obtained 11 million image-text pairs from [Pixart-Alpha](https://huggingface.co/datasets/PixArt-alpha/SAM-LLaVA-Captions10M), with captions generated by [LLaVA](https://github.com/haotian-liu/LLaVA). Additionally, we utilized the high-quality OCR dataset [Anytext-3M](https://github.com/tyxsspa/AnyText), which pairs each image with corresponding OCR characters. However, these captions were insufficient to describe the entire image, so we used [InternVL-1.5](https://github.com/OpenGVLab/InternVL) for supplementary descriptions. Since T5 only supports English, we filtered for English data, which constitutes about half of the complete dataset. Furthermore, we selected high-quality images from [Laion-5B](https://laion.ai/blog/laion-5b/) to improve the quality of generated humans; the selection criteria were high resolution, high aesthetic scores, and watermark-free images containing people.
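+
+A toy sketch of that selection step (field names and thresholds are hypothetical; they stand in for whatever metadata the real pipeline uses):
+
+```python
+metadata = [
+    {"width": 1024, "height": 768, "aesthetic": 6.5, "watermark": 0.1, "person": True},
+    {"width": 256, "height": 256, "aesthetic": 7.0, "watermark": 0.0, "person": True},
+]
+
+def keep(s, min_side=512, min_aesthetic=6.0, max_watermark=0.5):
+    """High resolution, high aesthetic score, watermark-free, contains people."""
+    return (min(s["width"], s["height"]) >= min_side
+            and s["aesthetic"] >= min_aesthetic
+            and s["watermark"] <= max_watermark
+            and s["person"])
+
+print([keep(s) for s in metadata])  # [True, False] -- second image is too small
+```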
+
+Here, we are open-sourcing the prompt used for InternVL-1.5:
+```
+# for anytext-3m
+Combine this rough caption: "{}", analyze the image in a comprehensive and detailed manner. "{}" can be recognized in the image.
+# for human-160k
+Analyze the image in a comprehensive and detailed manner.
+```
+
+| Name | Image Source | Text Captioner | Num pairs |
+|---|---|---|---|
+| SAM-11M | [SAM](https://ai.meta.com/datasets/segment-anything/) | [LLaVA](https://github.com/haotian-liu/LLaVA) | 11,185,255 |
+| Anytext-3M-en | [Anytext](https://github.com/tyxsspa/AnyText) | [InternVL-1.5](https://github.com/OpenGVLab/InternVL) | 1,886,137 |
+| Human-160k | [Laion](https://laion.ai/blog/laion-5b/) | [InternVL-1.5](https://github.com/OpenGVLab/InternVL) | 162,094 |
+
+
+#### Video-Text Collection Pipeline
+In v1.0.0, we sampled one frame from each video to generate captions. However, as video length increased, a single frame could not adequately describe the entire video's content or temporal movements. Therefore, we used a video captioner to generate captions for the entire video clip. Specifically, we used [ShareGPT4Video](https://sharegpt4video.github.io/), which effectively covers temporal information and describes the entire video content. The v1.1.0 video dataset comprises approximately 3k hours, compared to only 300 hours in v1.0.0. As before, we have open-sourced all text annotations and videos (both under the CC0 license), which can be found [here](https://huggingface.co/datasets/LanguageBind/Open-Sora-Plan-v1.1.0/tree/main).
+
+
+
+| Name | Hours | Num frames | Num pairs |
+|---|---|---|---|
+| [Mixkit](https://mixkit.co/) | 42.0h | 65 | 54,735 |
+| | | 513 | 1,997 |
+| [Pixabay](https://pixabay.com/) | 353.3h | 65 | 601,513 |
+| | | 513 | 51,483 |
+| [Pexel](https://www.pexels.com/) | 2561.9h | 65 | 3,832,666 |
+| | | 513 | 271,782 |
+
+### Training Diffusion Model
+Similar to our previous work, we employed a multi-stage cascaded training method. Below is our training card:
+
+#### Stage 1
+
+We initially believed that the performance of the diffusion model would improve with longer training. Surprisingly, however, the [logs](https://api.wandb.ai/links/linbin/o76j03j4) showed that videos generated at 50k steps were of higher quality than those at 70-100k steps. In fact, extensive sampling revealed that checkpoints at 40-60k steps outperformed those at 80-100k steps. Quantitatively, 50k steps correspond to approximately 2 epochs of training. It is currently unclear whether this is due to overfitting on a small dataset or the limited capacity of the 2+1D model.
+
+#### Stage 2
+
+In the second stage, we used Huawei Ascend computing power for training. This stage's training and inference were fully supported by Huawei. We conducted sequence parallel training and inference on a large-scale cluster, distributing one sample across eight ranks. Models trained on Huawei Ascend can also be loaded into GPUs and generate videos of the same quality.
+
+
+#### Stage 3
+
+In the third stage, we further increased the frame count to 513 frames, approximately 21 seconds at 24 FPS. However, this stage presents several challenges, such as ensuring temporal consistency in the 2+1D model over long durations and whether the current amount of data is sufficient. We are still training the model for this stage and continuously monitoring its progress.
+
+| Name | Stage 1 | Stage 2 | Stage 3 |
+|---|---|---|---|
+| Training Video Size | 65×512×512 | 221×512×512 | 513×512×512 |
+| Compute (#Num x #Hours) | 80 H100 × 72 | 512 Ascend × 72 | Under Training |
+| Checkpoint | [HF](https://huggingface.co/LanguageBind/Open-Sora-Plan-v1.1.0/tree/main/65x512x512) | [HF](https://huggingface.co/LanguageBind/Open-Sora-Plan-v1.1.0/tree/main/221x512x512) | Under Training |
+| Log | [wandb](https://api.wandb.ai/links/linbin/o76j03j4) | - | - |
+| Training Data | ~3k hours videos + 13M images | | |
+
+### Video Editing
+
+The recently proposed [ReVideo](https://mc-e.github.io/project/ReVideo/) achieves accurate video editing by modifying the first frame and applying motion control within the edited area. Although it achieves excellent video editing performance, the editing length is limited by its base model, [SVD](https://github.com/Stability-AI/generative-models). Open-Sora-Plan, as a foundational model for long-video generation, can compensate for this limitation. Currently, we are collaborating with the ReVideo team to use Open-Sora-Plan as the base model for long video editing. Some preliminary results are shown [here]().
+
+The initial version still needs improvement in several aspects. In the future, we will continue to explore integration with ReVideo to develop improved long-video editing models.
+
+## Failed Case and Discussion
+
+Despite the promising results of v1.1.0, there remains a gap between our model and Sora. Here, we present some failure cases and discuss them.
+
+### CausalVideoVAE
+
+Despite the significant performance improvement of the VAE in v1.1.0 over v1.0.0, we still encounter failures in challenging cases, such as sand dunes and leaves. The video on the left shows the reconstruction with 4× temporal downsampling, while the video on the right uses 2× temporal downsampling. Both exhibit jitter when reconstructing fine-grained features. This indicates that reducing temporal downsampling alone cannot fully resolve the jitter issue.
+
+https://github.com/PKU-YuanGroup/Open-Sora-Plan/assets/62638829/1a87d6d8-4bf1-4b4e-83bb-84870c5c3a11
+
+### Diffusion Model
+
+#### Semantic distortion
+
+On the left is a video generated by v1.1.0 showing a puppy in the snow. In this video, the puppy's head exhibits semantic distortion, indicating that the model struggles to correctly identify which head belongs to which dog. On the right is a video generated by Sora's [base model](https://openai.com/index/video-generation-models-as-world-simulators/). We observe that Sora's early base model also experienced semantic distortion issues. This suggests that we may achieve better results by scaling up the model and increasing the amount of training data.
+
+Prompt: A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
+
+| Ours | Sora Base×1 | Sora Base×4 | Sora Base×32 |
+|---|---|---|---|
+| | | | |
+
+#### Limited dynamics
+
+The primary difference between videos and images lies in their dynamic nature, where objects undergo a series of changes across consecutive frames. However, the videos generated by v1.1.0 still contain many instances of limited dynamics. Upon reviewing a large number of training videos, we found that while web-crawled videos have high visual quality, they are often filled with meaningless close-up shots. These close-ups typically show minimal movement or are even static. On the left, we present a generated video of a bird, while on the right is a training video we found, which is almost static. There are many similar videos in the dataset from stock footage sites.
+
+Prompt: This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird's head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird's striking appearance.
+
+
+| Ours | Raw video |
+|---|---|
+| | |
+
+#### Negative prompt
+
+We found that using negative prompts can significantly improve video quality, even though we did not explicitly tag the training data with different labels. On the left is a video sampled using a negative prompt, while on the right is a video generated without a negative prompt. This suggests that we may need to incorporate more prior knowledge into the training data. For example, when a video has a watermark, we should note "watermark" in the corresponding caption. When a video's bitrate is too low, we should add more tags to distinguish it from high-quality videos, such as "low quality" or "blurry." We believe that explicitly injecting these priors can help the model differentiate between the vast amounts of pretraining data (low quality) and the smaller amounts of fine-tuning data (high quality), thereby generating higher quality videos.
+
+Prompt: A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
+Negative Prompt: distorted, discontinuous, ugly, blurry, low resolution, motionless, static, low quality
+
+
+| With Negative Prompt | Without Negative Prompt |
+|---|---|
+| | |
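+
+Mechanically, the negative prompt simply takes the place of the empty-string unconditional embedding in classifier-free guidance; a minimal sketch (standard diffusion practice, with a toy denoiser standing in for the real model):
+
+```python
+import torch
+
+def guided_eps(eps, x_t, t, prompt_emb, negative_emb, scale=7.5):
+    """CFG: extrapolate from the negative-prompt branch toward the
+    positive-prompt branch."""
+    cond = eps(x_t, t, prompt_emb)
+    uncond = eps(x_t, t, negative_emb)  # negative prompt replaces ""
+    return uncond + scale * (cond - uncond)
+
+eps = lambda x, t, emb: 0.1 * x + emb.mean()  # toy stand-in denoiser
+out = guided_eps(eps, torch.randn(1, 4, 17, 64, 64), 10,
+                 torch.randn(77, 4096), torch.randn(77, 4096))
+print(out.shape)  # torch.Size([1, 4, 17, 64, 64])
+```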
+
+## Future Work
+
+In our future work, we will focus on two main areas: (1) data scaling and (2) model design. Once we have a robust baseline model, we will extend it to handle variable durations and conditional control models.
+
+### Data Scaling
+
+#### Data source
+
+As mentioned earlier, our dataset is entirely sourced from stock footage websites. Although these videos are of high quality, many consist of close-up shots of specific areas, resulting in slow motion in the videos. We believe this is one of the main reasons for the limited dynamics observed. Therefore, we will continue to collect datasets from diverse sources to address this issue.
+
+#### Data volume
+
+In v1.1.0, our dataset comprises only ~3k hours of video. We are actively collecting more data and anticipate that the video dataset for the next version will reach ~100k hours. We welcome recommendations from the open-source community for additional datasets.
+
+### Model Design
+
+#### CausalVideoVAE
+In our internal testing, even without temporal downsampling, we found it is not possible to completely resolve the jitter in reconstructing fine-grained features. Therefore, we need to reconsider how to mitigate video jitter to the greatest extent possible while still supporting both images and videos. We will introduce a more powerful CausalVideoVAE in the next version.
+
+#### Diffusion Model
+In v1.1.0, we found that 2+1D models can generate higher-quality videos in short durations. However, for long videos, they tend to exhibit discontinuities and inconsistencies. Therefore, we will explore more possibilities in model architecture to address this issue.
diff --git a/docs/Train_And_Eval_CausalVideoVAE.md b/docs/Train_And_Eval_CausalVideoVAE.md
index 691f159f8..772cff903 100644
--- a/docs/Train_And_Eval_CausalVideoVAE.md
+++ b/docs/Train_And_Eval_CausalVideoVAE.md
@@ -32,14 +32,15 @@ Model training requires two key files: one is the `config.json` file, which conf
### Model Configuration File
-Taking the release version model configuration file `release.json` as an example:
+Taking the v1.1.0 version model configuration file as an example:
```json
{
"_class_name": "CausalVAEModel",
"_diffusers_version": "0.27.2",
+ "_name_or_path": "../results/pretrained_488_tail",
"attn_resolutions": [],
- "decoder_attention": "AttnBlock3D",
+ "decoder_attention": "AttnBlock3DFix",
"decoder_conv_in": "CausalConv3d",
"decoder_conv_out": "CausalConv3d",
"decoder_mid_resnet": "ResnetBlock3D",
@@ -58,32 +59,32 @@ Taking the release version model configuration file `release.json` as an example
"decoder_temporal_upsample": [
"",
"",
- "TimeUpsample2x",
- "TimeUpsample2x"
+ "TimeUpsampleRes2x",
+ "TimeUpsampleRes2x"
],
"double_z": true,
"dropout": 0.0,
"embed_dim": 4,
- "encoder_attention": "AttnBlock3D",
- "encoder_conv_in": "CausalConv3d",
+ "encoder_attention": "AttnBlock3DFix",
+ "encoder_conv_in": "Conv2d",
"encoder_conv_out": "CausalConv3d",
"encoder_mid_resnet": "ResnetBlock3D",
"encoder_resnet_blocks": [
- "ResnetBlock3D",
- "ResnetBlock3D",
+ "ResnetBlock2D",
+ "ResnetBlock2D",
"ResnetBlock3D",
"ResnetBlock3D"
],
"encoder_spatial_downsample": [
- "SpatialDownsample2x",
- "SpatialDownsample2x",
- "SpatialDownsample2x",
+ "Downsample",
+ "Downsample",
+ "Downsample",
""
],
"encoder_temporal_downsample": [
- "TimeDownsample2x",
- "TimeDownsample2x",
"",
+ "TimeDownsampleRes2x",
+ "TimeDownsampleRes2x",
""
],
"hidden_size": 128,
@@ -93,15 +94,17 @@ Taking the release version model configuration file `release.json` as an example
4,
4
],
+ "in_channels": 3,
"loss_params": {
"disc_start": 2001,
"disc_weight": 0.5,
"kl_weight": 1e-06,
"logvar_init": 0.0
},
- "loss_type": "opensora.models.ae.videobase.losses.LPIPSWithDiscriminator",
+ "loss_type": "opensora.models.ae.videobase.losses.LPIPSWithDiscriminator3D",
"lr": 1e-05,
"num_res_blocks": 2,
+ "out_channels": 3,
"q_conv": "CausalConv3d",
"resolution": 256,
"z_channels": 4
diff --git a/examples/prompt.txt b/examples/prompt.txt
new file mode 100644
index 000000000..4c23e1e91
--- /dev/null
+++ b/examples/prompt.txt
@@ -0,0 +1,8 @@
+yoji shinkawa painting of a stylish sniper demon
+a beautiful paint of cultists dancing surrounds a huge alpaca in desert at night, by zdzislaw beksinski, trending on artstation.
+male king arthur and his squirrel wife
+frontal portrait of ragged, worried twin women, by john singer sargent and j. c. leyendecker.
+a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation
+only memories remain, trending on artstation
+dream swimming pool with nobody
+a dog doing weights. epic oil painting.
\ No newline at end of file
diff --git a/examples/prompt_list_0.txt b/examples/prompt_list_0.txt
index 6c91d3c3e..92fb551c6 100644
--- a/examples/prompt_list_0.txt
+++ b/examples/prompt_list_0.txt
@@ -1,16 +1,20 @@
-A quiet beach at dawn, the waves gently lapping at the shore and the sky painted in pastel hues.
-A quiet beach at dawn, the waves softly lapping at the shore, pink and orange hues painting the sky, offering a moment of solitude and reflection.
-The majestic beauty of a waterfall cascading down a cliff into a serene lake.
-Sunset over the sea.
-a cat wearing sunglasses and working as a lifeguard at pool.
-Slow pan upward of blazing oak fire in an indoor fireplace.
-Yellow and black tropical fish dart through the sea.
-a serene winter scene in a forest. The forest is blanketed in a thick layer of snow, which has settled on the branches of the trees, creating a canopy of white. The trees, a mix of evergreens and deciduous, stand tall and silent, their forms partially obscured by the snow. The ground is a uniform white, with no visible tracks or signs of human activity. The sun is low in the sky, casting a warm glow that contrasts with the cool tones of the snow. The light filters through the trees, creating a soft, diffused illumination that highlights the texture of the snow and the contours of the trees. The overall style of the scene is naturalistic, with a focus on the tranquility and beauty of the winter landscape.
-a dynamic interaction between the ocean and a large rock. The rock, with its rough texture and jagged edges, is partially submerged in the water, suggesting it is a natural feature of the coastline. The water around the rock is in motion, with white foam and waves crashing against the rock, indicating the force of the ocean's movement. The background is a vast expanse of the ocean, with small ripples and waves, suggesting a moderate sea state. The overall style of the scene is a realistic depiction of a natural landscape, with a focus on the interplay between the rock and the water.
-A serene waterfall cascading down moss-covered rocks, its soothing sound creating a harmonious symphony with nature.
+A young man at his 20s is sitting on a piece of cloud in the sky, reading a book.
+An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt, he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
+A close-up of a woman’s face, illuminated by the soft light of dawn, her expression serene and content as she wakes up in a cozy bedroom.
+An intense close-up of a detective’s face, lit by a single desk lamp, his eyes scanning a wall covered in photos and notes, deep in thought.
+Audience members in a theater are captured in a series of medium shots, with a young man and woman in formal attire centrally positioned and illuminated by a spotlight effect.
A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures.
-The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.
-A vibrant scene of a snowy mountain landscape. The sky is filled with a multitude of colorful hot air balloons, each floating at different heights, creating a dynamic and lively atmosphere. The balloons are scattered across the sky, some closer to the viewer, others further away, adding depth to the scene. Below, the mountainous terrain is blanketed in a thick layer of snow, with a few patches of bare earth visible here and there. The snow-covered mountains provide a stark contrast to the colorful balloons, enhancing the visual appeal of the scene.
-A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene.
-A snowy forest landscape with a dirt road running through it. The road is flanked by trees covered in snow, and the ground is also covered in snow. The sun is shining, creating a bright and serene atmosphere. The road appears to be empty, and there are no people or animals visible in the video. The style of the video is a natural landscape shot, with a focus on the beauty of the snowy forest and the peacefulness of the road.
-The dynamic movement of tall, wispy grasses swaying in the wind. The sky above is filled with clouds, creating a dramatic backdrop. The sunlight pierces through the clouds, casting a warm glow on the scene. The grasses are a mix of green and brown, indicating a change in seasons. The overall style of the video is naturalistic, capturing the beauty of the landscape in a realistic manner. The focus is on the grasses and their movement, with the sky serving as a secondary element. The video does not contain any human or animal elements.
\ No newline at end of file
+a realistic 3d rendering of a female character with curly blonde hair and blue eyes. she is wearing a black tank top and has a neutral expression while facing the camera directly. the background is a plain blue sky, and the scene is devoid of any other objects or text. the character is detailed, with realistic textures and lighting, suitable for a video game or high-quality animation. there is no movement or additional action in the video. the focus is entirely on the character's appearance and realistic rendering.
+A panda strumming a guitar under a bamboo grove, its paws gently plucking the strings as a group of mesmerized rabbits watch, the music blending with the rustle of bamboo leaves. HD.
+A close-up of a woman with a vintage hairstyle and bright red lipstick, gazing seductively into the camera, the background blurred to keep the focus solely on her.
+In the jungle, a hidden temple stands guarded by statues of lions, their eyes glowing with emerald light, protecting secrets untold for millennia. 8K.
+A close-up of an old man’s weathered face, with deep wrinkles and a thick white mustache, looking out to sea, the wind gently blowing through his hair.
+An intense close-up of a soldier’s face, covered in dirt and sweat, his eyes filled with determination as he surveys the battlefield.
+A river that flows uphill, defying gravity as it returns lost treasures from the sea to the mountain top, each item telling a story of a voyage gone by. HD.
+A close-up of a man’s face, lit only by the glow of his computer screen, his eyes wide and unblinking as he discovers something shocking online.
+On a deserted island, palm trees sway to summon a rainstorm, their leaves conducting the wind like maestros, orchestrating a symphony of thunder and lightning. High Resolution.
+An extreme close-up of a middle-aged man’s face, with a five o’clock shadow, staring pensively into the distance as rain softly taps against the window beside him, his thoughts deep and contemplative.
+A close-up of a man’s face, his expression one of deep concentration as he works on a complex task.
+Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
+a close-up shot of a woman standing in a dimly lit room. she is wearing a traditional chinese outfit, which includes a red and gold dress with intricate designs and a matching headpiece. the woman has her hair styled in an updo, adorned with a gold accessory. her makeup is done in a way that accentuates her features, with red lipstick and dark eyeshadow. she is looking directly at the camera with a neutral expression. the room has a rustic feel, with wooden beams and a stone wall visible in the background. the lighting in the room is soft and warm, creating a contrast with the woman's vibrant attire. there are no texts or other objects in the video. the style of the video is a portrait, focusing on the woman and her attire.
\ No newline at end of file
diff --git a/examples/prompt_list_1.txt b/examples/prompt_list_1.txt
new file mode 100644
index 000000000..ab0fb276d
--- /dev/null
+++ b/examples/prompt_list_1.txt
@@ -0,0 +1,32 @@
+Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors.
+Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image.
+A time-lapse of a storm forming over the ocean, dark clouds gathering and lightning flashing. The storm's energy creates spirals of light that dance across the sky.
+A majestic eagle perches on a high cliff, its keen eyes scanning the valley below. With a powerful flap, it takes off, leaving a trail of sparkling feathers.
+A single butterfly with wings that resemble stained glass flutters through a field of flowers. The shot captures the light as it passes through the delicate wings, creating a vibrant, colorful display. HD.
+A solitary mermaid swims through an underwater cave filled with glowing crystals. The shot follows her graceful movements, capturing the play of light on her scales and the ethereal beauty of the cave.
+Close-up of a dragon's eye as it slowly opens, revealing a fiery iris that reflects the burning landscape around it, while smoke wisps off its scaly eyelid.
+A cat with the enigmatic smile of the Mona Lisa, lounging regally on a velvet cushion, her eyes following a fluttering butterfly that mirrors the mysterious allure of her expression. 4K.
+A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
+This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird’s head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird’s striking appearance.
+Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
+The majestic beauty of a waterfall cascading down a cliff into a serene lake.
+Sunset over the sea.
+a cat wearing sunglasses and working as a lifeguard at a pool.
+An extreme close-up of a gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt, he wears a brown beret and glasses and has a very professorial appearance, and at the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
+A lone figure stands on the deck of a spaceship, looking out at a nebula filled with vibrant colors. The shot tracks their gaze, capturing the breathtaking beauty of the cosmic landscape and the sense of infinite possibility.
+A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect.
+a dynamic interaction between the ocean and a large rock. The rock, with its rough texture and jagged edges, is partially submerged in the water, suggesting it is a natural feature of the coastline. The water around the rock is in motion, with white foam and waves crashing against the rock, indicating the force of the ocean's movement. The background is a vast expanse of the ocean, with small ripples and waves, suggesting a moderate sea state. The overall style of the scene is a realistic depiction of a natural landscape, with a focus on the interplay between the rock and the water.
+A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures.
+A vibrant scene of a snowy mountain landscape. The sky is filled with a multitude of colorful hot air balloons, each floating at different heights, creating a dynamic and lively atmosphere. The balloons are scattered across the sky, some closer to the viewer, others further away, adding depth to the scene. Below, the mountainous terrain is blanketed in a thick layer of snow, with a few patches of bare earth visible here and there. The snow-covered mountains provide a stark contrast to the colorful balloons, enhancing the visual appeal of the scene.
+A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene.
+A snowy forest landscape with a dirt road running through it. The road is flanked by trees covered in snow, and the ground is also covered in snow. The sun is shining, creating a bright and serene atmosphere. The road appears to be empty, and there are no people or animals visible in the video. The style of the video is a natural landscape shot, with a focus on the beauty of the snowy forest and the peacefulness of the road.
+The dynamic movement of tall, wispy grasses swaying in the wind. The sky above is filled with clouds, creating a dramatic backdrop. The sunlight pierces through the clouds, casting a warm glow on the scene. The grasses are a mix of green and brown, indicating a change in seasons. The overall style of the video is naturalistic, capturing the beauty of the landscape in a realistic manner. The focus is on the grasses and their movement, with the sky serving as a secondary element. The video does not contain any human or animal elements.
+A close-up of a magician’s crystal ball that reveals a futuristic cityscape within. Skyscrapers of light stretch towards the heavens, and flying cars zip through the air, casting neon reflections across the ball’s surface. 8K.
+A majestic horse gallops across a bridge made of rainbows, each hoof striking sparks of color that cascade into the sky, the clouds parting to reveal a sunlit path to a distant, magical realm.
+A close-up of a robot dog as it interacts with a group of real puppies in a park, its mechanical eyes blinking with curiosity and tail wagging energetically. High Resolution.
+An elderly woman with white hair and a lined face is seated inside an older model car, looking out through the side window with a contemplative or mildly sad expression.
+A quiet beach at dawn, the waves gently lapping at the shore and the sky painted in pastel hues.
+A quiet beach at dawn, the waves softly lapping at the shore, pink and orange hues painting the sky, offering a moment of solitude and reflection.
+A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
\ No newline at end of file
diff --git a/examples/prompt_list_2.txt b/examples/prompt_list_2.txt
new file mode 100644
index 000000000..4f50aa4ff
--- /dev/null
+++ b/examples/prompt_list_2.txt
@@ -0,0 +1,7 @@
+A person lying upside down on the grass.
+A transparent crystal Pikachu in Times Square.
+a Chinese girl, half body portrait photography, her cheek resting on her hand, light from the window, dramatic light, long shadow cast, standing beside the window, indoor, Fuji film, medium shot, super detailed, highly realistic, award-winning photography
+photo of an old man and his granddaughter sitting on the sofa side by side, eating popcorn
+photo of an old man and his granddaughter sitting on the sofa side by side, eating popcorn. professional photography, cinematic still
+photo of an old man and his granddaughter sitting on the sofa side by side, eating popcorn. There is a sloth on the girl's head, professional photography, cinematic still
+photo of an old man and his granddaughter sitting on the sofa side by side, eating popcorn. There is a sloth on the girl's head, and a capybara sitting in front of the man. professional photography, cinematic still
\ No newline at end of file
diff --git a/examples/rec_image.py b/examples/rec_image.py
index a8e95cb30..c8c820d6f 100644
--- a/examples/rec_image.py
+++ b/examples/rec_image.py
@@ -2,17 +2,17 @@
sys.path.append(".")
from PIL import Image
import torch
-from torchvision.transforms import ToTensor, Compose, Resize, Normalize
+from torchvision.transforms import ToTensor, Compose, Resize, Normalize, Lambda
from torch.nn import functional as F
-from opensora.models.ae.videobase import CausalVAEModel
import argparse
import numpy as np
+from opensora.models.causalvideovae import ae_wrapper
def preprocess(video_data: torch.Tensor, short_size: int = 128) -> torch.Tensor:
transform = Compose(
[
ToTensor(),
- Normalize((0.5), (0.5)),
+ Lambda(lambda x: 2. * x - 1.),
Resize(size=short_size),
]
)
@@ -22,19 +22,26 @@ def preprocess(video_data: torch.Tensor, short_size: int = 128) -> torch.Tensor:
def main(args: argparse.Namespace):
image_path = args.image_path
- resolution = args.resolution
+ short_size = args.short_size
device = args.device
+ kwarg = {}
- vqvae = CausalVAEModel.load_from_checkpoint(args.ckpt)
- vqvae.eval()
- vqvae = vqvae.to(device)
+    vae = ae_wrapper[args.ae](args.ae_path, **kwarg).eval().to(device)
+    if args.enable_tiling:
+        vae.vae.enable_tiling()
+        vae.vae.tile_overlap_factor = args.tile_overlap_factor
+    vae = vae.half()  # run the VAE in fp16, matching the fp16 inputs below
with torch.no_grad():
- x_vae = preprocess(Image.open(image_path), resolution)
- x_vae = x_vae.to(device)
- latents = vqvae.encode(x_vae)
- recon = vqvae.decode(latents.sample())
- x = recon[0, :, 0, :, :]
+ x_vae = preprocess(Image.open(image_path), short_size)
+ x_vae = x_vae.to(device, dtype=torch.float16) # b c t h w
+ latents = vae.encode(x_vae)
+ latents = latents.to(torch.float16)
+ image_recon = vae.decode(latents) # b t c h w
+ x = image_recon[0, 0, :, :, :]
x = x.squeeze()
x = x.detach().cpu().numpy()
x = np.clip(x, -1, 1)
@@ -47,11 +54,15 @@ def main(args: argparse.Namespace):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
- parser.add_argument('--image-path', type=str, default='')
- parser.add_argument('--rec-path', type=str, default='')
- parser.add_argument('--ckpt', type=str, default='')
- parser.add_argument('--resolution', type=int, default=336)
+ parser.add_argument('--image_path', type=str, default='')
+ parser.add_argument('--rec_path', type=str, default='')
+ parser.add_argument('--ae', type=str, default='')
+ parser.add_argument('--ae_path', type=str, default='')
+ parser.add_argument('--model_path', type=str, default='results/pretrained')
+ parser.add_argument('--short_size', type=int, default=336)
parser.add_argument('--device', type=str, default='cuda')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument('--enable_tiling', action='store_true')
args = parser.parse_args()
main(args)
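For call sites migrating off the old `CausalVAEModel.load_from_checkpoint` path, here is a minimal sketch of the wrapper-based image round trip the hunk above switches to. It assumes the repo root is on `sys.path` and that `--ae` names a key in the `ae_wrapper` registry; the key `"CausalVAEModel_4x8x8"` and the file paths are illustrative, not taken from this diff.

```python
import sys
sys.path.append(".")  # run from the repo root so `opensora` is importable

import torch
from PIL import Image
from torchvision.transforms import ToTensor, Compose, Resize, Lambda

from opensora.models.causalvideovae import ae_wrapper  # new wrapper registry

device = "cuda"
# "CausalVAEModel_4x8x8" is an assumed registry key; use the one matching your checkpoint.
vae = ae_wrapper["CausalVAEModel_4x8x8"]("/path/to/vae").eval().to(device).half()

# mirror the script's preprocessing: [0, 1] -> [-1, 1], then short-side resize
transform = Compose([ToTensor(), Lambda(lambda x: 2. * x - 1.), Resize(336)])
x = transform(Image.open("input.jpg").convert("RGB"))            # c h w
x = x.unsqueeze(0).unsqueeze(2).to(device, dtype=torch.float16)  # b c t h w, t == 1

with torch.no_grad():
    latents = vae.encode(x)                        # latent tensor, per the script
    recon = vae.decode(latents.to(torch.float16))  # b t c h w
```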
diff --git a/examples/rec_imvi_vae.py b/examples/rec_imvi_vae.py
deleted file mode 100644
index 35315157f..000000000
--- a/examples/rec_imvi_vae.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import math
-import random
-import argparse
-from typing import Optional
-
-import cv2
-import numpy as np
-import numpy.typing as npt
-import torch
-from PIL import Image
-from decord import VideoReader, cpu
-from torch.nn import functional as F
-from pytorchvideo.transforms import ShortSideScale
-from torchvision.transforms import Lambda, Compose
-
-import sys
-sys.path.append(".")
-
-from opensora.models.ae import getae_wrapper
-from opensora.dataset.transform import CenterCropVideo, resize
-from opensora.models.ae.videobase import CausalVAEModel
-
-def process_in_chunks(
- video_data: torch.Tensor,
- model: torch.nn.Module,
- chunk_size: int,
- overlap: int,
- device: str,
-):
- assert (chunk_size + overlap - 1) % 4 == 0
- num_frames = video_data.size(2)
- output_chunks = []
-
- start = 0
- while start < num_frames:
- end = min(start + chunk_size, num_frames)
- if start + chunk_size + overlap < num_frames:
- end += overlap
- chunk = video_data[:, :, start:end, :, :]
-
- with torch.no_grad():
- chunk = chunk.to(device)
- latents = model.encode(chunk)
- recon_chunk = model.decode(latents.half()).cpu().float() # b t c h w
- recon_chunk = recon_chunk.permute(0, 2, 1, 3, 4)
-
- if output_chunks:
- overlap_step = min(overlap, recon_chunk.shape[2])
- overlap_tensor = (
- output_chunks[-1][:, :, -overlap_step:] * 1 / 4
- + recon_chunk[:, :, :overlap_step] * 3 / 4
- )
- output_chunks[-1] = torch.cat(
- (output_chunks[-1][:, :, :-overlap], overlap_tensor), dim=2
- )
- if end < num_frames:
- output_chunks.append(recon_chunk[:, :, overlap:])
- else:
- output_chunks.append(recon_chunk[:, :, :, :, :])
- else:
- output_chunks.append(recon_chunk)
- start += chunk_size
- return torch.cat(output_chunks, dim=2).permute(0, 2, 1, 3, 4)
-
-
-def array_to_video(image_array: npt.NDArray, fps: float = 30.0, output_file: str = 'output_video.mp4') -> None:
- height, width, channels = image_array[0].shape
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
- video_writer = cv2.VideoWriter(output_file, fourcc, float(fps), (width, height))
-
- for image in image_array:
- image_rgb = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
- video_writer.write(image_rgb)
-
- video_writer.release()
-
-
-def custom_to_video(x: torch.Tensor, fps: float = 2.0, output_file: str = 'output_video.mp4') -> None:
- x = x.detach().cpu()
- x = torch.clamp(x, -1, 1)
- x = (x + 1) / 2
- x = x.permute(0, 2, 3, 1).numpy()
- x = (255 * x).astype(np.uint8)
- array_to_video(x, fps=fps, output_file=output_file)
- return
-
-
-def read_video(video_path: str, num_frames: int, sample_rate: int) -> torch.Tensor:
- decord_vr = VideoReader(video_path, ctx=cpu(0))
- total_frames = len(decord_vr)
- sample_frames_len = sample_rate * num_frames
-
- if total_frames > sample_frames_len:
- s = random.randint(0, total_frames - sample_frames_len - 1)
- s = 0
- e = s + sample_frames_len
- num_frames = num_frames
- else:
- s = 0
- e = total_frames
- num_frames = int(total_frames / sample_frames_len * num_frames)
- print(f'sample_frames_len {sample_frames_len}, only can sample {num_frames * sample_rate}', video_path,
- total_frames)
-
- frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
- video_data = decord_vr.get_batch(frame_id_list).asnumpy()
- video_data = torch.from_numpy(video_data)
- video_data = video_data.permute(3, 0, 1, 2) # (T, H, W, C) -> (C, T, H, W)
- return video_data
-
-
-class ResizeVideo:
- def __init__(
- self,
- size,
- interpolation_mode="bilinear",
- ):
- self.size = size
-
- self.interpolation_mode = interpolation_mode
-
- def __call__(self, clip):
- _, _, h, w = clip.shape
- if w < h:
- new_h = int(math.floor((float(h) / w) * self.size))
- new_w = self.size
- else:
- new_h = self.size
- new_w = int(math.floor((float(w) / h) * self.size))
- return torch.nn.functional.interpolate(
- clip, size=(new_h, new_w), mode=self.interpolation_mode, align_corners=False, antialias=True
- )
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
-
-
-def preprocess(video_data: torch.Tensor, short_size: int = 128, crop_size: Optional[int] = None) -> torch.Tensor:
- transform = Compose(
- [
- Lambda(lambda x: ((x / 255.0) * 2 - 1)),
- ResizeVideo(size=short_size),
- CenterCropVideo(crop_size) if crop_size is not None else Lambda(lambda x: x),
- ]
- )
-
- video_outputs = transform(video_data)
- video_outputs = torch.unsqueeze(video_outputs, 0)
-
- return video_outputs
-
-
-def main(args: argparse.Namespace):
- device = args.device
- kwarg = {}
- # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir='cache_dir', **kwarg).to(device)
- vae = getae_wrapper(args.ae)(args.ae_path, **kwarg).to(device)
- if args.enable_tiling:
- vae.vae.enable_tiling()
- vae.vae.tile_overlap_factor = args.tile_overlap_factor
- vae.eval()
- vae = vae.to(device)
- vae = vae.half()
-
- with torch.no_grad():
- x_vae = preprocess(read_video(args.video_path, args.num_frames, args.sample_rate), args.resolution,
- args.crop_size)
- x_vae = x_vae.to(device, dtype=torch.float16) # b c t h w
- if args.enable_time_chunk:
- video_recon = process_in_chunks(x_vae, vae, 7, 2, device)
- else:
- latents = vae.encode(x_vae)
- latents = latents.to(torch.float16)
- video_recon = vae.decode(latents) # b t c h w
-
- if video_recon.shape[2] == 1:
- x = video_recon[0, 0, :, :, :]
- x = x.squeeze()
- x = x.detach().cpu().numpy()
- x = np.clip(x, -1, 1)
- x = (x + 1) / 2
- x = (255 * x).astype(np.uint8)
- x = x.transpose(1, 2, 0)
- image = Image.fromarray(x)
- image.save(args.rec_path.replace('mp4', 'jpg'))
- else:
- custom_to_video(video_recon[0], fps=args.fps, output_file=args.rec_path)
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- parser.add_argument('--video_path', type=str, default='')
- parser.add_argument('--rec_path', type=str, default='')
- parser.add_argument('--ae', type=str, default='')
- parser.add_argument('--ae_path', type=str, default='')
- parser.add_argument('--model_path', type=str, default='results/pretrained')
- parser.add_argument('--fps', type=int, default=30)
- parser.add_argument('--resolution', type=int, default=336)
- parser.add_argument('--crop_size', type=int, default=None)
- parser.add_argument('--num_frames', type=int, default=100)
- parser.add_argument('--sample_rate', type=int, default=1)
- parser.add_argument('--device', type=str, default="cuda")
- parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
- parser.add_argument('--enable_tiling', action='store_true')
- parser.add_argument('--enable_time_chunk', action='store_true')
-
- args = parser.parse_args()
- main(args)
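The deletion above also drops the `process_in_chunks` helper and its `--enable_time_chunk` path; the new `rec_video.py` below decodes in one pass (or via the VAE's own tiling). For anyone who still needs temporally chunked decoding, a condensed sketch of the deleted overlap-blending step, with the same 1/4 : 3/4 weighting (the function name and layout are ours, not the repo's):

```python
import torch

def blend_overlap(prev: torch.Tensor, nxt: torch.Tensor, overlap: int) -> torch.Tensor:
    """Stitch two adjacent reconstructions along the frame axis (dim 2, b c t h w).

    The trailing `overlap` frames of `prev` are mixed with the leading `overlap`
    frames of `nxt` at 1/4 : 3/4, the same weighting the deleted
    process_in_chunks helper used to hide seams between chunks.
    """
    mixed = prev[:, :, -overlap:] * 0.25 + nxt[:, :, :overlap] * 0.75
    return torch.cat((prev[:, :, :-overlap], mixed, nxt[:, :, overlap:]), dim=2)

# usage: video = blend_overlap(chunk_a, chunk_b, overlap=2)
# with both chunks shaped (b, c, t, h, w)
```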
diff --git a/examples/rec_video.py b/examples/rec_video.py
new file mode 100644
index 000000000..9e1888281
--- /dev/null
+++ b/examples/rec_video.py
@@ -0,0 +1,159 @@
+import argparse
+
+import cv2
+import numpy as np
+import numpy.typing as npt
+import torch
+from decord import VideoReader, cpu
+from torchvision.transforms import Lambda, Compose
+
+import sys
+sys.path.append(".")  # make `opensora` importable when run from the repo root
+
+from opensora.models.causalvideovae import ae_wrapper
+from opensora.dataset.transform import ToTensorVideo, CenterCropResizeVideo
+
+
+def array_to_video(image_array: npt.NDArray, fps: float = 30.0, output_file: str = 'output_video.mp4') -> None:
+ height, width, channels = image_array[0].shape
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+ video_writer = cv2.VideoWriter(output_file, fourcc, float(fps), (width, height))
+
+ for image in image_array:
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+ video_writer.write(image_rgb)
+
+ video_writer.release()
+
+
+def custom_to_video(x: torch.Tensor, fps: float = 2.0, output_file: str = 'output_video.mp4') -> None:
+ x = x.detach().cpu()
+ x = torch.clamp(x, -1, 1)
+ x = (x + 1) / 2
+ x = x.permute(0, 2, 3, 1).numpy()
+ x = (255 * x).astype(np.uint8)
+ array_to_video(x, fps=fps, output_file=output_file)
+ return
+
+
+def read_video(video_path: str, num_frames: int, sample_rate: int) -> torch.Tensor:
+ decord_vr = VideoReader(video_path, ctx=cpu(0))
+ total_frames = len(decord_vr)
+ sample_frames_len = sample_rate * num_frames
+
+    # sample the first `num_frames * sample_rate` frames, evenly spaced
+    s = 0
+    e = min(sample_frames_len, total_frames)  # guard against videos shorter than requested
+    if e < sample_frames_len:
+        print(f'requested {sample_frames_len} frames but {video_path} only has {total_frames}')
+
+ frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
+ video_data = decord_vr.get_batch(frame_id_list).asnumpy()
+ video_data = torch.from_numpy(video_data)
+ video_data = video_data.permute(3, 0, 1, 2) # (T, H, W, C) -> (C, T, H, W)
+ return video_data
+
+
+def preprocess(video_data: torch.Tensor, height: int = 128, width: int = 128) -> torch.Tensor:
+ transform = Compose(
+ [
+ ToTensorVideo(),
+ CenterCropResizeVideo((height, width)),
+ Lambda(lambda x: 2. * x - 1.)
+ ]
+ )
+
+ video_outputs = transform(video_data)
+ video_outputs = torch.unsqueeze(video_outputs, 0)
+
+ return video_outputs
+
+
+def main(args: argparse.Namespace):
+ device = args.device
+ kwarg = {}
+    vae = ae_wrapper[args.ae](args.ae_path, **kwarg).eval().to(device)
+    if args.enable_tiling:
+        vae.vae.enable_tiling()
+        vae.vae.tile_overlap_factor = args.tile_overlap_factor
+        # the tile_sample_min_size* flags below can be wired in here to trade
+        # decoding speed for memory on long or high-resolution clips
+    dtype = torch.float32
+    vae = vae.to(device, dtype=dtype)
+
+    with torch.no_grad():
+        x_vae = preprocess(read_video(args.video_path, args.num_frames, args.sample_rate), args.height,
+                           args.width)
+        print(x_vae.shape)
+        x_vae = x_vae.to(device, dtype=dtype)  # b c t h w
+        latents = vae.encode(x_vae)
+        print(latents.shape)
+        latents = latents.to(dtype)
+        video_recon = vae.decode(latents)  # b t c h w
+        print(video_recon.shape)
+
+ custom_to_video(video_recon[0], fps=args.fps, output_file=args.rec_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--video_path', type=str, default='')
+ parser.add_argument('--rec_path', type=str, default='')
+ parser.add_argument('--ae', type=str, default='')
+ parser.add_argument('--ae_path', type=str, default='')
+ parser.add_argument('--model_path', type=str, default='results/pretrained')
+ parser.add_argument('--fps', type=int, default=30)
+ parser.add_argument('--height', type=int, default=336)
+ parser.add_argument('--width', type=int, default=336)
+ parser.add_argument('--num_frames', type=int, default=100)
+ parser.add_argument('--sample_rate', type=int, default=1)
+ parser.add_argument('--device', type=str, default="cuda")
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument('--tile_sample_min_size', type=int, default=512)
+ parser.add_argument('--tile_sample_min_size_t', type=int, default=33)
+ parser.add_argument('--tile_sample_min_size_dec', type=int, default=256)
+ parser.add_argument('--tile_sample_min_size_dec_t', type=int, default=33)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--save_memory', action='store_true')
+
+ args = parser.parse_args()
+ main(args)
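A quick smoke test of the preprocessing contract the new script relies on, using synthetic uint8 frames in place of decord output. Shapes follow the script's own comments; we assume `ToTensorVideo` accepts `(C, T, H, W)` uint8 video, as the permute in `read_video` implies, and that `CenterCropResizeVideo` crops then resizes to the given `(height, width)`.

```python
import torch
from torchvision.transforms import Compose, Lambda

from opensora.dataset.transform import ToTensorVideo, CenterCropResizeVideo

# synthetic stand-in for read_video(): uint8 frames in (C, T, H, W)
video = torch.randint(0, 256, (3, 17, 360, 640), dtype=torch.uint8)

transform = Compose([
    ToTensorVideo(),                    # uint8 -> float in [0, 1]
    CenterCropResizeVideo((336, 336)),  # center-crop, then resize to height x width
    Lambda(lambda x: 2. * x - 1.),      # rescale to [-1, 1] for the VAE
])
x = transform(video).unsqueeze(0)       # -> (B, C, T, H, W), ready for vae.encode
assert x.shape == (1, 3, 17, 336, 336)
```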
diff --git a/examples/refine_prompt.txt b/examples/refine_prompt.txt
new file mode 100644
index 000000000..7c5274867
--- /dev/null
+++ b/examples/refine_prompt.txt
@@ -0,0 +1,8 @@
+A dynamic painting by Yoji Shinkawa featuring a stylish sniper demon poised on a rooftop under the moonlit sky, aiming with precision while shadows dance around its menacing form, capturing an intense atmosphere. The scene is framed with a low-angle perspective, emphasizing the demon's dominance, as wisps of fog curl around its feet and the faint glow of city lights flicker below.
+A captivating painting by Zdzislaw Beksinski, trending on ArtStation, depicts a group of enigmatic cultists dancing energetically around a magnificent, towering alpaca in the middle of a desert at night. The flickering firelight casts dynamic, eerie shadows on the scene, highlighting the surreal, otherworldly atmosphere. In the distance, a few cultists can be seen drumming passionately while others wave incense burners, adding layers of smoke that drift hauntingly against the moonlit sky. The composition is enhanced by dramatic camera angles, with low shots emphasizing the grandeur of the alpaca and high shots capturing the frantic movements of the figures below.
+In a sun-dappled medieval forest clearing, bathed with warm, golden light filtering through the thick canopy above, the regal King Arthur stands tall, his majestic presence emphasized by his ornate armor catching glints of the sun, while his squirrel wife, dainty yet graceful with her bushy tail and sparkling eyes, scampers playfully across his armored shoulders. The camera slowly pans from the glistening dew on the vibrant green leaves to the couple, capturing their tender interactions as King Arthur gently extends a hand, and his cheeky wife nibbles on an acorn. Their love creates a serene, almost magical atmosphere, with soft shadows dancing around them, and the sounds of chirping birds and rustling leaves enhancing the enchanting scene.
+A frontal portrait of ragged, worried twin women, painted in the styles of John Singer Sargent and J. C. Leyendecker, where one woman clasps her forehead in anxiety while the other looks pensively into the distance. The scene is set against a dimly lit background with dramatic shadows that highlight their haggard features and worn clothing. The ambient lighting creates a melancholic atmosphere, emphasizing their shared emotional burden. The camera captures them from the waist up, focusing on their expressions and the intricate play of light and shadow on their faces, evoking a sense of profound concern and endurance.
+A highly detailed portrait of a female robot composed entirely of intricately woven code, captured in an octane render at 8K resolution, trending on ArtStation. She is posed gracefully as if in deep contemplation, her luminous eyes focusing intently on a distant point. The scene is set in a futuristic, neon-lit laboratory, with background screens flickering with cascading lines of data. The camera zooms in closely to emphasize the complex textures and circuits forming her face, catching the interplay of light and shadow that highlights the metallic and organic elements. The atmosphere is charged with a sense of technological transcendence, making the viewer feel enveloped in the essence of advanced robotics and artificial intelligence.
+A lone, weathered artist stands amidst the ruins of an ancient city, sketching the remnants of towering stone arches and crumbling walls bathed in the golden glow of the setting sun, while shadows dance and flicker across the scene, casting an air of melancholic nostalgia. The faint chirping of crickets adds to the atmosphere, as the artist's brush captures fleeting images of what once was, now trending on ArtStation.
+A serene, crystal-clear swimming pool, completely devoid of people, glistens under the gentle embrace of sunlight. The camera pans smoothly over the surface, capturing the glimmering reflections dancing with shadows cast by nearby trees. Sunbeams pierce through the lush foliage, creating a dappled pattern on the tiled border. As the scene progresses, a soft breeze ripples the water, sending delicate waves cascading towards the pool’s edge, subtly shifting the play of light and shadow. The peaceful atmosphere is enhanced by the distant rustle of leaves and the occasional chirp of birds, making the scene an epitome of tranquil solitude.
+A muscular dog lifting heavy barbells in an epic oil painting, showcasing the dog's powerful stance and focused expression as it engages in a vigorous workout. The scene captures the action from a low-angle perspective, emphasizing the dog's strength and determination. Dramatic lighting casts deep shadows, adding intensity to the atmosphere, while subtle beams of sunlight filter through the gym windows, highlighting the sheen of sweat on the dog's fur. In the background, other gym-goers are caught in mid-action, adding to the dynamic and energetic ambiance of the setting.
\ No newline at end of file
diff --git a/examples/sora.txt b/examples/sora.txt
new file mode 100644
index 000000000..582e24510
--- /dev/null
+++ b/examples/sora.txt
@@ -0,0 +1,51 @@
+A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors.
+Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
+Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image.
+A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
+This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird's head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird's striking appearance.
+Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
+A young man in his 20s is sitting on a piece of cloud in the sky, reading a book.
+A petri dish with a bamboo forest growing within it that has tiny red pandas running around.
+The camera rotates around a large stack of vintage televisions all showing different programs: 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc., set inside a large New York museum gallery.
+3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.
+Historical footage of California during the gold rush.
+A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+Extreme close up of a 24 year old woman's eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic.
+A cartoon kangaroo disco dances.
+A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera.
+A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer.
+Borneo wildlife on the Kinabatangan River
+A Chinese Lunar New Year celebration video with Chinese Dragon.
+The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains with a clear blue sky above with wispy clouds.
+Reflections in the window of a train traveling through the Tokyo suburbs.
+A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.
+A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect.
+A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds.
+A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack.
+New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York.
+A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
+Tour of an art gallery with many beautiful works of art in different styles.
+Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes.
+A stop motion animation of a flower growing out of the windowsill of a suburban house.
+The story of a robot's life in a cyberpunk setting.
+An extreme close-up of a gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt, he wears a brown beret and glasses and has a very professorial appearance, and at the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
+Basketball through hoop then explodes
+Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care.
+A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table; her expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker. The grandmother wears a light blue blouse adorned with floral patterns; several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood.
+Step-printing scene of a person running, cinematic film shot in 35mm
+Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing.
+Tiltshift of a construction site filled with workers, equipment, and heavy machinery.
+A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lightning bolts down to the earth.
+A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glistens off of their fur.
+The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot.
+The camera directly faces colorful buildings in Burano Italy. An adorable dalmatian looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings.
+An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style.
+This close-up shot of a chameleon showcases its striking color-changing capabilities. The background is blurred, drawing attention to the animal's striking appearance.
+A corgi vlogging itself in tropical Maui.
+A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. The scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat's orange fur. The shot is clear and sharp, with a shallow depth of field.
+Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere.
\ No newline at end of file
diff --git a/examples/sora_refine.txt b/examples/sora_refine.txt
new file mode 100644
index 000000000..1e7c627ef
--- /dev/null
+++ b/examples/sora_refine.txt
@@ -0,0 +1,51 @@
+A stylish woman strides confidently down a Tokyo street, her black leather jacket, red dress, and black boots accentuating her confident stride. She carries a sleek black purse and wears sunglasses and bold red lipstick, her presence illuminated by the warm, glowing neon lights and animated city signage. The street is damp and reflective, creating a mirror-like effect that amplifies the colorful lights. As she navigates through the bustling crowd, she occasionally pauses to glance at her phone, her eyes scanning the vibrant scene around her. The interplay of light and shadow adds depth to the atmosphere, making the scene come alive with the hum of city life.
+Several giant woolly mammoths trudge through a snowy meadow, their long, woolly fur lightly blowing in the wind. In the distance, dramatic snow-capped mountains tower, bathed in the warm glow of mid-afternoon light, with wispy clouds scattered across the sky. The low-angle camera captures the large, furry mammals in stunning depth of field, as some mammoths stop to graze on the sparse vegetation while others trumpet and nudge each other playfully. Shadows from the setting sun cast long, dramatic silhouettes, adding depth and atmosphere to the serene yet dynamic scene.
+A cinematic trailer showcasing the adventures of a 30-year-old spaceman wearing a red wool-knitted motorcycle helmet, set against the backdrop of a vibrant blue sky and a vast salt desert. Shot on 35mm film, the vivid colors bring the scene to life as the spaceman navigates the rugged terrain, dodging cacti and leaping over rocky outcrops. The camera captures sweeping aerial shots and close-ups of the spaceman's determined expression, accentuating the interplay of light and shadow. As the scene unfolds, he encounters strange alien creatures and discovers hidden treasures, creating an atmosphere of excitement and mystery.
+A drone captures a breathtaking view of waves crashing against the rugged cliffs along Big Sur's Garay Point Beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore, casting long shadows and creating a warm, serene atmosphere. A small island with a lighthouse stands in the distance, adding to the scene's charm. Seagulls glide overhead, and a lone surfer paddles out to catch the last waves of the day, as the ocean breeze rustles through the nearby vegetation, bringing life to the tranquil coastal landscape.
+The lush shrubbery clings to the cliff's edge, while the steep drop from the road down to the beach presents a dramatic spectacle. The jagged cliff's edges jut out over the sea, capturing the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. As the golden light of the setting sun casts long shadows, seagulls soar overhead and the gentle waves crash below, creating a dynamic and captivating scene.
+An animated scene features a close-up of a short, fluffy monster kneeling beside a melting red candle. The 3D, realistic art style focuses on the interplay of lighting and texture, casting intriguing shadows across the scene. The monster gazes at the flame with wide, curious eyes, its fur gently ruffling in the warm, flickering glow. The camera slowly zooms in, capturing the intricate details of the monster's fur and the delicate, molten wax droplets. The atmosphere is filled with a sense of wonder and curiosity, as the monster tentatively reaches out a paw, as if to touch the flame, while the candlelight dances and flickers around it.
+A young child with an open mouth, standing in a playful pose, its expression bright with curiosity, as if exploring the world around it for the first time. The scene is bathed in warm colors and dramatic lighting, creating a cozy atmosphere. The child reaches out curiously towards a fluttering butterfly, its eyes wide with wonder, while gentle shadows dance across the ground, adding depth and texture to the moment.
+A beautifully crafted papercraft world of a vibrant coral reef teems with colorful fish and sea creatures darting gracefully through the water, their scales shimmering in the dappled sunlight that filters through the waves. The scene is alive with movement as a playful sea turtle glides past schools of neon fish, while a curious octopus tentatively reaches out to explore a sunken shipwreck, casting intricate shadows on the ocean floor. The gentle rustling of seaweed sways in the current, adding a dynamic rhythm to the underwater ballet, while the overall atmosphere exudes a sense of tranquility and wonder.
+A close-up shot captures a Victoria crowned pigeon, its striking blue plumage and vibrant red chest standing out prominently. The bird's delicate, lacy crest and striking red eye add to its regal appearance. The pigeon's head is tilted slightly to the side, giving it a majestic look. The background is blurred, drawing attention to the bird's striking features. Soft light bathes the scene, casting gentle shadows that enhance the texture of its feathers. The pigeon flutters its wings slightly, and its beak tilts upwards, as if curiously observing the surroundings, creating a dynamic and captivating atmosphere.
+A photorealistic close-up video captures two intricately detailed pirate ships clashing in a dramatic sea battle, all set inside a steaming cup of coffee. The scene is illuminated by the warm, golden glow of the morning sun filtering through the steam rising from the coffee, casting dynamic shadows that dance across the surface. The camera slowly pans and zooms in on the action, capturing the intense moment as cannons fire and cannonballs splinter the wooden hulls. The atmosphere is thick with tension and smoke, as the pirates shout commands and swing their swords amidst the chaos. The camera occasionally shifts to a wide shot, showing the entire cup and the swirling coffee, adding to the surreal and captivating nature of the scene.
+A young man in his early twenties, with tousled hair and a pair of glasses perched on the end of his nose, sits serenely on a fluffy, white cloud floating high in the sky. He is engrossed in a book, occasionally glancing up to watch the birds soar around him. The sunlight filters through the wispy clouds, casting a soft, golden glow over the scene and creating playful shadows that dance on his face. As he turns a page, a gentle breeze rustles the pages, and he smiles, feeling the thrill of weightlessness and freedom.
+In a petri dish, a lush bamboo forest thrives, with tiny red pandas scurrying playfully among the tall stalks. The scene is bathed in soft, diffused light, casting gentle shadows that dance across the forest floor. As the pandas leap and climb, some pause to nibble on bamboo shoots while others interact with each other, adding a lively atmosphere to the miniature ecosystem.
+The camera slowly rotates around a large stack of vintage televisions, each displaying a unique program: 1950s sci-fi movies, horror movies, news broadcasts, static, and a 1970s sitcom. The scene is set inside a grand New York museum gallery, where the soft glow of the televisions casts flickering shadows on the walls, creating an eerie and nostalgic atmosphere. As the camera moves, visitors in the gallery pause to watch, their faces illuminated by the colorful light of the screens. One visitor leans in closer to a television showing an old news broadcast, while another takes a selfie with a 1950s sci-fi movie playing in the background. The overall effect is a captivating blend of past and present, as the past comes alive in a dynamic, immersive display.
+A 3D animation depicts a small, round, fluffy creature with big, expressive eyes exploring a vibrant, enchanted forest. This whimsical creature, a blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. As the creature hops, it pauses to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. It then looks up in awe at a large, glowing tree that seems to be the heart of the forest. The camera pans smoothly to capture the creature's curiosity as it reaches out to touch a glowing flower, causing it to change colors. The scene is bathed in a soft, ethereal light, with shadows dancing gently in the background, creating an atmosphere of enchantment and wonder. The creature's playful antics and the magical ambiance make the forest come alive, as if every moment is a discovery and a delight.
+Historical footage captures rugged prospectors panning for gold in the sun-drenched rivers of California during the gold rush era. The scene is set against a backdrop of towering, dusty mountains and dense forests, with the camera panning over bustling campsites filled with makeshift tents and the sound of pickaxes striking rocks. The golden light of the setting sun casts long shadows, creating a dramatic contrast between the gleaming metal and the dusty terrain. Prospectors can be seen hauling heavy sacks of ore, while children play nearby, and townsfolk haggle over goods at makeshift markets. The atmosphere is a mix of excitement and perseverance, as the camera captures the relentless pursuit of gold and the spirit of adventure that defined this pivotal moment in American history.
+A close-up view captures a glass sphere containing a serene zen garden, where a small dwarf meticulously rakes the sand to create intricate patterns. The scene is bathed in soft, diffused light, casting gentle shadows that dance across the sand, adding depth to the tranquil atmosphere. The dwarf pauses occasionally to arrange small stones and pine needles, adding a touch of nature to his delicate designs. In the background, the glass sphere glistens subtly, reflecting the ambient light and enhancing the sense of a miniature world within.
+A 24-year-old woman's eye blinks rapidly as she stands in the vibrant, bustling streets of Marrakech during the enchanting magic hour. Captured in cinematic 70mm film, the scene is characterized by a vivid play of light and shadow, with a shallow depth of field that accentuates the colors around her. As the golden hues of the setting sun cast long shadows, she glances around, her expression a mix of curiosity and wonder. The atmosphere is charged with the exotic energy of the city, with merchants calling out their wares and the distant sound of a traditional Moroccan instrument adding to the dynamic backdrop.
+A lively cartoon kangaroo, adorned in a shimmering disco outfit, energetically dances under the dazzling lights of a vibrant disco ball, its movements casting playful shadows on the dance floor. The scene is filled with pulsating music and colorful neon lights, as the kangaroo twirls and spins, occasionally striking a pose and waving its arms to the rhythm. The atmosphere is electric, with the kangaroo's joyful energy radiating throughout the room.
+A vibrant homemade video captures the bustling streets of Lagos, Nigeria in the year 2056, showcasing the lively interactions and daily life of its residents. Shot with a mobile phone camera, the scene is illuminated by the warm, golden light of the setting sun, casting long shadows that dance across the cobblestone paths. In the video, people are seen going about their day, from vendors selling colorful goods to children playing joyfully in the park, while the camera smoothly pans to follow a group of friends laughing and chatting animatedly on a corner bench. The atmosphere is filled with the sounds of chatter and the occasional honking of cars, creating a dynamic and immersive portrayal of life in this vibrant city.
+In a cozy, sunlit bedroom, a fluffy tabby cat, with its bright green eyes gleaming, wakes up its sleeping owner by pawing at their face and meowing persistently. The owner, a young woman with tousled hair, tries to ignore the insistent feline, but the cat escalates its efforts by batting at the bed sheets and pouncing on the blankets. As morning light filters through the window, casting soft shadows across the room, the owner, with a hint of amusement, pulls out a secret stash of treats from under the pillow, hoping to temporarily distract the demanding cat. The cat, momentarily intrigued by the treats, pauses its antics, its tail flicking back and forth, creating a playful, dynamic atmosphere in the room.
+A majestic proboscis monkey swings gracefully through the lush canopy of the Borneo jungle along the Kinabatangan River, while the golden rays of the setting sun cast long shadows across the water, creating a serene yet dynamic atmosphere. The scene is alive with the sounds of birds calling and the gentle rustling of leaves as a family of pygmy elephants wades through the shallow waters, their trunks playfully splashing in the river. The camera captures the vibrant hues of the tropical foliage, highlighting the contrast between light and shadow, and the sense of tranquility and awe that pervades this untouched wilderness.
+A vibrant video showcasing a lively Chinese Lunar New Year celebration featuring a majestic Chinese dragon, gracefully dancing through the streets adorned with colorful lanterns and vibrant decorations. The scene captures the dragon's powerful movements as it weaves through the crowds, its scales shimmering under the warm, golden glow of streetlights. The camera pans and zooms to capture close-ups of the intricate dragon designs and the joyful expressions of the spectators. The atmosphere is filled with the sounds of traditional music and laughter, while the dragon occasionally interacts with the audience, adding to the dynamic energy of the celebration.
+The camera tracks a white vintage SUV with a black roof rack as it accelerates up a steep dirt road, flanked by towering pine trees on a rugged mountain slope. Dust kicks up from its tires, creating a cloud that lingers in the air. The sunlight bathes the SUV in a warm glow, casting long shadows across the scene. The dirt road curves gently into the distance, devoid of any other vehicles. Redwood trees stand tall on either side, their trunks adorned with patches of greenery. The SUV navigates the curves with ease, suggesting a rugged drive through the challenging terrain. As it rounds a bend, a deer darts across the road, adding a touch of dynamic action to the serene landscape. The dirt road is encircled by steep hills and mountains, with a clear blue sky above, dotted with wispy clouds, enhancing the sense of vastness and freedom.
+As the train speeds through the Tokyo suburbs, the rhythmic clatter of wheels against the tracks echoes through the car. Outside the window, reflections of neon signs and bustling streets flicker and blur, creating a dynamic interplay of light and shadow. Passengers inside the train, some engrossed in their books, others lost in their own thoughts, occasionally glance up to catch glimpses of the cityscape whizzing by. The train car sways gently, adding a sense of motion to the scene, as the city's vibrant energy and the quiet solitude of the train cabin blend into a captivating urban tapestry.
+A drone camera gracefully circles a historic church perched on a rugged outcropping along the Amalfi Coast, capturing its magnificent architectural details and tiered pathways and patios. Below, waves crash against the rocks, while the horizon stretches out over the coastal waters and hilly landscapes of Italy. Distant figures stroll and enjoy the breathtaking ocean views from the patios, creating a dynamic scene. The warm glow of the afternoon sun bathes the scene in a magical and romantic light, casting long shadows and adding depth to the stunning vista. The camera occasionally zooms in to highlight the intricate details of the church, then pans out to showcase the expansive coastline, creating a captivating visual narrative.
+A large orange octopus rests peacefully on the ocean floor, its tentacles sprawled around its body and eyes closed, blending seamlessly into the sandy and rocky terrain. Unaware of the impending danger, the octopus remains still as a brown, spiny king crab with long legs and antennae crawls stealthily towards it from behind a nearby rock, its claws raised and ready to strike. The scene is captured from a wide angle, showcasing the vastness and depth of the ocean, with clear blue water and rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range, focusing on the octopus and the crab while the background is subtly blurred to create a depth of field effect. The sunlight casts intricate shadows across the seafloor, adding to the dramatic tension. The crab pauses momentarily, antennae twitching, as a school of small fish dart past, momentarily catching the octopus's attention, adding a sense of movement and suspense to the scene.
+A flock of intricately designed paper airplanes glides gracefully through the dense jungle, weaving around towering trees and casting fleeting shadows on the forest floor. The sunlight filters through the canopy, creating a dappled pattern on the ground, while the gentle breeze rustles the leaves, adding a sense of movement to the scene. Some of the planes dip and dive, mimicking the erratic flight patterns of real birds, while others soar higher, catching the sunlight and glinting with a delicate sparkle. The overall atmosphere is serene yet dynamic, as the paper planes continue their playful migration through the verdant expanse.
+A captivating silhouette animation depicts a lone wolf howling at the moon, its eyes glinting with a mix of loneliness and longing. The scene is set in a moonlit forest, where shadows dance across the ground, creating an ethereal atmosphere. The wolf's howl echoes through the stillness, as it prowls cautiously through the underbrush, searching for its pack. As the camera pans slowly, the wolf's silhouette comes into sharper focus, highlighting its determined movements. Suddenly, it catches a glimpse of movement in the distance and lets out a triumphant cry, racing towards the reunion. The camera zooms in as the wolf reunites with its pack, their silhouettes intertwining in a heartwarming display of unity and companionship.
+In a surreal, underwater New York City, fish, whales, sea turtles, and sharks glide gracefully through the streets, weaving past skyscrapers submerged beneath the waves. Sunlight filters down from above, casting shimmering reflections on the wet pavement, while the occasional dolphin leaps playfully, sending droplets of water into the air. Schools of smaller fish dart between the cars and buildings, their scales catching the light, creating a mesmerizing dance of light and shadow. The atmosphere is alive with the gentle hum of marine life, a stark contrast to the bustling cityscape that once thrived above.
+A litter of golden retriever puppies frolic in the snow, their heads popping out of the pristine white blanket, as they chase each other playfully. The soft sunlight filters through the trees, casting dappled shadows across the scene, while the puppies' fur glistens with a sparkling sheen. One puppy tumbles and rolls, sending a spray of snow into the air, while another leaps over a snowbank, adding to the lively and joyful atmosphere.
+A group of art enthusiasts embarks on a guided tour of an expansive art gallery, where they marvel at numerous stunning works of art in various styles, each piece showcasing the artist's unique touch. The gallery, bathed in soft, ambient lighting, casts gentle shadows that dance across the walls, enhancing the intricate details of the masterpieces. As they move from room to room, they pause to admire a vibrant abstract painting, discuss the symbolism in a sculpture, and engage in lively conversation about the historical context of an ancient masterpiece. The atmosphere is filled with a sense of wonder and curiosity, as the visitors' eyes follow the fluid movements of the guide, who skillfully weaves through the crowds, sharing stories and insights that bring the artwork to life.
+In the beautiful, snowy Tokyo city, the camera gracefully weaves through bustling streets, capturing the lively atmosphere as people revel in the stunning winter weather. Snowflakes and gorgeous sakura petals dance through the air, adding a magical touch to the scene. The camera follows a group of friends laughing and sipping hot tea at a street stall, while others eagerly shop for seasonal treats. The soft glow of streetlights casts a warm, inviting light, contrasting with the crisp shadows of the snow-covered buildings, creating a serene yet vibrant atmosphere.
+A captivating stop-motion animation showcases a delicate flower gradually sprouting from the windowsill of a quaint suburban house. The scene is bathed in the soft, golden light of dawn, casting gentle shadows that dance across the room. As the camera slowly zooms in, the flower's petals begin to unfurl, revealing vibrant colors and intricate details. The camera pans out to capture the serene morning atmosphere, where sunlight filters through the window, illuminating the room and casting a warm glow on the growing flower. The animation also includes subtle movements such as a gentle breeze causing the curtains to sway and the flower's leaves rustling softly, adding a dynamic and lifelike quality to the scene.
+In a dimly lit, rain-soaked alleyway of a cyberpunk city, a lone robot with gleaming metallic limbs and piercing blue optics navigates through the shadows. The neon signs flicker overhead, casting a kaleidoscope of colors across its polished surface. The robot's advanced sensors scan the environment, detecting the faint hum of distant drones and the muffled chatter of pedestrians. As it moves, its mechanical joints creak softly, echoing through the deserted street. The scene is bathed in a mix of stark contrasts—bright lights and deep shadows, creating an atmosphere thick with tension and mystery. The robot pauses momentarily to inspect a small, mysterious package on the ground, its advanced algorithms racing to decipher its contents.
+An extreme close-up captures a gray-haired man with a beard in his 60s, deep in thought as he sits at a Parisian cafe, contemplating the history of the universe. His eyes focus intently on people walking offscreen, while he remains mostly motionless. Dressed in a wool coat, a button-down shirt, a brown beret, and glasses, he exudes a professorial demeanor. The man occasionally glances around, his gaze lingering on the bustling Parisian streets and cityscape in the background. The scene is bathed in golden light, reminiscent of a cinematic 35mm film. As he leans forward slightly, his eyes widen in a moment of epiphany, and he offers a subtle, closed-mouth smile, suggesting he has found the answer to the mystery of life. The depth of field creates a dynamic interplay of light and shadow, enhancing the atmosphere of intellectual contemplation.
+A vibrant red basketball, propelled by a powerful leap from a determined player, soars through the air before bursting into a colorful explosion of confetti against the backdrop of a sunlit outdoor basketball court. The camera captures the dramatic moment from a low angle, emphasizing the height and energy of the shot. The scene is bathed in warm, golden light, casting long shadows that dance on the ground. Spectators in the stands erupt into cheers and applause, adding to the dynamic atmosphere of excitement and celebration.
+Under the scorching desert sun, archeologists carefully excavate and dust off a generic plastic chair, meticulously preserving its every detail. The scene is bathed in the golden light of the setting sun, casting long shadows across the sandy dunes. One archeologist delicately brushes away the dirt, while another documents the find with a camera, capturing every nuance. The atmosphere is tense with anticipation, as they uncover the chair's history, each grain of dust revealing a piece of the past.
+A grandmother with neatly combed grey hair stands behind a colorful birthday cake adorned with numerous candles, her expression radiating pure joy, a happy glow in her eyes. She leans forward and gently blows out the candles with a soft puff, causing the pink frosting and sprinkles to catch the light. The grandmother, dressed in a light blue blouse with floral patterns, is surrounded by several friends and family seated at the wooden dining room table, celebrating. The scene is beautifully captured in a 3/4 view, with warm color tones and soft lighting enhancing the cozy atmosphere. As she leans back, she smiles warmly, and one of the guests reaches for a slice of cake, while another clinks a glass in a toast. The camera subtly pans to capture the lively chatter and laughter, creating a dynamic and cinematic moment.
+A dynamic scene captures a determined athlete sprinting through a sun-drenched, tree-lined street, shot in cinematic 35mm film. The camera follows closely, emphasizing the runner's intense focus and powerful strides. Sunlight filters through the leaves, casting dappled shadows on the ground, while the rhythmic pounding of footsteps echoes in the air. In the background, the faint hum of distant traffic and the occasional rustle of leaves add to the atmosphere. The camera occasionally pans to capture the runner's determined expression, then zooms in on the sweat glistening on their brow, creating a sense of urgency and movement.
+Five playful gray wolf pups frolic and chase each other around a remote gravel road, their fur glistening in the soft morning light. The scene is framed with lush grass on either side, as the pups dart and leap, nipping at each other's tails and ears. Occasionally, one pup pauses to howl, its voice echoing through the stillness, while another tumbles to the ground, rolling and pouncing back up. The sunlight casts long shadows, adding depth to the vibrant atmosphere, as the pups' playful antics bring life to the serene wilderness.
+A tilt-shift shot captures the bustling construction site, where workers in hard hats and neon vests navigate through the chaos, operating heavy machinery and moving equipment. The scene is bathed in the golden hues of the setting sun, casting long shadows and adding depth to the vibrant activity. In the foreground, a crane swings a massive beam into place, while in the background, excavators dig deep into the earth, creating a dynamic and lively atmosphere.
+A colossal, towering cloud shaped like a man looms menacingly over the earth, casting long, ominous shadows across the landscape. The cloud man's immense form crackles with electricity, shooting bolts of lightning down to the ground and illuminating the darkening sky with bursts of brilliant light. Thunder rumbles in the distance, adding to the dramatic atmosphere, as the cloud's eyes glow with an otherworldly intensity. The earth trembles slightly beneath the impact of each lightning strike, while the wind picks up, rustling the leaves and sending small animals scurrying for cover.
+A Samoyed and a Golden Retriever are playfully romping through a futuristic neon-lit city at night, their fur glistening under the vibrant lights that cast colorful reflections. As they dart between towering skyscrapers, their paws create a rhythmic patter on the sleek, rain-slicked pavement. The neon glow forms a kaleidoscope of colors, casting dynamic shadows that dance across their forms. Occasionally, they leap and chase each other, their joyful barks echoing through the night air, adding to the electric atmosphere. The scene is captured in a series of rapid-fire shots, emphasizing the motion and energy of their playful antics.
+The Glenfinnan Viaduct, a historic railway bridge in Scotland, UK, stands majestically over the West Highland Line between the towns of Mallaig and Fort William. A steam train chugs along the arch-covered viaduct, casting long shadows across the landscape. The lush greenery and rugged rocky mountains create a breathtaking backdrop for the train journey. The blue sky and shining sun illuminate the scene, making it a perfect day to explore this picturesque spot. Birds soar overhead, and a gentle breeze rustles through the trees, adding to the dynamic atmosphere.
+The camera captures a lively scene in Burano, Italy, where colorful buildings line the streets. An adorable dalmatian peers curiously through a window on the ground floor. Amidst the vibrant architecture, people stroll and cycle along the canal streets, their movements creating a dynamic atmosphere. Sunlight casts playful shadows, highlighting the bright hues of the buildings, while a few boats gently glide by, adding to the sense of movement and life in the bustling scene.
+A cheerful otter confidently balances on a surfboard, donning a bright yellow lifejacket, as it glides through the shimmering turquoise waters near lush tropical islands. The scene is rendered in a 3D digital art style, with the sunlight casting playful shadows on the water's surface. The otter occasionally dips its paws into the water, sending up sprays of droplets that catch the light, adding a sense of motion and excitement to the tranquil atmosphere.
+In this captivating close-up shot, a chameleon displays its remarkable color-changing abilities, its vibrant hues shifting subtly in the soft, diffused light. The blurred background highlights the animal's striking appearance, while the interplay of light and shadow accentuates the intricate details of its skin. As the chameleon slowly moves its head and eyes, the colors blend seamlessly, creating a mesmerizing dance of hues that draw the viewer's attention.
+A charming corgi, with its fluffy fur and expressive eyes, enthusiastically vlogs itself exploring the sun-drenched beaches of tropical Maui. The camera captures the dog playfully chasing seagulls along the golden sand, basking in the warm sunlight, and splashing in the crystal-clear waters. As the corgi frolics, the camera follows its every move, capturing the vibrant hues of the tropical landscape and the interplay of light and shadow that dance across the scene, creating a lively and joyful atmosphere.
+A white and orange tabby cat joyfully darts through a dense garden, its wide, happy eyes scanning the branches, flowers, and leaves as it jogs forward. The narrow path between the plants frames its energetic movements, captured from a low and intimate ground-level angle. The scene, bathed in warm, cinematic tones and a grainy texture, showcases the cat's orange fur against the scattered daylight filtering through the foliage. As the cat pauses to sniff at a blooming flower, its tail flicks playfully, and it suddenly leaps over a small obstacle, adding a dynamic sense of motion to the shot. The shallow depth of field keeps the cat in sharp focus while the lush greenery blurs around it, enhancing the vivid contrast and creating a lively, immersive atmosphere.
+A breathtaking aerial view of Santorini during the blue hour captures the stunning architecture of white Cycladic buildings with blue domes, casting long shadows against the twilight sky. The caldera views are awe-inspiring, with the interplay of light and shadow creating a serene atmosphere. As the sun dips below the horizon, its fading glow bathes the scene in a warm, golden hue, while seagulls soar gracefully through the air and a few sailboats drift lazily in the caldera below.
+A tilt-shift shot captures a bustling construction site teeming with workers operating heavy machinery and equipment, as they labor tirelessly under the bright midday sun, casting long shadows across the scene. The atmosphere is charged with the hum of engines and the rhythmic clatter of tools, while workers navigate the site with purpose, some pausing to communicate or take a moment to catch their breath. The interplay of light and shadow adds depth and texture to the scene, highlighting the intricate details of the machinery and the determined expressions on the workers' faces.
diff --git a/log_allinpaint_stage1.txt b/log_allinpaint_stage1.txt
new file mode 100644
index 000000000..d53cbd7e3
--- /dev/null
+++ b/log_allinpaint_stage1.txt
@@ -0,0 +1,7687 @@
+[2024-09-18 09:19:28,881] torch.distributed.run: [WARNING]
+[2024-09-18 09:19:28,881] torch.distributed.run: [WARNING] *****************************************
+[2024-09-18 09:19:28,881] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+[2024-09-18 09:19:28,881] torch.distributed.run: [WARNING] *****************************************
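The torchrun banner above notes that each worker defaults to a single OpenMP thread to avoid oversubscribing the host. A hypothetical tuning sketch for dividing cores evenly per rank instead; `LOCAL_WORLD_SIZE` is a standard torchrun variable, but the fallback of 8 ranks is an assumption from this log, not a project default:

```python
import os

# Split host cores evenly across local ranks and export the result before
# torch/numpy initialize their thread pools (assumed 8 ranks per node here).
ranks = int(os.environ.get("LOCAL_WORLD_SIZE", "8"))
threads = max(1, (os.cpu_count() or 1) // ranks)
os.environ.setdefault("OMP_NUM_THREADS", str(threads))
```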
+[2024-09-18 09:19:34,850] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:209: ImportWarning:
+ *************************************************************************************************************
+ The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
+ The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
+ The backend in torch.distributed.init_process_group set to hccl now..
+ The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
+ The device parameters have been replaced with npu in the function below:
+ torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty
+ *************************************************************************************************************
+
+ warnings.warn(msg, ImportWarning)
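The ImportWarning above documents the monkey-patching that `torch_npu` performs. A minimal sketch of how CUDA-oriented code ends up on Ascend NPUs, assuming the same `torch_npu` package as this environment:

```python
import torch
import torch_npu  # Ascend adapter for PyTorch
# Importing this module patches torch.cuda.* and device="cuda" call sites
# to their torch.npu equivalents, per the warning text above.
from torch_npu.contrib import transfer_to_npu  # noqa: F401  (patches on import)

x = torch.randn(2, 3, device="cuda")  # transparently redirected to the NPU
print(x.device)                        # e.g. npu:0
```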
+[2024-09-18 09:19:34,948] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-18 09:19:35,006] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+[2024-09-18 09:19:35,008] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-18 09:19:35,129] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-18 09:19:35,232] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+[2024-09-18 09:19:35,273] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-18 09:19:35,300] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+skip replace _has_inf_or_nan
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _DeepSpeedEngine__check_params
+skip replace _copy_recovery_script
+skip replace __init__
+skip replace _get_expert_ckpt_name
+skip replace _change_recovery_script_permissions
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 411's current affinity list: 0-191
+pid 411's new affinity list: 24-47
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+The npu_config.on_npu is True
+pid 412's current affinity list: 0-191
+pid 412's new affinity list: 48-71
+pid 415's current affinity list: 0-191
+pid 415's new affinity list: 120-143
+pid 410's current affinity list: 0-191
+pid 410's new affinity list: 0-23
+skip replace _has_inf_or_nan
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 413's current affinity list: 0-191
+pid 413's new affinity list: 72-95
+pid 414's current affinity list: 0-191
+pid 414's new affinity list: 96-119
+pid 417's current affinity list: 0-191
+pid 417's new affinity list: 168-191
+pid 416's current affinity list: 0-191
+pid 416's new affinity list: 144-167
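The affinity lines above show each of the eight ranks being pinned from the full 0-191 core range to its own 24-core slice (0-23, 24-47, ..., 168-191). A minimal sketch of that binding on Linux; the helper name and the `LOCAL_RANK` fallback are illustrative, not taken from the training script:

```python
import os

def bind_rank_to_cores(rank: int, cores_per_rank: int = 24) -> None:
    """Pin the current process to a contiguous core slice chosen by rank."""
    start = rank * cores_per_rank
    os.sched_setaffinity(0, range(start, start + cores_per_rank))  # pid 0 = self

bind_rank_to_cores(int(os.environ.get("LOCAL_RANK", "0")))
print(sorted(os.sched_getaffinity(0)))  # mirrors the "new affinity list" lines
```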
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
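The lightning warnings above repeat setuptools' advice to migrate from `pkg_resources.declare_namespace` to PEP 420. A sketch of what that migration looks like for a hypothetical `mypkg` namespace; this is upstream guidance, not something this training run requires:

```python
# Legacy style (what lightning_fabric/__init__.py still does):
#     __import__("pkg_resources").declare_namespace(__name__)
#
# PEP 420 style: ship the namespace directory with no __init__.py at all;
# the import system then merges every "mypkg" directory found on sys.path.
import importlib.util

spec = importlib.util.find_spec("mypkg")  # "mypkg" is a hypothetical name
print("resolved" if spec is not None else "not installed")
```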
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-6]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+[RANK-2]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+[RANK-0]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+[RANK-3]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+[RANK-1]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+[RANK-5]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-18 09:19:49,537] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-18 09:19:49,537] [INFO] [comm.py:637:init_distributed] cdb=None
+[RANK-4]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint='/home/save_dir/runs/allinpaint_stage1/checkpoint-13000', logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+[the same DeprecationWarning / "[WARNING] ... HCCL backend in DeepSpeed not yet implemented" / "[INFO] ... cdb=None" triplet repeats once per rank; duplicates elided]
+[2024-09-18 09:19:49,540] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend hccl
+09/18/2024 09:19:49 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 6
+Local process index: 6
+Device: npu:6
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
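+The ds_config above is DeepSpeed ZeRO stage 2 under bf16 with fp32 gradient communication; the same settings written out as a plain Python dict (values copied from the dump; 536870912 = 2**29 elements per communication bucket):
+
+    ds_config = {
+        "bf16": {"enabled": True},
+        "communication_data_type": "fp32",   # reduce gradients in fp32 for stability
+        "gradient_clipping": 1.0,
+        "train_micro_batch_size_per_gpu": "auto",
+        "train_batch_size": "auto",
+        "gradient_accumulation_steps": "auto",
+        "zero_optimization": {
+            "stage": 2,                      # shard optimizer states and gradients
+            "overlap_comm": True,            # overlap gradient reduction with backward
+            "allgather_bucket_size": 2**29,
+            "contiguous_gradients": True,
+            "reduce_bucket_size": 2**29,
+        },
+    }
+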
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
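+The diffusers FutureWarning spells out its own migration: `from_config` should receive a config dict, not a path. A sketch of the recommended two-step pattern (the scheduler class and path here are placeholders):
+
+    from diffusers import DDPMScheduler
+
+    config = DDPMScheduler.load_config("path/to/pretrained")  # placeholder path
+    scheduler = DDPMScheduler.from_config(config)             # pass the dict, not the path
+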
+Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+[identical "Distributed environment ... Mixed precision type: bf16 / ds_config" blocks for process indices 0-5, plus repeats of the DeepSpeed HCCL and diffusers "config-passed-as-path" warnings, elided]
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-7 prints the same Namespace as RANK-4 above; its "Distributed environment / ds_config" block for process index 7 and the repeated warnings are likewise elided]
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+[the VAE init / wavelet key-list / TypedStorage lines above repeat once per rank; duplicates elided]
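+The TypedStorage warning points at PyTorch's storage-API migration; a minimal sketch of the replacement call (toy tensor):
+
+    import torch
+
+    t = torch.zeros(4)
+    s = t.untyped_storage()  # replaces the deprecated t.storage()
+    print(s.nbytes())        # byte size of the underlying storage
+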
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+[the four loading lines above repeat once per rank; duplicates elided]
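+"695/699 keys" is a non-strict state_dict load: the four missing keys (apparently the mask embeddings the inpainting model adds on top of the base transformer) are left at their fresh initialization. A toy sketch of the mechanism (real code would load the .safetensors file, e.g. via safetensors.torch.load_file):
+
+    import torch
+
+    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 4))
+    state_dict = model.state_dict()
+    del state_dict["1.weight"]  # simulate a parameter absent from the checkpoint
+
+    result = model.load_state_dict(state_dict, strict=False)
+    print(result.missing_keys)     # ['1.weight'] -- stays randomly initialized
+    print(result.unexpected_keys)  # []
+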
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
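+The sentencepiece warning comes from converting mt5's slow tokenizer to a fast one, which drops byte-fallback. If exact byte-fallback behaviour matters, the slow implementation can be requested explicitly (a sketch, using the google/mt5-xxl checkpoint named in the args above):
+
+    from transformers import AutoTokenizer
+
+    # keep the original sentencepiece tokenizer instead of the converted fast one
+    tokenizer = AutoTokenizer.from_pretrained("google/mt5-xxl", use_fast=False)
+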
+[interleaved tqdm output from all ranks begins here: per-rank 0/1 dataset bars and a 0/478625 index-build bar, mixed with repeats of the sentencepiece tokenizer warning and the missing_keys / Successfully load lines; elided]
+09/18/2024 09:21:50 - INFO - __main__ - optimizer: AdamW (
+Parameter Group 0
+ amsgrad: False
+ betas: (0.9, 0.999)
+ capturable: False
+ differentiable: False
+ eps: 1e-08
+ foreach: False
+ fused: None
+ lr: 1e-05
+ maximize: False
+ weight_decay: 0.01
+)
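+The same optimizer, constructed directly with the hyperparameters from the dump (a sketch; the Linear is a stand-in for the transformer):
+
+    import torch
+
+    model = torch.nn.Linear(4, 4)  # stand-in for the OpenSoraInpaint transformer
+    optimizer = torch.optim.AdamW(
+        model.parameters(),
+        lr=1e-5,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0.01,
+    )
+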
+
+
3%|▎ | 16652/478625 [00:00<00:14, 32314.62it/s][A
+
4%|▍ | 20016/478625 [00:00<00:14, 32753.36it/s][A
+
5%|▍ | 23296/478625 [00:00<00:14, 32072.75it/s][A
+
6%|▌ | 26656/478625 [00:00<00:13, 32543.95it/s][A
+
6%|▋ | 30015/478625 [00:00<00:13, 32864.76it/s][A
+
7%|▋ | 33305/478625 [00:01<00:13, 32308.03it/s][A
+
8%|▊ | 36657/478625 [00:01<00:13, 32670.60it/s][AYou are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
+
+
8%|▊ | 39928/478625 [00:01<00:13, 32029.45it/s][A
+
9%|▉ | 43269/478625 [00:01<00:13, 32434.58it/s][Amissing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+
+
0%| | 0/478625 [00:00, ?it/s][A
+
10%|▉ | 46615/478625 [00:01<00:13, 32736.24it/s][Amissing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+
+
1%| | 3242/478625 [00:00<00:14, 32415.39it/s][ASuccessfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+
+
10%|█ | 49892/478625 [00:01<00:13, 32138.22it/s][A
+
1%|▏ | 6484/478625 [00:00<00:14, 32001.52it/s][A
+
11%|█ | 53250/478625 [00:01<00:13, 32559.45it/s][A
+
2%|▏ | 9825/478625 [00:00<00:14, 32637.24it/s][A
+
12%|█▏ | 56605/478625 [00:01<00:12, 32851.23it/s][A
+
3%|▎ | 13159/478625 [00:00<00:14, 32911.98it/s][A
+
13%|█▎ | 59894/478625 [00:01<00:12, 32253.01it/s][A
+
3%|▎ | 16451/478625 [00:00<00:14, 32094.54it/s][A
+
13%|█▎ | 63198/478625 [00:01<00:12, 32482.61it/s][A
+
4%|▍ | 19778/478625 [00:00<00:14, 32486.25it/s][A
+
14%|█▍ | 66450/478625 [00:02<00:12, 32050.66it/s][A
+
5%|▍ | 23030/478625 [00:00<00:14, 31845.24it/s][A
+
15%|█▍ | 69815/478625 [00:02<00:12, 32518.56it/s][A
+
6%|▌ | 26356/478625 [00:00<00:14, 32283.23it/s][A
+
15%|█▌ | 73137/478625 [00:02<00:12, 32725.11it/s][A
+
6%|▌ | 29681/478625 [00:00<00:13, 32579.67it/s][A
+
16%|█▌ | 76412/478625 [00:02<00:12, 32211.11it/s][A
+
7%|▋ | 32942/478625 [00:01<00:13, 31951.33it/s][A
+
17%|█▋ | 79760/478625 [00:02<00:12, 32583.64it/s][A
+
8%|▊ | 36277/478625 [00:01<00:13, 32339.05it/s][A
+
17%|█▋ | 83022/478625 [00:02<00:12, 32098.99it/s][A
+
8%|▊ | 39602/478625 [00:01<00:13, 32610.44it/s][A
+
18%|█▊ | 86235/478625 [00:02<00:12, 31944.62it/s][A
+
9%|▉ | 42866/478625 [00:01<00:13, 32018.18it/s][A
+
19%|█▊ | 89611/478625 [00:02<00:11, 32477.31it/s][A
+
10%|▉ | 46133/478625 [00:01<00:13, 32208.98it/s][A
+
19%|█▉ | 92862/478625 [00:02<00:12, 31911.15it/s][A
+
10%|█ | 49357/478625 [00:01<00:13, 31570.68it/s][A
+
20%|██ | 96207/478625 [00:02<00:11, 32360.37it/s][A
+
11%|█ | 52652/478625 [00:01<00:13, 31973.27it/s][A
+
21%|██ | 99447/478625 [00:03<00:11, 31960.59it/s][A
+
12%|█▏ | 55972/478625 [00:01<00:13, 32289.94it/s][A
+
21%|██▏ | 102762/478625 [00:03<00:11, 32308.65it/s][A
+
12%|█▏ | 59205/478625 [00:01<00:13, 31785.21it/s][A
+
22%|██▏ | 106090/478625 [00:03<00:11, 32594.41it/s][A
+
13%|█▎ | 62518/478625 [00:01<00:12, 32179.67it/s][A
+
23%|██▎ | 109352/478625 [00:03<00:11, 32115.41it/s][A
+
14%|█▎ | 65740/478625 [00:02<00:13, 31697.48it/s][A
+
24%|██▎ | 112567/478625 [00:03<00:11, 31992.93it/s][A
+
14%|█▍ | 68966/478625 [00:02<00:12, 31861.22it/s][A
+
24%|██▍ | 115769/478625 [00:03<00:11, 31615.90it/s][A
+
15%|█▌ | 72218/478625 [00:02<00:12, 32055.32it/s][A
+
25%|██▍ | 119113/478625 [00:03<00:11, 32150.70it/s][A
+
16%|█▌ | 75426/478625 [00:02<00:12, 31160.66it/s][A
+
26%|██▌ | 122458/478625 [00:03<00:10, 32534.27it/s][A
+
16%|█▋ | 78549/478625 [00:02<00:12, 30823.73it/s][A
+
26%|██▋ | 125714/478625 [00:03<00:11, 32019.19it/s][A
+
17%|█▋ | 81879/478625 [00:02<00:12, 31544.96it/s][A
+
27%|██▋ | 129056/478625 [00:03<00:10, 32430.07it/s][A
+
18%|█▊ | 85039/478625 [00:02<00:12, 31142.54it/s][A
+
28%|██▊ | 132434/478625 [00:04<00:10, 32827.19it/s][A
+
18%|█▊ | 88350/478625 [00:02<00:12, 31717.70it/s][A
+
28%|██▊ | 135720/478625 [00:04<00:10, 31803.13it/s][A
+
19%|█▉ | 91526/478625 [00:02<00:12, 31173.46it/s][A
+
29%|██▉ | 139057/478625 [00:04<00:10, 32257.06it/s][A
+
20%|█▉ | 94801/478625 [00:02<00:12, 31634.71it/s][A
+
30%|██▉ | 142290/478625 [00:04<00:10, 31913.48it/s][A
+
21%|██ | 98145/478625 [00:03<00:11, 32165.67it/s][A
+
30%|███ | 145603/478625 [00:04<00:10, 32268.07it/s][A
+
21%|██ | 101366/478625 [00:03<00:11, 31535.09it/s][A
+
31%|███ | 148961/478625 [00:04<00:10, 32653.69it/s][A
+
22%|██▏ | 104539/478625 [00:03<00:11, 31588.91it/s][A
+
32%|███▏ | 152230/478625 [00:04<00:10, 32179.16it/s][A
+
23%|██▎ | 107702/478625 [00:03<00:11, 30991.31it/s][A
+
33%|███▎ | 155580/478625 [00:04<00:09, 32566.52it/s][A
+
23%|██▎ | 111006/478625 [00:03<00:11, 31589.99it/s][A
+
33%|███▎ | 158840/478625 [00:04<00:09, 32055.00it/s][A
+
24%|██▍ | 114297/478625 [00:03<00:11, 31978.19it/s][A
+
34%|███▍ | 162050/478625 [00:05<00:09, 31871.63it/s][A
+
25%|██▍ | 117499/478625 [00:03<00:11, 31387.67it/s][A
+
35%|███▍ | 165409/478625 [00:05<00:09, 32375.03it/s][A
+
25%|██▌ | 120643/478625 [00:03<00:11, 31284.55it/s][A
+
35%|███▌ | 168650/478625 [00:05<00:09, 31894.89it/s][A
+
26%|██▌ | 123861/478625 [00:03<00:11, 31545.84it/s][A
+
36%|███▌ | 171997/478625 [00:05<00:09, 32355.77it/s][A
+
27%|██▋ | 127019/478625 [00:03<00:11, 31012.76it/s][A
+
37%|███▋ | 175236/478625 [00:05<00:09, 31842.38it/s][A
+
27%|██▋ | 130313/478625 [00:04<00:11, 31575.45it/s][A
+
37%|███▋ | 178581/478625 [00:05<00:09, 32313.28it/s][A
+
28%|██▊ | 133474/478625 [00:04<00:11, 31346.83it/s][A
+
38%|███▊ | 181877/478625 [00:05<00:09, 32471.10it/s][A
+
29%|██▊ | 136757/478625 [00:04<00:10, 31782.45it/s][A
+
39%|███▊ | 185127/478625 [00:05<00:09, 31521.19it/s][A
+
29%|██▉ | 140069/478625 [00:04<00:10, 32178.02it/s][A
+
39%|███▉ | 188463/478625 [00:05<00:09, 32056.32it/s][A
+
30%|██▉ | 143289/478625 [00:04<00:10, 31509.97it/s][A
+
40%|████ | 191708/478625 [00:05<00:09, 31720.56it/s][A
+
31%|███ | 146578/478625 [00:04<00:10, 31914.48it/s][A
+
41%|████ | 195071/478625 [00:06<00:08, 32278.79it/s][A
+
31%|███▏ | 149774/478625 [00:04<00:10, 31542.39it/s][A
+
41%|████▏ | 198420/478625 [00:06<00:08, 32633.75it/s][A
+
32%|███▏ | 153036/478625 [00:04<00:10, 31858.75it/s][A
+
42%|████▏ | 201688/478625 [00:06<00:08, 31989.90it/s][A
+
33%|███▎ | 156328/478625 [00:04<00:10, 32169.45it/s][A
+
43%|████▎ | 204988/478625 [00:06<00:08, 32282.98it/s][A
+
33%|███▎ | 159548/478625 [00:05<00:10, 31619.17it/s][A
+
44%|████▎ | 208347/478625 [00:06<00:08, 32666.72it/s][A
+
34%|███▍ | 162801/478625 [00:05<00:09, 31886.72it/s][A
+
44%|████▍ | 211618/478625 [00:06<00:08, 31654.53it/s][A
+
35%|███▍ | 166011/478625 [00:05<00:09, 31946.94it/s][A
+
45%|████▍ | 214961/478625 [00:06<00:08, 32168.65it/s][A
+
35%|███▌ | 169208/478625 [00:05<00:09, 31401.70it/s][A
+
46%|████▌ | 218186/478625 [00:06<00:08, 31853.57it/s][A
+
36%|███▌ | 172513/478625 [00:05<00:09, 31886.09it/s][A
+
46%|████▋ | 221535/478625 [00:06<00:07, 32331.10it/s][A
+
37%|███▋ | 175705/478625 [00:05<00:09, 31168.17it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
47%|████▋ | 224900/478625 [00:06<00:07, 32716.22it/s][A
+
37%|███▋ | 179000/478625 [00:05<00:09, 31687.05it/s][A
+
48%|████▊ | 228176/478625 [00:07<00:07, 32156.88it/s][A
+
38%|███▊ | 182174/478625 [00:05<00:09, 31685.28it/s][A
+
48%|████▊ | 231435/478625 [00:07<00:07, 32281.18it/s][A
+
39%|███▊ | 185346/478625 [00:05<00:09, 31291.95it/s][A
+
49%|████▉ | 234667/478625 [00:07<00:07, 31849.71it/s][A
+
39%|███▉ | 188660/478625 [00:05<00:09, 31834.92it/s][A
+
50%|████▉ | 237862/478625 [00:07<00:07, 31876.80it/s][A
+
40%|████ | 191847/478625 [00:06<00:09, 30996.10it/s][A
+
50%|█████ | 241213/478625 [00:07<00:07, 32356.57it/s][A
+
41%|████ | 195096/478625 [00:06<00:09, 31431.09it/s][A
+
51%|█████ | 244451/478625 [00:07<00:07, 31940.87it/s][A
+
41%|████▏ | 198372/478625 [00:06<00:08, 31819.35it/s][A
+
52%|█████▏ | 247799/478625 [00:07<00:07, 32394.26it/s][A
+
42%|████▏ | 201559/478625 [00:06<00:08, 30921.71it/s][A
+
52%|█████▏ | 251041/478625 [00:07<00:07, 31898.86it/s][A
+
43%|████▎ | 204659/478625 [00:06<00:08, 30922.94it/s][A
+
53%|█████▎ | 254381/478625 [00:07<00:06, 32339.95it/s][A
+
43%|████▎ | 207941/478625 [00:06<00:08, 31477.86it/s][A
+
54%|█████▍ | 257716/478625 [00:07<00:06, 32637.94it/s][A
+
44%|████▍ | 211094/478625 [00:06<00:08, 31216.69it/s][A
+
55%|█████▍ | 260983/478625 [00:08<00:06, 31368.43it/s][A
+
45%|████▍ | 214305/478625 [00:06<00:08, 31478.92it/s][A
+
55%|█████▌ | 264314/478625 [00:08<00:06, 31927.83it/s][A
+
45%|████▌ | 217456/478625 [00:06<00:08, 31043.91it/s][A
+
56%|█████▌ | 267670/478625 [00:08<00:06, 32403.74it/s][A
+
46%|████▌ | 220754/478625 [00:06<00:08, 31611.15it/s][A
+
57%|█████▋ | 270919/478625 [00:08<00:06, 31895.20it/s][A
+
47%|████▋ | 223951/478625 [00:07<00:08, 31715.92it/s][A
+
57%|█████▋ | 274256/478625 [00:08<00:06, 32324.88it/s][A
+
47%|████▋ | 227125/478625 [00:07<00:08, 31267.55it/s][A
+
58%|█████▊ | 277495/478625 [00:08<00:06, 31851.73it/s][A
+
48%|████▊ | 230255/478625 [00:07<00:07, 31249.76it/s][A
0%| | 0/1 [00:00, ?it/s]09/18/2024 09:21:59 - INFO - opensora.dataset.t2v_datasets - Building /home/image_data/captions/TV01_clips_final_478625_llavanext_217405_aes478625.json...
+
+
59%|█████▊ | 280827/478625 [00:08<00:06, 32280.03it/s][A
+
49%|████▉ | 233446/478625 [00:07<00:07, 31443.10it/s][A
+
59%|█████▉ | 284178/478625 [00:08<00:05, 32640.81it/s][A
+
49%|████▉ | 236592/478625 [00:07<00:07, 31134.63it/s][A
+
60%|██████ | 287446/478625 [00:08<00:06, 31543.43it/s][A
+
50%|█████ | 239871/478625 [00:07<00:07, 31623.74it/s][A
+
61%|██████ | 290797/478625 [00:09<00:05, 32111.67it/s][A
+
51%|█████ | 243036/478625 [00:07<00:07, 31051.11it/s][A
+
61%|██████▏ | 294017/478625 [00:09<00:05, 31531.88it/s][A
+
51%|█████▏ | 246228/478625 [00:07<00:07, 31303.06it/s][A
+
62%|██████▏ | 297378/478625 [00:09<00:05, 32135.27it/s][A
+
52%|█████▏ | 249503/478625 [00:07<00:07, 31729.58it/s][A
+
63%|██████▎ | 300751/478625 [00:09<00:05, 32566.44it/s][A
+
53%|█████▎ | 252679/478625 [00:07<00:07, 30944.63it/s][A
+
64%|██████▎ | 304014/478625 [00:09<00:05, 32136.06it/s][A
+
53%|█████▎ | 255973/478625 [00:08<00:07, 31525.22it/s][A
+
64%|██████▍ | 307349/478625 [00:09<00:05, 32491.86it/s][A
+
54%|█████▍ | 259131/478625 [00:08<00:07, 31214.74it/s][A
+
65%|██████▍ | 310603/478625 [00:09<00:05, 32026.74it/s][A
+
55%|█████▍ | 262257/478625 [00:08<00:07, 30665.74it/s][A
+
66%|██████▌ | 313810/478625 [00:09<00:05, 31961.58it/s][A
+
55%|█████▌ | 265549/478625 [00:08<00:06, 31323.44it/s][A
+
66%|██████▋ | 317149/478625 [00:09<00:04, 32382.09it/s][A
+
56%|█████▌ | 268686/478625 [00:08<00:06, 30613.65it/s][A
+
67%|██████▋ | 320390/478625 [00:09<00:04, 31959.77it/s][A
+
57%|█████▋ | 271946/478625 [00:08<00:06, 31189.88it/s][A
+
68%|██████▊ | 323741/478625 [00:10<00:04, 32413.50it/s][A
+
58%|█████▊ | 275248/478625 [00:08<00:06, 31726.16it/s][A
+
68%|██████▊ | 326985/478625 [00:10<00:04, 31880.71it/s][A
+
58%|█████▊ | 278426/478625 [00:08<00:06, 30654.94it/s][A
+
69%|██████▉ | 330262/478625 [00:10<00:04, 32140.55it/s][A
+
59%|█████▉ | 281718/478625 [00:08<00:06, 31308.76it/s][A
+
70%|██████▉ | 333606/478625 [00:10<00:04, 32522.30it/s][A
+
60%|█████▉ | 284859/478625 [00:09<00:06, 30544.57it/s][A
+
70%|███████ | 336861/478625 [00:10<00:04, 31472.19it/s][A
+
60%|██████ | 288137/478625 [00:09<00:06, 31189.40it/s][A
+
71%|███████ | 340211/478625 [00:10<00:04, 32061.00it/s][A
+
61%|██████ | 291391/478625 [00:09<00:05, 31582.67it/s][A
+
72%|███████▏ | 343562/478625 [00:10<00:04, 32485.83it/s][A
+
62%|██████▏ | 294557/478625 [00:09<00:05, 30729.15it/s][A
+
72%|███████▏ | 346817/478625 [00:10<00:04, 31977.00it/s][A
+
62%|██████▏ | 297848/478625 [00:09<00:05, 31361.47it/s][A
+
73%|███████▎ | 350161/478625 [00:10<00:03, 32402.98it/s][A
+
63%|██████▎ | 300993/478625 [00:09<00:05, 31376.60it/s][A
+
74%|███████▍ | 353407/478625 [00:10<00:03, 31943.61it/s][A
+
64%|██████▎ | 304137/478625 [00:09<00:05, 31070.91it/s][A
+
75%|███████▍ | 356752/478625 [00:11<00:03, 32383.35it/s][A
+
64%|██████▍ | 307286/478625 [00:09<00:05, 31193.31it/s][A
+
75%|███████▌ | 360095/478625 [00:11<00:03, 32691.27it/s][A
+
65%|██████▍ | 310409/478625 [00:09<00:05, 30994.36it/s][A
+
76%|███████▌ | 363368/478625 [00:11<00:03, 31606.18it/s][A
+
66%|██████▌ | 313675/478625 [00:09<00:05, 31485.71it/s][A
+
77%|███████▋ | 366695/478625 [00:11<00:03, 32088.52it/s][A
+
66%|██████▌ | 316957/478625 [00:10<00:05, 31878.96it/s][A
+
77%|███████▋ | 369912/478625 [00:11<00:03, 31460.33it/s][A
+
67%|██████▋ | 320147/478625 [00:10<00:05, 31043.94it/s][A
+
78%|███████▊ | 373270/478625 [00:11<00:03, 32076.73it/s][A
+
68%|██████▊ | 323309/478625 [00:10<00:04, 31210.95it/s][A
+
79%|███████▊ | 376631/478625 [00:11<00:03, 32525.33it/s][A
+
68%|██████▊ | 326435/478625 [00:10<00:04, 31199.16it/s][A
+
79%|███████▉ | 379890/478625 [00:11<00:03, 32017.36it/s][A
+
69%|██████▉ | 329558/478625 [00:10<00:04, 30601.69it/s][A
+
80%|████████ | 383238/478625 [00:11<00:02, 32444.04it/s][A
+
69%|██████▉ | 332639/478625 [00:10<00:04, 30659.97it/s][A
+
81%|████████ | 386487/478625 [00:12<00:02, 31536.50it/s][A
+
70%|███████ | 335708/478625 [00:10<00:04, 30209.12it/s][A
+
81%|████████▏ | 389828/478625 [00:12<00:02, 32079.97it/s][A
+
71%|███████ | 338817/478625 [00:10<00:04, 30465.98it/s][A
+
82%|████████▏ | 393194/478625 [00:12<00:02, 32541.46it/s][A
+
71%|███████▏ | 342011/478625 [00:10<00:04, 30901.01it/s][A
+
83%|████████▎ | 396455/478625 [00:12<00:02, 32010.26it/s][A
+
72%|███████▏ | 345104/478625 [00:10<00:04, 30659.12it/s][A
+
84%|████████▎ | 399831/478625 [00:12<00:02, 32521.58it/s][A
+
73%|███████▎ | 348410/478625 [00:11<00:04, 31369.02it/s][A
+
84%|████████▍ | 403089/478625 [00:12<00:02, 32066.34it/s][A
+
73%|███████▎ | 351723/478625 [00:11<00:03, 31890.47it/s][A
+
85%|████████▍ | 406434/478625 [00:12<00:02, 32469.45it/s][A
+
74%|███████▍ | 354915/478625 [00:11<00:03, 31353.59it/s][A
+
86%|████████▌ | 409790/478625 [00:12<00:02, 32791.05it/s][A
+
75%|███████▍ | 358201/478625 [00:11<00:03, 31797.39it/s][A
+
86%|████████▋ | 413073/478625 [00:12<00:02, 31693.39it/s][A
+
76%|███████▌ | 361384/478625 [00:11<00:03, 31412.41it/s][A
+
87%|████████▋ | 416320/478625 [00:12<00:01, 31917.65it/s][A
+
76%|███████▌ | 364671/478625 [00:11<00:03, 31840.12it/s][A
+
88%|████████▊ | 419589/478625 [00:13<00:01, 31669.49it/s][A
+
77%|███████▋ | 367894/478625 [00:11<00:03, 31954.66it/s][A
+
88%|████████▊ | 422936/478625 [00:13<00:01, 32194.52it/s][A
+
78%|███████▊ | 371092/478625 [00:11<00:03, 30700.23it/s][A
+
89%|████████▉ | 426275/478625 [00:13<00:01, 32544.24it/s][A
+
78%|███████▊ | 374174/478625 [00:11<00:03, 30482.52it/s][A
+
90%|████████▉ | 429534/478625 [00:13<00:01, 32034.98it/s][A
+
79%|███████▉ | 377230/478625 [00:11<00:03, 30486.80it/s][A
+
90%|█████████ | 432893/478625 [00:13<00:01, 32490.72it/s][A
+
79%|███████▉ | 380284/478625 [00:12<00:03, 30304.33it/s][A
+
91%|█████████ | 436250/478625 [00:13<00:01, 32808.81it/s][A
+
80%|████████ | 383605/478625 [00:12<00:03, 31158.50it/s][A
+
92%|█████████▏| 439535/478625 [00:13<00:01, 31708.19it/s][A
+
81%|████████ | 386726/478625 [00:12<00:02, 30686.61it/s][A
+
93%|█████████▎| 442881/478625 [00:13<00:01, 32216.66it/s][A
+
81%|████████▏ | 389853/478625 [00:12<00:02, 30854.68it/s][A
+
93%|█████████▎| 446111/478625 [00:13<00:01, 31836.10it/s][A
+
82%|████████▏ | 393135/478625 [00:12<00:02, 31432.92it/s][A
+
94%|█████████▍| 449461/478625 [00:13<00:00, 32321.36it/s][A
+
83%|████████▎ | 396282/478625 [00:12<00:02, 30768.70it/s][A
+
95%|█████████▍| 452798/478625 [00:14<00:00, 32628.47it/s][A
+
83%|████████▎ | 399527/478625 [00:12<00:02, 31261.17it/s][A
+
95%|█████████▌| 456066/478625 [00:14<00:00, 32120.56it/s][A
+
84%|████████▍ | 402711/478625 [00:12<00:02, 31430.12it/s][A
+
96%|█████████▌| 459418/478625 [00:14<00:00, 32529.72it/s][A
+
85%|████████▍ | 405858/478625 [00:12<00:02, 31006.97it/s][A
+
97%|█████████▋| 462675/478625 [00:14<00:00, 31549.41it/s][A
+
85%|████████▌ | 408962/478625 [00:13<00:02, 30956.26it/s][A
+
97%|█████████▋| 465985/478625 [00:14<00:00, 31998.13it/s][A
+
86%|████████▌ | 412060/478625 [00:13<00:02, 30467.84it/s][A
+
98%|█████████▊| 469213/478625 [00:14<00:00, 32077.74it/s][A
+
87%|████████▋ | 415218/478625 [00:13<00:02, 30792.23it/s][A
+
99%|█████████▊| 472426/478625 [00:14<00:00, 31672.69it/s][A
+
87%|████████▋ | 418478/478625 [00:13<00:01, 31324.04it/s][A
+
99%|█████████▉| 475804/478625 [00:14<00:00, 32289.67it/s][A
+
88%|████████▊ | 421613/478625 [00:13<00:01, 31063.68it/s][A
100%|██████████| 478625/478625 [00:14<00:00, 32209.49it/s]
+
100%|██████████| 1/1 [00:21<00:00, 21.25s/it]
100%|██████████| 1/1 [00:21<00:00, 21.25s/it]
+
+
89%|████████▊ | 424722/478625 [00:13<00:01, 31012.43it/s][A
+
89%|████████▉ | 427993/478625 [00:13<00:01, 31515.50it/s][A
+
90%|█████████ | 431147/478625 [00:13<00:01, 31103.81it/s][A
+
91%|█████████ | 434260/478625 [00:13<00:01, 30986.05it/s][A
+
91%|█████████▏| 437360/478625 [00:13<00:01, 30616.80it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
92%|█████████▏| 440648/478625 [00:14<00:01, 31282.88it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
93%|█████████▎| 443870/478625 [00:14<00:01, 31557.51it/s][Atime 21.929378032684326
+
+
1%| | 3171/478625 [00:00<00:15, 31615.36it/s][An_elements: 474899
+data length: 474899
+
+
93%|█████████▎| 447028/478625 [00:14<00:01, 31219.23it/s][A
+
1%|▏ | 6333/478625 [00:00<00:15, 30991.04it/s][A
+
94%|█████████▍| 450152/478625 [00:14<00:00, 31091.51it/s][A
+
2%|▏ | 9624/478625 [00:00<00:14, 31855.15it/s][A
+
95%|█████████▍| 453387/478625 [00:14<00:00, 30930.23it/s][A
+
3%|▎ | 12958/478625 [00:00<00:14, 32435.59it/s][A
+
95%|█████████▌| 456652/478625 [00:14<00:00, 31435.02it/s][A
+
3%|▎ | 16203/478625 [00:00<00:14, 31692.30it/s][A
+
96%|█████████▌| 459928/478625 [00:14<00:00, 31824.25it/s][A
+
4%|▍ | 19505/478625 [00:00<00:14, 32133.41it/s][A
+
97%|█████████▋| 463113/478625 [00:14<00:00, 30891.58it/s][A
+
5%|▍ | 22831/478625 [00:00<00:14, 32494.94it/s][A
+
97%|█████████▋| 466377/478625 [00:14<00:00, 31399.31it/s][A
+
5%|▌ | 26084/478625 [00:00<00:14, 31839.65it/s][A
+
98%|█████████▊| 469523/478625 [00:14<00:00, 31159.09it/s][A
+
6%|▌ | 29378/478625 [00:00<00:13, 32173.60it/s][A
+
99%|█████████▉| 472644/478625 [00:15<00:00, 30825.48it/s][A
+
7%|▋ | 32599/478625 [00:01<00:14, 31626.46it/s][A
+
99%|█████████▉| 475730/478625 [00:15<00:00, 30736.41it/s][A
+
7%|▋ | 35877/478625 [00:01<00:13, 31970.25it/s][A
100%|██████████| 478625/478625 [00:15<00:00, 31385.08it/s]
+
100%|██████████| 1/1 [00:21<00:00, 21.55s/it]
100%|██████████| 1/1 [00:21<00:00, 21.55s/it]
+
+
8%|▊ | 39145/478625 [00:01<00:13, 32181.88it/s][A
+
9%|▉ | 42366/478625 [00:01<00:13, 31509.97it/s][A
+
10%|▉ | 45647/478625 [00:01<00:13, 31891.06it/s][A
0%| | 0/1 [00:00, ?it/s]
+
10%|█ | 48840/478625 [00:01<00:13, 31424.56it/s][A
+
11%|█ | 52107/478625 [00:01<00:13, 31790.13it/s][A
+
12%|█▏ | 55386/478625 [00:01<00:13, 32083.05it/s][A
+
12%|█▏ | 58598/478625 [00:01<00:13, 31543.61it/s][Atime 22.218870162963867
+n_elements: 474899
+data length: 474899
+
+
13%|█▎ | 61896/478625 [00:01<00:13, 31965.54it/s][A
+
14%|█▎ | 65096/478625 [00:02<00:13, 31356.22it/s][A
+
14%|█▍ | 68399/478625 [00:02<00:12, 31845.66it/s][A
+
15%|█▍ | 71664/478625 [00:02<00:12, 32081.48it/s][A
+
16%|█▌ | 74876/478625 [00:02<00:12, 31468.39it/s][A
+
16%|█▋ | 78151/478625 [00:02<00:12, 31842.51it/s][A
+
17%|█▋ | 81379/478625 [00:02<00:12, 31970.93it/s][A
+
18%|█▊ | 84579/478625 [00:02<00:12, 31284.52it/s][A
+
18%|█▊ | 87846/478625 [00:02<00:12, 31689.79it/s][A
+
19%|█▉ | 91020/478625 [00:02<00:12, 31147.71it/s][A
+
20%|█▉ | 94283/478625 [00:02<00:12, 31579.98it/s][A
+
20%|██ | 97556/478625 [00:03<00:11, 31918.48it/s][A
+
21%|██ | 100752/478625 [00:03<00:12, 31397.47it/s][A
+
22%|██▏ | 103984/478625 [00:03<00:11, 31668.07it/s][A
+
22%|██▏ | 107277/478625 [00:03<00:11, 32040.93it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+[interleaved tqdm output from the remaining ranks trimmed: the sentencepiece byte-fallback UserWarning above recurs once per rank, and each index pass again completes in ~15 s (epoch bar 1/1 in 21.88 s)]
+09/18/2024 09:22:21 - INFO - opensora.dataset.t2v_datasets - no_cap: 0, too_long: 3711, too_short: 2, no_resolution: 0, resolution_mismatch: 0, Counter(sample_size): Counter({'93x160x320': 84930, '29x160x320': 73201, '45x160x320': 68295, '61x160x320': 44578, '77x160x320': 38630, '93x128x320': 17805, '29x128x320': 16948, '93x224x320': 16403, '93x192x320': 15259, '45x128x320': 14788, '61x128x320': 9795, '29x224x320': 8615, '29x192x320': 8528, '45x224x320': 8477, '45x192x320': 8309, '77x128x320': 7730, '61x224x320': 6211, '61x192x320': 5983, '77x224x320': 5788, '77x192x320': 5268, '93x256x320': 3164, '45x256x320': 1510, '29x256x320': 1480, '61x256x320': 1152, '77x256x320': 1090, '93x96x320': 282, '45x96x320': 200, '29x96x320': 169, '61x96x320': 163, '77x96x320': 148}), cnt_movie: 0, cnt_img: 0, before filter: 478625, after filter: 474899
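
The counter above assigns every clip to a `frames x height x width` bucket and tallies the samples dropped by each caption check. A minimal sketch of how such statistics can be produced — illustrative only, not the repo's `t2v_datasets` code; the sample fields are assumptions:

```python
# Illustrative bucket accounting (not the actual t2v_datasets implementation).
# Each sample is assumed to be a dict with caption and frame/size metadata.
from collections import Counter

def bucket_stats(samples, min_cap=1, max_cap=512):
    buckets, dropped, kept = Counter(), Counter(), []
    for s in samples:
        cap = s.get("cap", "")
        if not cap:
            dropped["no_cap"] += 1
        elif len(cap) > max_cap:
            dropped["too_long"] += 1
        elif len(cap) < min_cap:
            dropped["too_short"] += 1
        else:
            buckets[f"{s['num_frames']}x{s['height']}x{s['width']}"] += 1
            kept.append(s)
    return kept, buckets, dropped
```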
+09/18/2024 09:22:21 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | motion_score: 474899, cnt_no_motion: 13 | 192077 > 0.95, 0.7 > 65730 Mean: 0.8593367888417824, Var: 0.03075349223473551, Std: 0.17536673639757203, Min: -0.0717548280954361, Max: 1.0
+09/18/2024 09:22:21 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | aesthetic_score: 478625, cnt_no_aesthetic: 0 | 14374 > 5.75, 4.5 > 113830 Mean: 4.846693657797633, Var: 0.24147353645946146, Std: 0.4913995690468821, Min: 2.685077953338623, Max: 6.742257436116536
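
Both filter reports follow the same pattern: count clips above a high threshold and below a low one, then summarize the score distribution. A hedged sketch, with the thresholds copied from the two log lines above:

```python
# Sketch of the motion/aesthetic score summaries; numpy stands in for
# whatever the dataset code actually uses internally.
import numpy as np

def score_summary(scores, hi, lo):
    s = np.asarray(scores, dtype=np.float64)
    return {
        "n": s.size,
        "above_hi": int((s > hi).sum()),  # e.g. 192077 > 0.95 for motion
        "below_lo": int((s < lo).sum()),  # e.g. 0.7 > 65730 for motion
        "mean": s.mean(), "var": s.var(), "std": s.std(),
        "min": s.min(), "max": s.max(),
    }

# score_summary(motion_scores, hi=0.95, lo=0.7)
# score_summary(aesthetic_scores, hi=5.75, lo=4.5)
```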
+time 22.639402389526367
+n_elements: 474899
+data length: 474899
+09/18/2024 09:22:22 - INFO - __main__ - after train_dataloader
+09/18/2024 09:22:22 - INFO - __main__ - before accelerator.prepare
+[2024-09-18 09:22:22,652] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.6, git-hash=unknown, git-branch=unknown
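
The `before accelerator.prepare` marker is where Accelerate hands the training objects to the DeepSpeed engine reported on the next line. A toy, self-contained sketch of that step — the model and optimizer here are stand-ins, not the repo's:

```python
# Toy stand-ins only: with a DeepSpeed launch config, prepare() builds the
# DeepSpeedEngine whose initialization messages follow in this log.
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model, optimizer = accelerator.prepare(model, optimizer)
```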
+[interleaved tqdm output trimmed: index passes from the remaining ranks run at ~30-32k it/s and their 1/1 epoch bars finish in 21.9-23.0 s]
+time 24.2894606590271
+n_elements: 474899
+data length: 474899
+time 22.66064691543579
+n_elements: 474899
+data length: 474899
+time 23.071712017059326
+n_elements: 474899
+data length: 474899
+time 22.229705810546875
+n_elements: 474899
+data length: 474899
+time 22.814438343048096
+n_elements: 474899
+data length: 474899
+[2024-09-18 09:22:58,771] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2024-09-18 09:22:58,780] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2024-09-18 09:22:58,780] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+[2024-09-18 09:22:58,932] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
+[2024-09-18 09:22:58,932] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class '...NewCls'>
+zp rank is 3, zp_size=8
+[2024-09-18 09:22:58,932] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
+[2024-09-18 09:22:58,932] [INFO] [stage_1_and_2.py:173:__init__] Reduce bucket size 536870912
+zp rank is 6, zp_size=8
+[2024-09-18 09:22:58,932] [INFO] [stage_1_and_2.py:174:__init__] Allgather bucket size 536870912
+[2024-09-18 09:22:58,932] [INFO] [stage_1_and_2.py:175:__init__] CPU Offload: False
+zp rank is 5, zp_size=8
+[2024-09-18 09:22:58,932] [INFO] [stage_1_and_2.py:176:__init__] Round robin gradient partitioning: False
+zp rank is 2, zp_size=8
+zp rank is 0, zp_size=8
+zp rank is 4, zp_size=8
+zp rank is 1, zp_size=8
+zp rank is 7, zp_size=8
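
The messages above pin down the ZeRO setup: a bf16 stage-2 optimizer with 512 MiB reduce/allgather buckets, no CPU offload, and no round-robin gradient partitioning. A DeepSpeed config fragment consistent with those values would look roughly as follows (reconstructed from the log, not taken from the repo's config file):

```python
# Reconstructed from the logged values; the micro-batch size is an assumption,
# since this part of the log does not print it.
ds_config = {
    "bf16": {"enabled": True},              # "Creating torch.bfloat16 ZeRO stage 2 optimizer"
    "zero_optimization": {
        "stage": 2,
        "reduce_bucket_size": 536870912,    # "Reduce bucket size 536870912"
        "allgather_bucket_size": 536870912, # "Allgather bucket size 536870912"
        "offload_optimizer": {"device": "none"},  # "CPU Offload: False"
        "round_robin_gradients": False,     # "Round robin gradient partitioning: False"
    },
    "train_micro_batch_size_per_gpu": 1,    # assumption
}
```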
+[2024-09-18 09:23:05,463] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:06,091] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
+[2024-09-18 09:23:06,093] [INFO] [utils.py:792:see_memory_usage] MA 17.78 GB Max_MA 18.44 GB CA 18.78 GB Max_CA 19 GB
+[2024-09-18 09:23:06,093] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 235.17 GB, percent = 15.6%
+[2024-09-18 09:23:06,325] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:08,504] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
+[2024-09-18 09:23:08,505] [INFO] [utils.py:792:see_memory_usage] MA 20.41 GB Max_MA 24.35 GB CA 25.36 GB Max_CA 25 GB
+[2024-09-18 09:23:08,505] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 236.17 GB, percent = 15.6%
+[2024-09-18 09:23:08,506] [INFO] [stage_1_and_2.py:552:__init__] optimizer state initialized
+[2024-09-18 09:23:09,037] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:09,206] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:09,593] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:09,620] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:10,872] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
+[2024-09-18 09:23:10,873] [INFO] [utils.py:792:see_memory_usage] MA 20.41 GB Max_MA 20.41 GB CA 25.36 GB Max_CA 25 GB
+[2024-09-18 09:23:10,873] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 227.46 GB, percent = 15.1%
+[2024-09-18 09:23:10,882] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
+[2024-09-18 09:23:10,882] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
+[2024-09-18 09:23:10,882] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
+[2024-09-18 09:23:10,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[(0.9, 0.999)]
+[2024-09-18 09:23:10,885] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
+[2024-09-18 09:23:10,885] [INFO] [config.py:988:print] activation_checkpointing_config {
+ "partition_activations": false,
+ "contiguous_memory_optimization": false,
+ "cpu_checkpointing": false,
+ "number_checkpoints": null,
+ "synchronize_checkpoint_boundary": false,
+ "profile": false
+}
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] amp_enabled .................. False
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] amp_params ................... False
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] autotuning_config ............ {
+ "enabled": false,
+ "start_step": null,
+ "end_step": null,
+ "metric_path": null,
+ "arg_mappings": null,
+ "metric": "throughput",
+ "model_info": null,
+ "results_dir": "autotuning_results",
+ "exps_dir": "autotuning_exps",
+ "overwrite": true,
+ "fast": true,
+ "start_profile_step": 3,
+ "end_profile_step": 5,
+ "tuner_type": "gridsearch",
+ "tuner_early_stopping": 5,
+ "tuner_num_trials": 50,
+ "model_info_path": null,
+ "mp_size": 1,
+ "max_train_batch_size": null,
+ "min_train_batch_size": 1,
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+ "min_train_micro_batch_size_per_gpu": 1,
+ "num_tuning_micro_batch_sizes": 3
+}
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] bfloat16_enabled ............. True
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] comms_config .................
+[2024-09-18 09:23:10,886] [INFO] [config.py:988:print] communication_data_type ...... torch.float32
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] curriculum_params_legacy ..... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] data_efficiency_enabled ...... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] dataloader_drop_last ......... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] disable_allgather ............ False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] dump_state ................... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_enabled ........... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] eigenvalue_verbose ........... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] elasticity_enabled ........... False
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] flops_profiler_config ........ {
+ "enabled": false,
+ "recompute_fwd_factor": 0.0,
+ "profile_step": 1,
+ "module_depth": -1,
+ "top_modules": 1,
+ "detailed": true,
+ "output_file": null
+}
+[2024-09-18 09:23:10,887] [INFO] [config.py:988:print] fp16_auto_cast ............... None
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] fp16_enabled ................. False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] global_rank .................. 0
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] grad_accum_dtype ............. None
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] gradient_accumulation_steps .. 1
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] gradient_clipping ............ 1.0
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] graph_harvesting ............. False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] load_universal_checkpoint .... False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] loss_scale ................... 1.0
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] memory_breakdown ............. False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] mics_hierarchial_params_gather False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] mics_shard_size .............. -1
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2024-09-18 09:23:10,888] [INFO] [config.py:988:print] nebula_config ................ {
+ "enabled": false,
+ "persistent_storage_path": null,
+ "persistent_time_interval": 100,
+ "num_of_version_in_retention": 2,
+ "enable_nebula_load": true,
+ "load_path": null
+}
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] optimizer_name ............... None
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] optimizer_params ............. None
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] pld_enabled .................. False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] pld_params ................... False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] prescale_gradients ........... False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] scheduler_name ............... None
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] scheduler_params ............. None
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] sparse_attention ............. None
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] steps_per_print .............. inf
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] train_batch_size ............. 8
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 1
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] use_node_local_storage ....... False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] wall_clock_breakdown ......... False
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] weight_quantization_config ... None
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] world_size ................... 8
+[2024-09-18 09:23:10,889] [INFO] [config.py:988:print] zero_allow_untested_optimizer True
+[2024-09-18 09:23:10,890] [INFO] [config.py:988:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=536870912 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=536870912 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2024-09-18 09:23:10,890] [INFO] [config.py:988:print] zero_enabled ................. True
+[2024-09-18 09:23:10,890] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True
+[2024-09-18 09:23:10,890] [INFO] [config.py:988:print] zero_optimization_stage ...... 2
+[2024-09-18 09:23:10,890] [INFO] [config.py:974:print_user_config] json = {
+ "fp16": {
+ "enabled": false,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "communication_data_type": "fp32",
+ "gradient_clipping": 1.0,
+ "train_micro_batch_size_per_gpu": 1,
+ "train_batch_size": 8,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {
+ "stage": 2,
+ "overlap_comm": true,
+ "allgather_bucket_size": 5.368709e+08,
+ "contiguous_gradients": true,
+ "reduce_bucket_size": 5.368709e+08
+ },
+ "steps_per_print": inf,
+ "zero_allow_untested_optimizer": true
+}
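
For reference, the user config printed above can be reproduced as a standalone DeepSpeed config file. A minimal sketch (the file name ds_config.json is an assumption; all values are copied from the log, this is not the training script's own config-generation code):

    import json

    ds_config = {
        "bf16": {"enabled": True},                  # bfloat16_enabled=True above
        "communication_data_type": "fp32",          # gradients reduced in fp32
        "gradient_clipping": 1.0,
        "train_micro_batch_size_per_gpu": 1,
        "train_batch_size": 8,                      # 8 ranks x 1 micro-batch x 1 accumulation step
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "stage": 2,
            "overlap_comm": True,
            "contiguous_gradients": True,
            "allgather_bucket_size": 536870912,     # 512 MiB buckets, as logged
            "reduce_bucket_size": 536870912,
        },
        "zero_allow_untested_optimizer": True,
    }

    with open("ds_config.json", "w") as f:
        json.dump(ds_config, f, indent=2)
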
+09/18/2024 09:23:10 - INFO - __main__ - after accelerator.prepare
+09/18/2024 09:23:12 - INFO - __main__ - init trackers...
+[2024-09-18 09:23:12,725] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:13,099] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:13,334] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:13,413] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:13,766] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:14,287] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:16,262] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:16,262] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 4
+[2024-09-18 09:23:16,538] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 4
+[2024-09-18 09:23:16,683] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:16,683] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 5
+[2024-09-18 09:23:16,975] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 5
+[2024-09-18 09:23:18,768] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:18,919] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:18,942] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:18,943] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:19,722] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+wandb: Currently logged in as: pkuhxy (pkuhxy-Peking University). Use `wandb login --relogin` to force relogin
+wandb: Waiting for wandb.init()...
wandb: wandb version 0.18.1 is available! To upgrade, please run:
+wandb: $ pip install wandb --upgrade
+wandb: Tracking run with wandb version 0.16.3
+wandb: Run data is saved locally in /home/image_data/hxy/Open-Sora-Plan/wandb/run-20240918_092320-8wekatqc
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run desert-sponge-10
+wandb: ⭐️ View project at https://wandb.ai/pkuhxy-Peking%20University/allinpaint_stage1
+wandb: 🚀 View run at https://wandb.ai/pkuhxy-Peking%20University/allinpaint_stage1/runs/8wekatqc
+09/18/2024 09:23:22 - INFO - __main__ - ***** Running training *****
+09/18/2024 09:23:22 - INFO - __main__ - Model = DeepSpeedEngine(
+ (module): OpenSoraInpaint(
+ (pos_embed): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (transformer_blocks): ModuleList(
+ (0-31): 32 x BasicTransformerBlock(
+ (norm1): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn1): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (norm2): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn2): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (ff): FeedForward(
+ (net): ModuleList(
+ (0): GELU(
+ (proj): Linear(in_features=2304, out_features=9216, bias=True)
+ )
+ (1): Dropout(p=0.0, inplace=False)
+ (2): Linear(in_features=9216, out_features=2304, bias=True)
+ )
+ )
+ )
+ )
+ (norm_out): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (proj_out): Linear(in_features=2304, out_features=32, bias=True)
+ (adaln_single): AdaLayerNormSingle(
+ (emb): PixArtAlphaCombinedTimestepSizeEmbeddings(
+ (time_proj): Timesteps()
+ (timestep_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (caption_projection): PixArtAlphaTextProjection(
+ (linear_1): Linear(in_features=4096, out_features=2304, bias=True)
+ (act_1): GELU(approximate='tanh')
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ (motion_projection): MotionAdaLayerNormSingle(
+ (emb): MotionEmbeddings(
+ (motion_proj): Timesteps()
+ (motion_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (pos_embed_mask): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(4, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ (pos_embed_masked_hidden_states): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ )
+)
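
The PatchEmbed2D blocks in the dump tokenize latent frames with a strided convolution. A minimal sketch matching the printed Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2)) shape (the real implementation lives in the Open-Sora-Plan repo and may differ):

    import torch
    import torch.nn as nn

    class PatchEmbed2D(nn.Module):
        # Sketch only: mirrors the printed proj layer, not the repo's exact code.
        def __init__(self, in_channels=8, embed_dim=2304, patch_size=2):
            super().__init__()
            self.proj = nn.Conv2d(in_channels, embed_dim,
                                  kernel_size=patch_size, stride=patch_size)

        def forward(self, x):                    # x: (B, 8, H, W) latent frame
            x = self.proj(x)                     # (B, 2304, H/2, W/2)
            return x.flatten(2).transpose(1, 2)  # (B, H*W/4, 2304) token sequence
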
+09/18/2024 09:23:22 - INFO - __main__ - Num examples = 474899
+09/18/2024 09:23:22 - INFO - __main__ - Num Epochs = 17
+09/18/2024 09:23:22 - INFO - __main__ - Instantaneous batch size per device = 1
+09/18/2024 09:23:22 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8
+09/18/2024 09:23:22 - INFO - __main__ - Gradient Accumulation steps = 1
+09/18/2024 09:23:22 - INFO - __main__ - Total optimization steps = 1000000
+09/18/2024 09:23:22 - INFO - __main__ - Total optimization steps (num_update_steps_per_epoch) = 59362
+09/18/2024 09:23:22 - INFO - __main__ - Total trainable parameters = 2.8204808 B
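
The epoch/step bookkeeping above is self-consistent; a quick check (the exact rounding used by the training script is an assumption):

    num_examples = 474_899
    total_batch_size = 8                                # parallel x micro-batch x accumulation
    max_train_steps = 1_000_000

    steps_per_epoch = num_examples // total_batch_size  # 59362 ("shuffled_megabatches 59363" includes the final partial batch)
    num_epochs = -(-max_train_steps // steps_per_epoch) # ceiling division -> 17
    print(steps_per_epoch, num_epochs)                  # 59362 17
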
+Resuming from checkpoint checkpoint-13000
+09/18/2024 09:23:22 - INFO - accelerate.accelerator - Loading states from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000
+09/18/2024 09:23:22 - INFO - accelerate.accelerator - Loading DeepSpeed Model and Optimizer
+[2024-09-18 09:23:22,967] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:23,746] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:24,841] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:24,864] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:25,360] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:25,430] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:26,500] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:27,025] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:27,749] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:27,749] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 2
+[2024-09-18 09:23:28,015] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 2
+[2024-09-18 09:23:28,032] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:28,033] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 7
+[2024-09-18 09:23:29,237] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 7
+[2024-09-18 09:23:29,345] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:29,346] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:29,627] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:29,629] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:31,125] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:33,822] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:34,833] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:34,908] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:34,937] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:35,246] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:35,274] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-18 09:23:36,809] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:36,810] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 09:23:37,179] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:37,179] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 6
+[2024-09-18 09:23:37,426] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:37,427] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 3
+[2024-09-18 09:23:37,464] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 6
+[2024-09-18 09:23:37,622] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 3
+[2024-09-18 09:23:37,652] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:37,653] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 0
+[2024-09-18 09:23:37,850] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 0
+09/18/2024 09:23:37 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer loaded from input dir /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model
+The config attributes {'decay': 0.9999, 'inv_gamma': 1.0, 'min_decay': 0.0, 'optimization_step': 13000, 'power': 0.6666666666666666, 'update_after_step': 0, 'use_ema_warmup': False} were passed to OpenSoraInpaint, but are not expected and will be ignored. Please verify your config.json configuration file.
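
The ignored attributes are EMA bookkeeping (decay schedule, optimization_step) stored alongside the weights; diffusers-style training scripts usually restore them through EMAModel rather than the model class itself. A hedged sketch (the "model_ema" subfolder name is an assumption):

    from diffusers.training_utils import EMAModel

    def load_ema(checkpoint_dir, model_cls):
        # Hypothetical helper: reads decay=0.9999, power=2/3 etc. from config.json
        return EMAModel.from_pretrained(f"{checkpoint_dir}/model_ema", model_cls)
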
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [164918]
+[2024-09-18 09:23:42,341] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 09:23:42,775] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[] -> [184079]
+[] -> [164918]
+[] -> [164918]
+[] -> [184079]
+[] -> [184079]
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+[2024-09-18 09:23:45,062] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /home/save_dir/runs/allinpaint_stage1/checkpoint-13000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-18 09:23:45,062] [INFO] [engine.py:2998:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 1
+[2024-09-18 09:23:45,334] [INFO] [engine.py:2930:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 1
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [164918]
+[] -> [164918]
+[] -> [195210]
+[] -> [164918]
+[] -> [164918]
+[] -> [195210]
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
[] -> [195210]
+[] -> [164918]
+[] -> [195210]
+[] -> [164918]
+[] -> [184079]
+09/18/2024 09:24:02 - INFO - accelerate.checkpointing - All model weights loaded successfully
+09/18/2024 09:24:02 - INFO - accelerate.checkpointing - All optimizer states loaded successfully
+09/18/2024 09:24:02 - INFO - accelerate.checkpointing - All scheduler states loaded successfully
+09/18/2024 09:24:02 - INFO - accelerate.checkpointing - All dataloader sampler states loaded successfully
+09/18/2024 09:24:02 - INFO - accelerate.checkpointing - All random states loaded successfully
+09/18/2024 09:24:02 - INFO - accelerate.accelerator - Loading in 0 custom states
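
The resume sequence above (model, optimizer, scheduler, sampler and RNG states) is what accelerate's load_state restores in one call. A minimal sketch, assuming the surrounding prepare() and step bookkeeping (not taken from the script):

    import os
    from accelerate import Accelerator

    accelerator = Accelerator()  # the script additionally configures a DeepSpeed plugin
    # model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(...)

    resume_dir = "/home/save_dir/runs/allinpaint_stage1/checkpoint-13000"
    accelerator.load_state(resume_dir)  # restores everything listed in the log above
    global_step = int(os.path.basename(resume_dir).split("-")[-1])  # 13000
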
+
Steps: 1%|▏ | 13000/1000000 [00:00, ?it/s][] -> [164918]
+[] -> [164918]
+[] -> [164918]
+[] -> [195210]
+[] -> [164918]
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+[W VariableFallbackKernel.cpp:51] Warning: CAUTION: The operator 'torchvision::nms' is not currently supported on the NPU backend and will fall back to run on the CPU. This may have performance implications. (function npu_cpu_fallback)
+Warning: Device do not support double dtype now, dtype cast repalce with float.
+Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='npu') to create tensors.
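
The replacement the warning recommends, as a one-liner (assumes a working torch_npu install on this Ascend environment):

    import torch
    import torch_npu  # Ascend backend; registers the "npu" device

    # instead of the deprecated torch.npu.FloatTensor(...):
    x = torch.tensor([1.0, 2.0], dtype=torch.float32, device="npu")
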
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations
+  warnings.warn(
+
Steps: 1%|▏ | 13001/1000000 [01:08<18666:08:11, 68.08s/it][RANK-0]: Step: [13001], local_loss=0.6912007927894592, train_loss=0.12457204610109329, time_cost=19.01971125602722
+
Steps: 1%|▏ | 13001/1000000 [01:08<18666:08:11, 68.08s/it, lr=1e-5, step_loss=0.691]
Steps: 1%|▏ | 13002/1000000 [01:21<9797:18:55, 35.73s/it, lr=1e-5, step_loss=0.691] [RANK-0]: Step: [13002], local_loss=0.05346745625138283, train_loss=0.15149728953838348, time_cost=3.7545714378356934
+
Steps: 1%|▏ | 13002/1000000 [01:21<9797:18:55, 35.73s/it, lr=1e-5, step_loss=0.0535]
Steps: 1%|▏ | 13003/1000000 [01:27<6056:04:01, 22.09s/it, lr=1e-5, step_loss=0.0535][RANK-0]: Step: [13003], local_loss=0.05149325728416443, train_loss=0.05202096700668335, time_cost=1.3344981670379639
+
Steps: 1%|▏ | 13003/1000000 [01:27<6056:04:01, 22.09s/it, lr=1e-5, step_loss=0.0515]
Steps: 1%|▏ | 13004/1000000 [01:39<4968:33:34, 18.12s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [13004], local_loss=0.09427880495786667, train_loss=0.2024877518415451, time_cost=4.650153875350952
+
Steps: 1%|▏ | 13004/1000000 [01:39<4968:33:34, 18.12s/it, lr=1e-5, step_loss=0.0943]
Steps: 1%|▏ | 13005/1000000 [01:47<4016:39:12, 14.65s/it, lr=1e-5, step_loss=0.0943][RANK-0]: Step: [13005], local_loss=0.09295719861984253, train_loss=0.07010133564472198, time_cost=2.67057728767395
+
Steps: 1%|▏ | 13005/1000000 [01:47<4016:39:12, 14.65s/it, lr=1e-5, step_loss=0.093]
Steps: 1%|▏ | 13006/1000000 [02:00<3857:48:01, 14.07s/it, lr=1e-5, step_loss=0.093][RANK-0]: Step: [13006], local_loss=0.25009989738464355, train_loss=0.08790505677461624, time_cost=3.0588011741638184
+
Steps: 1%|▏ | 13006/1000000 [02:00<3857:48:01, 14.07s/it, lr=1e-5, step_loss=0.25]
Steps: 1%|▏ | 13007/1000000 [02:12<3631:26:20, 13.25s/it, lr=1e-5, step_loss=0.25][RANK-0]: Step: [13007], local_loss=0.017799871042370796, train_loss=0.04780075326561928, time_cost=4.980525732040405
+
Steps: 1%|▏ | 13007/1000000 [02:12<3631:26:20, 13.25s/it, lr=1e-5, step_loss=0.0178]
Steps: 1%|▏ | 13008/1000000 [02:25<3614:36:52, 13.18s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [13008], local_loss=0.019148966297507286, train_loss=0.13378217816352844, time_cost=3.6623857021331787
+
Steps: 1%|▏ | 13008/1000000 [02:25<3614:36:52, 13.18s/it, lr=1e-5, step_loss=0.0191]
Steps: 1%|▏ | 13009/1000000 [02:35<3384:15:27, 12.34s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [13009], local_loss=0.05808651074767113, train_loss=0.07216843962669373, time_cost=4.509132146835327
+
Steps: 1%|▏ | 13009/1000000 [02:35<3384:15:27, 12.34s/it, lr=1e-5, step_loss=0.0581]
Steps: 1%|▏ | 13010/1000000 [02:42<2957:24:19, 10.79s/it, lr=1e-5, step_loss=0.0581][RANK-0]: Step: [13010], local_loss=0.08228850364685059, train_loss=0.14680400490760803, time_cost=5.417214870452881
+
Steps: 1%|▏ | 13010/1000000 [02:42<2957:24:19, 10.79s/it, lr=1e-5, step_loss=0.0823]
Steps: 1%|▏ | 13011/1000000 [02:57<3261:44:27, 11.90s/it, lr=1e-5, step_loss=0.0823][RANK-0]: Step: [13011], local_loss=0.07371076941490173, train_loss=0.11238759011030197, time_cost=10.597983837127686
+
Steps: 1%|▏ | 13011/1000000 [02:57<3261:44:27, 11.90s/it, lr=1e-5, step_loss=0.0737]
Steps: 1%|▏ | 13012/1000000 [03:02<2716:23:18, 9.91s/it, lr=1e-5, step_loss=0.0737][RANK-0]: Step: [13012], local_loss=0.07446154952049255, train_loss=0.10717670619487762, time_cost=4.037163734436035
+
Steps: 1%|▏ | 13012/1000000 [03:02<2716:23:18, 9.91s/it, lr=1e-5, step_loss=0.0745]
Steps: 1%|▏ | 13013/1000000 [03:14<2876:15:11, 10.49s/it, lr=1e-5, step_loss=0.0745][RANK-0]: Step: [13013], local_loss=0.07520817965269089, train_loss=0.09095172584056854, time_cost=6.417724847793579
+
Steps: 1%|▏ | 13013/1000000 [03:14<2876:15:11, 10.49s/it, lr=1e-5, step_loss=0.0752]
Steps: 1%|▏ | 13014/1000000 [03:25<2921:26:02, 10.66s/it, lr=1e-5, step_loss=0.0752][RANK-0]: Step: [13014], local_loss=0.09652598202228546, train_loss=0.04960218444466591, time_cost=1.1971819400787354
+
Steps: 1%|▏ | 13014/1000000 [03:25<2921:26:02, 10.66s/it, lr=1e-5, step_loss=0.0965]
Steps: 1%|▏ | 13015/1000000 [03:30<2424:49:38, 8.84s/it, lr=1e-5, step_loss=0.0965][RANK-0]: Step: [13015], local_loss=0.04726234823465347, train_loss=0.06895175576210022, time_cost=1.2257764339447021
+
Steps: 1%|▏ | 13015/1000000 [03:30<2424:49:38, 8.84s/it, lr=1e-5, step_loss=0.0473]
Steps: 1%|▏ | 13016/1000000 [03:38<2348:13:01, 8.57s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [13016], local_loss=0.05506616458296776, train_loss=0.3400516211986542, time_cost=6.736793279647827
+
Steps: 1%|▏ | 13016/1000000 [03:38<2348:13:01, 8.57s/it, lr=1e-5, step_loss=0.0551]
Steps: 1%|▏ | 13017/1000000 [03:53<2948:58:34, 10.76s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [13017], local_loss=0.05423709377646446, train_loss=0.06873168051242828, time_cost=7.181159257888794
+
Steps: 1%|▏ | 13017/1000000 [03:53<2948:58:34, 10.76s/it, lr=1e-5, step_loss=0.0542]
Steps: 1%|▏ | 13018/1000000 [04:01<2684:23:15, 9.79s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [13018], local_loss=0.040669962763786316, train_loss=0.040384046733379364, time_cost=2.5510759353637695
+
Steps: 1%|▏ | 13018/1000000 [04:01<2684:23:15, 9.79s/it, lr=1e-5, step_loss=0.0407]
Steps: 1%|▏ | 13019/1000000 [04:10<2608:07:55, 9.51s/it, lr=1e-5, step_loss=0.0407][RANK-0]: Step: [13019], local_loss=0.05926625803112984, train_loss=0.11356660723686218, time_cost=3.3819761276245117
+
Steps: 1%|▏ | 13019/1000000 [04:10<2608:07:55, 9.51s/it, lr=1e-5, step_loss=0.0593]
Steps: 1%|▏ | 13020/1000000 [04:20<2673:09:02, 9.75s/it, lr=1e-5, step_loss=0.0593][RANK-0]: Step: [13020], local_loss=0.09075923264026642, train_loss=0.07698927819728851, time_cost=2.5750885009765625
+
Steps: 1%|▏ | 13020/1000000 [04:20<2673:09:02, 9.75s/it, lr=1e-5, step_loss=0.0908]
Steps: 1%|▏ | 13021/1000000 [04:28<2513:54:26, 9.17s/it, lr=1e-5, step_loss=0.0908][RANK-0]: Step: [13021], local_loss=0.04174676910042763, train_loss=0.05156717076897621, time_cost=1.179793357849121
+
Steps: 1%|▏ | 13021/1000000 [04:28<2513:54:26, 9.17s/it, lr=1e-5, step_loss=0.0417]
Steps: 1%|▏ | 13022/1000000 [04:43<3007:14:17, 10.97s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [13022], local_loss=0.8709360957145691, train_loss=0.15748746693134308, time_cost=7.2980382442474365
+
Steps: 1%|▏ | 13022/1000000 [04:43<3007:14:17, 10.97s/it, lr=1e-5, step_loss=0.871] -
\
\
Steps: 1%|▏ | 13023/1000000 [04:58<3332:11:53, 12.15s/it, lr=1e-5, step_loss=0.871][RANK-0]: Step: [13023], local_loss=0.06282350420951843, train_loss=0.04616738110780716, time_cost=1.1814916133880615
+
Steps: 1%|▏ | 13023/1000000 [04:58<3332:11:53, 12.15s/it, lr=1e-5, step_loss=0.0628]
Steps: 1%|▏ | 13024/1000000 [05:15<3685:12:27, 13.44s/it, lr=1e-5, step_loss=0.0628][RANK-0]: Step: [13024], local_loss=0.049919042736291885, train_loss=0.05239512026309967, time_cost=1.166670799255371
+
Steps: 1%|▏ | 13024/1000000 [05:15<3685:12:27, 13.44s/it, lr=1e-5, step_loss=0.0499]
Steps: 1%|▏ | 13025/1000000 [05:27<3574:47:06, 13.04s/it, lr=1e-5, step_loss=0.0499][RANK-0]: Step: [13025], local_loss=0.03861309587955475, train_loss=0.03512583300471306, time_cost=1.349654197692871
+
Steps: 1%|▏ | 13025/1000000 [05:27<3574:47:06, 13.04s/it, lr=1e-5, step_loss=0.0386]
Steps: 1%|▏ | 13026/1000000 [05:33<2998:16:11, 10.94s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [13026], local_loss=0.15979133546352386, train_loss=0.1384120136499405, time_cost=4.767601490020752
+
Steps: 1%|▏ | 13026/1000000 [05:33<2998:16:11, 10.94s/it, lr=1e-5, step_loss=0.16]
Steps: 1%|▏ | 13027/1000000 [05:41<2783:19:10, 10.15s/it, lr=1e-5, step_loss=0.16][RANK-0]: Step: [13027], local_loss=0.09809400141239166, train_loss=0.06433211266994476, time_cost=2.022052526473999
+
Steps: 1%|▏ | 13027/1000000 [05:41<2783:19:10, 10.15s/it, lr=1e-5, step_loss=0.0981]
Steps: 1%|▏ | 13028/1000000 [05:56<3198:43:07, 11.67s/it, lr=1e-5, step_loss=0.0981][RANK-0]: Step: [13028], local_loss=0.2219545543193817, train_loss=0.06990769505500793, time_cost=5.798627853393555
+
Steps: 1%|▏ | 13028/1000000 [05:56<3198:43:07, 11.67s/it, lr=1e-5, step_loss=0.222]
Steps: 1%|▏ | 13029/1000000 [06:11<3491:42:00, 12.74s/it, lr=1e-5, step_loss=0.222][RANK-0]: Step: [13029], local_loss=0.03141722083091736, train_loss=0.05907723680138588, time_cost=5.804453611373901
+
Steps: 1%|▏ | 13029/1000000 [06:11<3491:42:00, 12.74s/it, lr=1e-5, step_loss=0.0314]
Steps: 1%|▏ | 13030/1000000 [06:20<3172:55:28, 11.57s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [13030], local_loss=0.03849932178854942, train_loss=0.06205734238028526, time_cost=6.8168840408325195
+
Steps: 1%|▏ | 13030/1000000 [06:20<3172:55:28, 11.57s/it, lr=1e-5, step_loss=0.0385]
Steps: 1%|▏ | 13031/1000000 [06:29<2935:16:08, 10.71s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [13031], local_loss=0.039324142038822174, train_loss=0.05625995993614197, time_cost=1.6398472785949707
+
Steps: 1%|▏ | 13031/1000000 [06:29<2935:16:08, 10.71s/it, lr=1e-5, step_loss=0.0393]
Steps: 1%|▏ | 13032/1000000 [06:38<2823:04:25, 10.30s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [13032], local_loss=0.11689900606870651, train_loss=0.09351343661546707, time_cost=1.6884443759918213
+
Steps: 1%|▏ | 13032/1000000 [06:38<2823:04:25, 10.30s/it, lr=1e-5, step_loss=0.117]
Steps: 1%|▏ | 13033/1000000 [06:49<2862:35:14, 10.44s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [13033], local_loss=0.025134621188044548, train_loss=0.03481939807534218, time_cost=1.7861218452453613
+
Steps: 1%|▏ | 13033/1000000 [06:49<2862:35:14, 10.44s/it, lr=1e-5, step_loss=0.0251]
Steps: 1%|▏ | 13034/1000000 [07:03<3122:23:17, 11.39s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [13034], local_loss=0.061988476663827896, train_loss=0.0456680990755558, time_cost=3.758488893508911
+
Steps: 1%|▏ | 13034/1000000 [07:03<3122:23:17, 11.39s/it, lr=1e-5, step_loss=0.062]
Steps: 1%|▏ | 13035/1000000 [07:08<2610:01:06, 9.52s/it, lr=1e-5, step_loss=0.062][RANK-0]: Step: [13035], local_loss=0.031341444700956345, train_loss=0.03647599741816521, time_cost=2.0890414714813232
+
Steps: 1%|▏ | 13035/1000000 [07:08<2610:01:06, 9.52s/it, lr=1e-5, step_loss=0.0313]
Steps: 1%|▏ | 13036/1000000 [07:17<2554:00:26, 9.32s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [13036], local_loss=0.042183853685855865, train_loss=0.06822597980499268, time_cost=3.629178047180176
+
Steps: 1%|▏ | 13036/1000000 [07:17<2554:00:26, 9.32s/it, lr=1e-5, step_loss=0.0422]
Steps: 1%|▏ | 13037/1000000 [07:22<2188:24:24, 7.98s/it, lr=1e-5, step_loss=0.0422][RANK-0]: Step: [13037], local_loss=0.06189291551709175, train_loss=0.05200623348355293, time_cost=1.892902135848999
+
Steps: 1%|▏ | 13037/1000000 [07:22<2188:24:24, 7.98s/it, lr=1e-5, step_loss=0.0619]
Steps: 1%|▏ | 13038/1000000 [07:29<2149:23:35, 7.84s/it, lr=1e-5, step_loss=0.0619][RANK-0]: Step: [13038], local_loss=0.08573242276906967, train_loss=0.08797606825828552, time_cost=3.000669240951538
+
Steps: 1%|▏ | 13038/1000000 [07:29<2149:23:35, 7.84s/it, lr=1e-5, step_loss=0.0857]
Steps: 1%|▏ | 13039/1000000 [07:36<2093:31:45, 7.64s/it, lr=1e-5, step_loss=0.0857][RANK-0]: Step: [13039], local_loss=0.07212965935468674, train_loss=0.12634927034378052, time_cost=1.7842943668365479
+
Steps: 1%|▏ | 13039/1000000 [07:36<2093:31:45, 7.64s/it, lr=1e-5, step_loss=0.0721]
Steps: 1%|▏ | 13040/1000000 [07:49<2504:04:10, 9.13s/it, lr=1e-5, step_loss=0.0721][RANK-0]: Step: [13040], local_loss=0.05564303323626518, train_loss=0.08759918808937073, time_cost=1.8180129528045654
+
Steps: 1%|▏ | 13040/1000000 [07:49<2504:04:10, 9.13s/it, lr=1e-5, step_loss=0.0556]
Steps: 1%|▏ | 13041/1000000 [07:54<2139:33:28, 7.80s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [13041], local_loss=0.03688441589474678, train_loss=0.04052276164293289, time_cost=1.7254462242126465
+
Steps: 1%|▏ | 13041/1000000 [07:54<2139:33:28, 7.80s/it, lr=1e-5, step_loss=0.0369]
Steps: 1%|▏ | 13042/1000000 [07:58<1857:42:35, 6.78s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [13042], local_loss=0.09647025913000107, train_loss=0.0604621097445488, time_cost=1.1955194473266602
+
Steps: 1%|▏ | 13042/1000000 [07:58<1857:42:35, 6.78s/it, lr=1e-5, step_loss=0.0965]
Steps: 1%|▏ | 13043/1000000 [08:04<1769:44:30, 6.46s/it, lr=1e-5, step_loss=0.0965][RANK-0]: Step: [13043], local_loss=0.030699556693434715, train_loss=0.06671479344367981, time_cost=2.955723762512207
+
Steps: 1%|▏ | 13043/1000000 [08:04<1769:44:30, 6.46s/it, lr=1e-5, step_loss=0.0307]
Steps: 1%|▏ | 13044/1000000 [08:15<2210:01:17, 8.06s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [13044], local_loss=0.0862542912364006, train_loss=0.07042145729064941, time_cost=3.6811792850494385
+
Steps: 1%|▏ | 13044/1000000 [08:15<2210:01:17, 8.06s/it, lr=1e-5, step_loss=0.0863]
Steps: 1%|▏ | 13045/1000000 [08:19<1876:51:41, 6.85s/it, lr=1e-5, step_loss=0.0863][RANK-0]: Step: [13045], local_loss=0.03839487209916115, train_loss=0.11812586337327957, time_cost=1.3724353313446045
+
Steps: 1%|▏ | 13045/1000000 [08:19<1876:51:41, 6.85s/it, lr=1e-5, step_loss=0.0384]
Steps: 1%|▏ | 13046/1000000 [08:26<1858:02:22, 6.78s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [13046], local_loss=0.21678945422172546, train_loss=0.07650742679834366, time_cost=1.6032350063323975
+
Steps: 1%|▏ | 13046/1000000 [08:26<1858:02:22, 6.78s/it, lr=1e-5, step_loss=0.217]
Steps: 1%|▏ | 13047/1000000 [08:31<1711:33:35, 6.24s/it, lr=1e-5, step_loss=0.217][RANK-0]: Step: [13047], local_loss=0.4205295443534851, train_loss=0.10969267040491104, time_cost=3.995391607284546
+
Steps: 1%|▏ | 13047/1000000 [08:31<1711:33:35, 6.24s/it, lr=1e-5, step_loss=0.421]
Steps: 1%|▏ | 13048/1000000 [08:41<2049:46:18, 7.48s/it, lr=1e-5, step_loss=0.421][RANK-0]: Step: [13048], local_loss=0.06901390105485916, train_loss=0.12885023653507233, time_cost=2.125579595565796
+
Steps: 1%|▏ | 13048/1000000 [08:41<2049:46:18, 7.48s/it, lr=1e-5, step_loss=0.069]
Steps: 1%|▏ | 13049/1000000 [08:54<2475:55:55, 9.03s/it, lr=1e-5, step_loss=0.069][RANK-0]: Step: [13049], local_loss=0.4977265000343323, train_loss=0.11252966523170471, time_cost=3.8791282176971436
+
Steps: 1%|▏ | 13049/1000000 [08:54<2475:55:55, 9.03s/it, lr=1e-5, step_loss=0.498]
Steps: 1%|▏ | 13050/1000000 [09:11<3131:36:46, 11.42s/it, lr=1e-5, step_loss=0.498][RANK-0]: Step: [13050], local_loss=0.040667250752449036, train_loss=0.06971438229084015, time_cost=8.345251321792603
+
Steps: 1%|▏ | 13050/1000000 [09:11<3131:36:46, 11.42s/it, lr=1e-5, step_loss=0.0407]
Steps: 1%|▏ | 13051/1000000 [09:23<3198:52:19, 11.67s/it, lr=1e-5, step_loss=0.0407][RANK-0]: Step: [13051], local_loss=0.11105022579431534, train_loss=0.08668375015258789, time_cost=2.6864776611328125
+
Steps: 1%|▏ | 13051/1000000 [09:23<3198:52:19, 11.67s/it, lr=1e-5, step_loss=0.111]
Steps: 1%|▏ | 13052/1000000 [09:28<2587:28:30, 9.44s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [13052], local_loss=0.04316336661577225, train_loss=0.045426253229379654, time_cost=3.1966445446014404
+
Steps: 1%|▏ | 13052/1000000 [09:28<2587:28:30, 9.44s/it, lr=1e-5, step_loss=0.0432]
Steps: 1%|▏ | 13053/1000000 [09:37<2589:32:05, 9.45s/it, lr=1e-5, step_loss=0.0432][RANK-0]: Step: [13053], local_loss=0.08944638073444366, train_loss=0.1109633594751358, time_cost=1.5816352367401123
+
Steps: 1%|▏ | 13053/1000000 [09:37<2589:32:05, 9.45s/it, lr=1e-5, step_loss=0.0894]
Steps: 1%|▏ | 13054/1000000 [09:42<2237:52:01, 8.16s/it, lr=1e-5, step_loss=0.0894][RANK-0]: Step: [13054], local_loss=0.0322047621011734, train_loss=0.04483100026845932, time_cost=2.3469419479370117
+
Steps: 1%|▏ | 13054/1000000 [09:42<2237:52:01, 8.16s/it, lr=1e-5, step_loss=0.0322]
Steps: 1%|▏ | 13055/1000000 [09:50<2237:07:00, 8.16s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [13055], local_loss=0.04546746239066124, train_loss=0.08085073530673981, time_cost=1.1877892017364502
+
Steps: 1%|▏ | 13055/1000000 [09:50<2237:07:00, 8.16s/it, lr=1e-5, step_loss=0.0455]
Steps: 1%|▏ | 13056/1000000 [10:03<2641:38:07, 9.64s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [13056], local_loss=0.030740784481167793, train_loss=0.051894668489694595, time_cost=1.177433729171753
+
Steps: 1%|▏ | 13056/1000000 [10:03<2641:38:07, 9.64s/it, lr=1e-5, step_loss=0.0307]
Steps: 1%|▏ | 13057/1000000 [10:08<2263:39:22, 8.26s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [13057], local_loss=0.028211969882249832, train_loss=0.07409092783927917, time_cost=1.7246479988098145
+
Steps: 1%|▏ | 13057/1000000 [10:08<2263:39:22, 8.26s/it, lr=1e-5, step_loss=0.0282]
Steps: 1%|▏ | 13058/1000000 [10:17<2254:55:22, 8.23s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [13058], local_loss=0.09834682941436768, train_loss=0.08208297938108444, time_cost=6.672968864440918
+
Steps: 1%|▏ | 13058/1000000 [10:17<2254:55:22, 8.23s/it, lr=1e-5, step_loss=0.0983]
Steps: 1%|▏ | 13059/1000000 [10:26<2323:45:17, 8.48s/it, lr=1e-5, step_loss=0.0983][RANK-0]: Step: [13059], local_loss=0.07239135354757309, train_loss=0.06804746389389038, time_cost=1.1543831825256348
+
Steps: 1%|▏ | 13059/1000000 [10:26<2323:45:17, 8.48s/it, lr=1e-5, step_loss=0.0724]
Steps: 1%|▏ | 13060/1000000 [10:30<1972:18:18, 7.19s/it, lr=1e-5, step_loss=0.0724][RANK-0]: Step: [13060], local_loss=0.031734079122543335, train_loss=0.04941225051879883, time_cost=1.232001781463623
+
Steps: 1%|▏ | 13060/1000000 [10:30<1972:18:18, 7.19s/it, lr=1e-5, step_loss=0.0317]
Steps: 1%|▏ | 13061/1000000 [10:36<1917:09:08, 6.99s/it, lr=1e-5, step_loss=0.0317][RANK-0]: Step: [13061], local_loss=0.0321425125002861, train_loss=0.051371339708566666, time_cost=2.02801251411438
+
Steps: 1%|▏ | 13061/1000000 [10:36<1917:09:08, 6.99s/it, lr=1e-5, step_loss=0.0321]
Steps: 1%|▏ | 13062/1000000 [10:47<2208:46:20, 8.06s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [13062], local_loss=0.0281971488147974, train_loss=0.03899931535124779, time_cost=1.2628154754638672
+
Steps: 1%|▏ | 13062/1000000 [10:47<2208:46:20, 8.06s/it, lr=1e-5, step_loss=0.0282]
Steps: 1%|▏ | 13063/1000000 [10:51<1911:21:10, 6.97s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [13063], local_loss=0.034505829215049744, train_loss=0.036058228462934494, time_cost=2.996906280517578
+
Steps: 1%|▏ | 13063/1000000 [10:51<1911:21:10, 6.97s/it, lr=1e-5, step_loss=0.0345]
Steps: 1%|▏ | 13064/1000000 [10:57<1803:41:50, 6.58s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [13064], local_loss=0.10796792805194855, train_loss=0.11528446525335312, time_cost=1.1932508945465088
+
Steps: 1%|▏ | 13064/1000000 [10:57<1803:41:50, 6.58s/it, lr=1e-5, step_loss=0.108]
Steps: 1%|▏ | 13065/1000000 [11:09<2248:30:06, 8.20s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [13065], local_loss=0.016511397436261177, train_loss=0.08320055902004242, time_cost=5.838212490081787
+
Steps: 1%|▏ | 13065/1000000 [11:09<2248:30:06, 8.20s/it, lr=1e-5, step_loss=0.0165]
Steps: 1%|▏ | 13066/1000000 [11:18<2343:05:11, 8.55s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [13066], local_loss=0.053020454943180084, train_loss=0.09214553236961365, time_cost=3.3808982372283936
+
Steps: 1%|▏ | 13066/1000000 [11:18<2343:05:11, 8.55s/it, lr=1e-5, step_loss=0.053]
Steps: 1%|▏ | 13067/1000000 [11:23<2055:27:28, 7.50s/it, lr=1e-5, step_loss=0.053][RANK-0]: Step: [13067], local_loss=0.021782565861940384, train_loss=0.19200366735458374, time_cost=2.2089591026306152
+
Steps: 1%|▏ | 13067/1000000 [11:23<2055:27:28, 7.50s/it, lr=1e-5, step_loss=0.0218]
Steps: 1%|▏ | 13068/1000000 [11:29<1873:21:58, 6.83s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [13068], local_loss=0.053066276013851166, train_loss=0.06589754670858383, time_cost=2.5695953369140625
+
Steps: 1%|▏ | 13068/1000000 [11:29<1873:21:58, 6.83s/it, lr=1e-5, step_loss=0.0531]
Steps: 1%|▏ | 13069/1000000 [11:36<1889:19:33, 6.89s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [13069], local_loss=12.626391410827637, train_loss=1.7698101997375488, time_cost=2.0311930179595947
+
Steps: 1%|▏ | 13069/1000000 [11:36<1889:19:33, 6.89s/it, lr=1e-5, step_loss=12.6]
Steps: 1%|▏ | 13070/1000000 [11:41<1779:30:41, 6.49s/it, lr=1e-5, step_loss=12.6][RANK-0]: Step: [13070], local_loss=0.1008138582110405, train_loss=0.1352553814649582, time_cost=3.000626802444458
Steps: 1%|▏ | 13071/1000000 [11:49<1901:25:36, 6.94s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [13071], local_loss=0.0929906815290451, train_loss=0.06481929123401642, time_cost=4.204439163208008
Steps: 1%|▏ | 13072/1000000 [11:58<2045:10:05, 7.46s/it, lr=1e-5, step_loss=0.093][RANK-0]: Step: [13072], local_loss=0.041687414050102234, train_loss=0.042519938200712204, time_cost=2.517719030380249
Steps: 1%|▏ | 13073/1000000 [12:09<2312:53:23, 8.44s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [13073], local_loss=0.027906322851777077, train_loss=0.057496048510074615, time_cost=1.2641541957855225
Steps: 1%|▏ | 13074/1000000 [12:16<2234:38:38, 8.15s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [13074], local_loss=0.05297687649726868, train_loss=0.08475500345230103, time_cost=3.516213893890381
Steps: 1%|▏ | 13075/1000000 [12:26<2374:51:10, 8.66s/it, lr=1e-5, step_loss=0.053][RANK-0]: Step: [13075], local_loss=0.017238549888134003, train_loss=0.12071967124938965, time_cost=1.200681447982788
Steps: 1%|▏ | 13076/1000000 [12:37<2553:27:00, 9.31s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [13076], local_loss=0.020224357023835182, train_loss=0.05678611993789673, time_cost=1.204789161682129
Steps: 1%|▏ | 13077/1000000 [12:45<2459:00:20, 8.97s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [13077], local_loss=0.036586444824934006, train_loss=0.04997251182794571, time_cost=1.7450807094573975
Steps: 1%|▏ | 13078/1000000 [12:50<2159:40:07, 7.88s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [13078], local_loss=0.028620729222893715, train_loss=0.060873307287693024, time_cost=4.1338512897491455
Steps: 1%|▏ | 13079/1000000 [12:57<2063:03:42, 7.53s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [13079], local_loss=0.06756910681724548, train_loss=0.16852787137031555, time_cost=2.517662286758423
Steps: 1%|▏ | 13080/1000000 [13:11<2613:46:07, 9.53s/it, lr=1e-5, step_loss=0.0676][RANK-0]: Step: [13080], local_loss=0.02846851386129856, train_loss=2.790355920791626, time_cost=1.2054784297943115
Steps: 1%|▏ | 13081/1000000 [13:24<2901:18:01, 10.58s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [13081], local_loss=0.03340919315814972, train_loss=0.14923742413520813, time_cost=11.22702407836914
Steps: 1%|▏ | 13082/1000000 [13:33<2765:50:47, 10.09s/it, lr=1e-5, step_loss=0.0334][RANK-0]: Step: [13082], local_loss=0.24614779651165009, train_loss=0.06319369375705719, time_cost=1.1739492416381836
Steps: 1%|▏ | 13083/1000000 [13:42<2693:25:06, 9.82s/it, lr=1e-5, step_loss=0.246][RANK-0]: Step: [13083], local_loss=0.03692837059497833, train_loss=0.1230938583612442, time_cost=1.6080572605133057
Steps: 1%|▏ | 13084/1000000 [13:56<3025:51:56, 11.04s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [13084], local_loss=0.013816016726195812, train_loss=0.0423557385802269, time_cost=1.1852567195892334
Steps: 1%|▏ | 13085/1000000 [14:02<2607:01:28, 9.51s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [13085], local_loss=0.024363450706005096, train_loss=0.04568168520927429, time_cost=1.8156864643096924
Steps: 1%|▏ | 13086/1000000 [14:15<2884:56:23, 10.52s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [13086], local_loss=0.2272205352783203, train_loss=0.102378249168396, time_cost=3.174506187438965
Steps: 1%|▏ | 13087/1000000 [14:22<2609:12:13, 9.52s/it, lr=1e-5, step_loss=0.227][RANK-0]: Step: [13087], local_loss=0.029507849365472794, train_loss=0.03481770306825638, time_cost=2.7101190090179443
Steps: 1%|▏ | 13088/1000000 [14:31<2562:04:07, 9.35s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [13088], local_loss=0.034952204674482346, train_loss=0.16942976415157318, time_cost=2.7203369140625
Steps: 1%|▏ | 13089/1000000 [14:36<2202:10:44, 8.03s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [13089], local_loss=0.05845103785395622, train_loss=0.07421861588954926, time_cost=1.1768662929534912
Steps: 1%|▏ | 13090/1000000 [14:43<2114:02:11, 7.71s/it, lr=1e-5, step_loss=0.0585][RANK-0]: Step: [13090], local_loss=0.044071584939956665, train_loss=0.042979493737220764, time_cost=2.6621956825256348
Steps: 1%|▏ | 13091/1000000 [14:50<2077:53:08, 7.58s/it, lr=1e-5, step_loss=0.0441][RANK-0]: Step: [13091], local_loss=0.03262651339173317, train_loss=0.04302695393562317, time_cost=2.614711046218872
Steps: 1%|▏ | 13092/1000000 [14:55<1849:47:50, 6.75s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [13092], local_loss=0.02401992492377758, train_loss=0.04160035401582718, time_cost=3.8219733238220215
Steps: 1%|▏ | 13093/1000000 [15:02<1821:09:07, 6.64s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [13093], local_loss=0.04525874927639961, train_loss=0.038348302245140076, time_cost=2.5620081424713135
Steps: 1%|▏ | 13094/1000000 [15:06<1635:55:49, 5.97s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [13094], local_loss=0.038956418633461, train_loss=0.08861736953258514, time_cost=1.7374298572540283
Steps: 1%|▏ | 13095/1000000 [15:14<1788:16:12, 6.52s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [13095], local_loss=0.03843923285603523, train_loss=0.06873827427625656, time_cost=1.6491179466247559
Steps: 1%|▏ | 13096/1000000 [15:28<2451:58:00, 8.94s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [13096], local_loss=0.04634671285748482, train_loss=0.03067866712808609, time_cost=6.420986652374268
Steps: 1%|▏ | 13097/1000000 [15:37<2385:00:39, 8.70s/it, lr=1e-5, step_loss=0.0463][RANK-0]: Step: [13097], local_loss=0.014897074550390244, train_loss=0.04523338004946709, time_cost=3.0526394844055176
Steps: 1%|▏ | 13098/1000000 [15:44<2302:14:56, 8.40s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [13098], local_loss=0.10048118233680725, train_loss=0.032033879309892654, time_cost=2.5276007652282715
Steps: 1%|▏ | 13099/1000000 [15:50<2086:43:36, 7.61s/it, lr=1e-5, step_loss=0.1][RANK-0]: Step: [13099], local_loss=0.03699418902397156, train_loss=0.052640125155448914, time_cost=1.249600887298584
Steps: 1%|▏ | 13100/1000000 [15:56<1931:52:01, 7.05s/it, lr=1e-5, step_loss=0.037][RANK-0]: Step: [13100], local_loss=0.019349968060851097, train_loss=0.04692091420292854, time_cost=1.6953916549682617
Steps: 1%|▏ | 13101/1000000 [16:06<2184:32:49, 7.97s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [13101], local_loss=0.04074251651763916, train_loss=0.05827762931585312, time_cost=7.5485780239105225
Steps: 1%|▏ | 13102/1000000 [16:13<2133:29:56, 7.78s/it, lr=1e-5, step_loss=0.0407][RANK-0]: Step: [13102], local_loss=0.04693391174077988, train_loss=0.03496510162949562, time_cost=5.060737609863281
Steps: 1%|▏ | 13103/1000000 [16:23<2285:45:12, 8.34s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [13103], local_loss=0.09972681105136871, train_loss=0.04201456904411316, time_cost=1.3102316856384277
Steps: 1%|▏ | 13104/1000000 [16:33<2421:24:41, 8.83s/it, lr=1e-5, step_loss=0.0997][RANK-0]: Step: [13104], local_loss=0.3339431881904602, train_loss=0.09735651314258575, time_cost=1.6835877895355225
Steps: 1%|▏ | 13105/1000000 [16:37<2054:12:00, 7.49s/it, lr=1e-5, step_loss=0.334][RANK-0]: Step: [13105], local_loss=1.0055567026138306, train_loss=0.18502509593963623, time_cost=1.3435649871826172
Steps: 1%|▏ | 13106/1000000 [16:49<2401:03:00, 8.76s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [13106], local_loss=0.021855153143405914, train_loss=0.04572649300098419, time_cost=2.246246337890625
Steps: 1%|▏ | 13107/1000000 [17:03<2844:00:16, 10.37s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [13107], local_loss=0.057524994015693665, train_loss=0.15326879918575287, time_cost=3.1481869220733643
Steps: 1%|▏ | 13108/1000000 [17:19<3270:44:19, 11.93s/it, lr=1e-5, step_loss=0.0575][RANK-0]: Step: [13108], local_loss=0.03417633846402168, train_loss=0.025647010654211044, time_cost=5.079161882400513
Steps: 1%|▏ | 13109/1000000 [17:27<2934:53:59, 10.71s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [13109], local_loss=0.10346468538045883, train_loss=0.03790801763534546, time_cost=5.850907802581787
Steps: 1%|▏ | 13110/1000000 [17:41<3226:55:07, 11.77s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [13110], local_loss=0.041591577231884, train_loss=0.05055317282676697, time_cost=4.300922155380249
Steps: 1%|▏ | 13111/1000000 [17:49<2966:27:42, 10.82s/it, lr=1e-5, step_loss=0.0416][RANK-0]: Step: [13111], local_loss=0.19305622577667236, train_loss=0.15996259450912476, time_cost=7.175643444061279
Steps: 1%|▏ | 13112/1000000 [18:04<3239:05:19, 11.82s/it, lr=1e-5, step_loss=0.193][RANK-0]: Step: [13112], local_loss=0.09676774591207504, train_loss=0.04465578496456146, time_cost=10.7322678565979
Steps: 1%|▏ | 13113/1000000 [18:13<3008:01:04, 10.97s/it, lr=1e-5, step_loss=0.0968][RANK-0]: Step: [13113], local_loss=0.08763279765844345, train_loss=0.12336064875125885, time_cost=1.3171472549438477
Steps: 1%|▏ | 13114/1000000 [18:22<2898:50:03, 10.57s/it, lr=1e-5, step_loss=0.0876][RANK-0]: Step: [13114], local_loss=0.18038348853588104, train_loss=0.10528422892093658, time_cost=1.6713910102844238
Steps: 1%|▏ | 13115/1000000 [18:29<2591:09:42, 9.45s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [13115], local_loss=0.10920414328575134, train_loss=0.050302062183618546, time_cost=2.1505613327026367
Steps: 1%|▏ | 13116/1000000 [18:35<2268:04:36, 8.27s/it, lr=1e-5, step_loss=0.109][RANK-0]: Step: [13116], local_loss=0.018159223720431328, train_loss=0.03283941373229027, time_cost=1.9134230613708496
Steps: 1%|▏ | 13117/1000000 [18:43<2302:57:18, 8.40s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [13117], local_loss=0.17999626696109772, train_loss=0.08191338181495667, time_cost=2.4765350818634033
Steps: 1%|▏ | 13118/1000000 [18:54<2492:20:24, 9.09s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [13118], local_loss=0.061047062277793884, train_loss=0.06060813367366791, time_cost=1.5453870296478271
Steps: 1%|▏ | 13119/1000000 [19:05<2646:29:09, 9.65s/it, lr=1e-5, step_loss=0.061][RANK-0]: Step: [13119], local_loss=0.02680654637515545, train_loss=0.05349159985780716, time_cost=2.311969757080078
Steps: 1%|▏ | 13120/1000000 [19:17<2807:56:56, 10.24s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [13120], local_loss=0.022763241082429886, train_loss=0.03609267622232437, time_cost=2.763843059539795
Steps: 1%|▏ | 13121/1000000 [19:22<2410:42:58, 8.79s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [13121], local_loss=0.019231528043746948, train_loss=0.03059130162000656, time_cost=2.4536657333374023
Steps: 1%|▏ | 13122/1000000 [19:26<2027:10:21, 7.39s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [13122], local_loss=0.03370708227157593, train_loss=16.550260543823242, time_cost=3.07731556892395
Steps: 1%|▏ | 13123/1000000 [19:35<2162:55:38, 7.89s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [13123], local_loss=0.06831838935613632, train_loss=0.06980307400226593, time_cost=1.2256388664245605
Steps: 1%|▏ | 13124/1000000 [19:41<2005:20:52, 7.32s/it, lr=1e-5, step_loss=0.0683][RANK-0]: Step: [13124], local_loss=0.04542916640639305, train_loss=0.1738218516111374, time_cost=2.5165488719940186
Steps: 1%|▏ | 13125/1000000 [19:57<2735:44:52, 9.98s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [13125], local_loss=312.01141357421875, train_loss=39.066001892089844, time_cost=6.094141483306885
Steps: 1%|▏ | 13126/1000000 [20:13<3181:58:32, 11.61s/it, lr=1e-5, step_loss=312][RANK-0]: Step: [13126], local_loss=0.01936848647892475, train_loss=0.05701569467782974, time_cost=6.352644681930542
Steps: 1%|▏ | 13127/1000000 [20:19<2733:23:59, 9.97s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [13127], local_loss=0.04245865345001221, train_loss=0.1919579952955246, time_cost=1.3725497722625732
Steps: 1%|▏ | 13128/1000000 [20:25<2398:15:01, 8.75s/it, lr=1e-5, step_loss=0.0425][RANK-0]: Step: [13128], local_loss=0.040106769651174545, train_loss=0.056483328342437744, time_cost=2.1607120037078857
Steps: 1%|▏ | 13129/1000000 [20:30<2139:40:26, 7.81s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [13129], local_loss=0.025106167420744896, train_loss=0.07494321465492249, time_cost=2.6307239532470703
Steps: 1%|▏ | 13130/1000000 [20:35<1871:49:53, 6.83s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [13130], local_loss=0.08088233321905136, train_loss=0.08678480237722397, time_cost=3.594440460205078
Steps: 1%|▏ | 13131/1000000 [20:45<2169:20:49, 7.91s/it, lr=1e-5, step_loss=0.0809][RANK-0]: Step: [13131], local_loss=0.12218077480792999, train_loss=0.12001074850559235, time_cost=3.886932611465454
Steps: 1%|▏ | 13132/1000000 [20:50<1937:51:15, 7.07s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [13132], local_loss=1.0068145990371704, train_loss=0.19208204746246338, time_cost=1.2345881462097168
Steps: 1%|▏ | 13133/1000000 [20:58<1985:20:39, 7.24s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [13133], local_loss=0.026281308382749557, train_loss=0.06508874893188477, time_cost=5.409805774688721
Steps: 1%|▏ | 13134/1000000 [21:12<2558:27:55, 9.33s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [13134], local_loss=0.13868924975395203, train_loss=0.041370801627635956, time_cost=4.5665905475616455
Steps: 1%|▏ | 13135/1000000 [21:23<2643:46:07, 9.64s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [13135], local_loss=0.011522999033331871, train_loss=0.04047710821032524, time_cost=4.657576084136963
Steps: 1%|▏ | 13136/1000000 [21:28<2304:41:08, 8.41s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [13136], local_loss=0.028660595417022705, train_loss=0.037886470556259155, time_cost=4.084460973739624
Steps: 1%|▏ | 13137/1000000 [21:40<2559:12:52, 9.34s/it, lr=1e-5, step_loss=0.0287][RANK-0]: Step: [13137], local_loss=0.1074889600276947, train_loss=0.03941509500145912, time_cost=4.258029937744141
Steps: 1%|▏ | 13138/1000000 [21:53<2871:12:44, 10.47s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [13138], local_loss=0.038507185876369476, train_loss=0.061482373625040054, time_cost=2.1262359619140625
Steps: 1%|▏ | 13139/1000000 [22:03<2887:43:26, 10.53s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [13139], local_loss=0.01888616569340229, train_loss=0.03157810866832733, time_cost=4.634539604187012
Steps: 1%|▏ | 13140/1000000 [22:15<2951:18:24, 10.77s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [13140], local_loss=0.01614447869360447, train_loss=0.047176212072372437, time_cost=3.617816209793091
Steps: 1%|▏ | 13141/1000000 [22:19<2420:52:39, 8.83s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [13141], local_loss=0.018788618966937065, train_loss=0.021329745650291443, time_cost=1.873966932296753
Steps: 1%|▏ | 13142/1000000 [22:26<2267:48:18, 8.27s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [13142], local_loss=0.012427844107151031, train_loss=0.06315744668245316, time_cost=5.466761827468872
Steps: 1%|▏ | 13143/1000000 [22:31<1985:10:51, 7.24s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [13143], local_loss=0.1265893280506134, train_loss=0.05349583923816681, time_cost=1.8503401279449463
Steps: 1%|▏ | 13144/1000000 [22:37<1871:54:57, 6.83s/it, lr=1e-5, step_loss=0.127][RANK-0]: Step: [13144], local_loss=0.045149900019168854, train_loss=0.06810595095157623, time_cost=1.7451469898223877
Steps: 1%|▏ | 13145/1000000 [22:42<1753:44:09, 6.40s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [13145], local_loss=0.031243067234754562, train_loss=0.0517609566450119, time_cost=1.4542531967163086
Steps: 1%|▏ | 13146/1000000 [22:53<2157:53:17, 7.87s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [13146], local_loss=0.05994883552193642, train_loss=0.023839913308620453, time_cost=1.8006527423858643
Steps: 1%|▏ | 13147/1000000 [22:59<1947:55:27, 7.11s/it, lr=1e-5, step_loss=0.0599][RANK-0]: Step: [13147], local_loss=0.03550783544778824, train_loss=0.08677570521831512, time_cost=1.2746918201446533
Steps: 1%|▏ | 13148/1000000 [23:06<1931:55:13, 7.05s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [13148], local_loss=0.012929833494126797, train_loss=0.03608351945877075, time_cost=2.3490285873413086
Steps: 1%|▏ | 13149/1000000 [23:15<2095:06:18, 7.64s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [13149], local_loss=0.05941811949014664, train_loss=0.04758409783244133, time_cost=1.9761345386505127
Steps: 1%|▏ | 13150/1000000 [23:24<2227:57:08, 8.13s/it, lr=1e-5, step_loss=0.0594][RANK-0]: Step: [13150], local_loss=0.013666301034390926, train_loss=0.062167271971702576, time_cost=1.8359179496765137
Steps: 1%|▏ | 13151/1000000 [23:31<2167:40:15, 7.91s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [13151], local_loss=0.024904116988182068, train_loss=0.05628638714551926, time_cost=2.111520290374756
Steps: 1%|▏ | 13152/1000000 [23:36<1914:58:45, 6.99s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [13152], local_loss=0.055536869913339615, train_loss=0.04820302128791809, time_cost=1.9099736213684082
Steps: 1%|▏ | 13153/1000000 [23:43<1896:26:18, 6.92s/it, lr=1e-5, step_loss=0.0555][RANK-0]: Step: [13153], local_loss=0.009403069503605366, train_loss=0.03509426489472389, time_cost=1.778031587600708
Steps: 1%|▏ | 13154/1000000 [23:52<2045:28:51, 7.46s/it, lr=1e-5, step_loss=0.0094][RANK-0]: Step: [13154], local_loss=0.04079338535666466, train_loss=0.06180643290281296, time_cost=1.2301602363586426
Steps: 1%|▏ | 13155/1000000 [24:04<2457:59:22, 8.97s/it, lr=1e-5, step_loss=0.0408][RANK-0]: Step: [13155], local_loss=0.021748840808868408, train_loss=0.035820141434669495, time_cost=5.1499598026275635
Steps: 1%|▏ | 13156/1000000 [24:18<2842:53:12, 10.37s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [13156], local_loss=0.013484518975019455, train_loss=0.10290974378585815, time_cost=9.99199652671814
Steps: 1%|▏ | 13157/1000000 [24:27<2784:23:08, 10.16s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [13157], local_loss=0.04063837230205536, train_loss=0.03617580235004425, time_cost=1.5251717567443848
Steps: 1%|▏ | 13158/1000000 [24:38<2814:49:47, 10.27s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [13158], local_loss=0.012009682133793831, train_loss=0.043030496686697006, time_cost=5.737839937210083
Steps: 1%|▏ | 13159/1000000 [24:49<2870:52:16, 10.47s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [13159], local_loss=0.179998517036438, train_loss=0.08336964249610901, time_cost=3.203026294708252
Steps: 1%|▏ | 13160/1000000 [24:56<2627:37:19, 9.59s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [13160], local_loss=0.027663052082061768, train_loss=0.03352579474449158, time_cost=3.0615298748016357
Steps: 1%|▏ | 13161/1000000 [25:04<2470:00:05, 9.01s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [13161], local_loss=0.5575603246688843, train_loss=0.14774489402770996, time_cost=3.438587188720703
Steps: 1%|▏ | 13162/1000000 [25:20<3026:56:34, 11.04s/it, lr=1e-5, step_loss=0.558][RANK-0]: Step: [13162], local_loss=0.05174930766224861, train_loss=0.04845255985856056, time_cost=13.240673065185547
Steps: 1%|▏ | 13163/1000000 [25:27<2703:45:33, 9.86s/it, lr=1e-5, step_loss=0.0517][RANK-0]: Step: [13163], local_loss=0.06260128319263458, train_loss=0.027198350057005882, time_cost=1.2182166576385498
Steps: 1%|▏ | 13164/1000000 [25:40<2934:38:28, 10.71s/it, lr=1e-5, step_loss=0.0626][RANK-0]: Step: [13164], local_loss=0.014770268462598324, train_loss=0.026304520666599274, time_cost=2.7253105640411377
Steps: 1%|▏ | 13165/1000000 [25:45<2524:15:55, 9.21s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [13165], local_loss=0.016317496076226234, train_loss=0.058757103979587555, time_cost=1.7391879558563232
Steps: 1%|▏ | 13166/1000000 [25:51<2208:04:33, 8.06s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [13166], local_loss=0.35062646865844727, train_loss=0.16899043321609497, time_cost=2.8104989528656006
Steps: 1%|▏ | 13167/1000000 [26:00<2301:10:50, 8.39s/it, lr=1e-5, step_loss=0.351][RANK-0]: Step: [13167], local_loss=0.07780645787715912, train_loss=0.16930219531059265, time_cost=3.933316469192505
Steps: 1%|▏ | 13168/1000000 [26:05<2028:26:01, 7.40s/it, lr=1e-5, step_loss=0.0778][RANK-0]: Step: [13168], local_loss=0.02413816563785076, train_loss=0.16572228074073792, time_cost=1.8234851360321045
Steps: 1%|▏ | 13169/1000000 [26:12<2007:13:50, 7.32s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [13169], local_loss=0.04536370187997818, train_loss=0.0483078807592392, time_cost=2.708235502243042
Steps: 1%|▏ | 13170/1000000 [26:18<1859:03:22, 6.78s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [13170], local_loss=0.014442075043916702, train_loss=0.07222521305084229, time_cost=2.138774871826172
Steps: 1%|▏ | 13171/1000000 [26:23<1701:06:26, 6.21s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [13171], local_loss=0.035142526030540466, train_loss=0.05523782595992088, time_cost=1.9399337768554688
Steps: 1%|▏ | 13172/1000000 [26:27<1589:05:48, 5.80s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [13172], local_loss=0.08355949819087982, train_loss=0.08822311460971832, time_cost=1.6423592567443848
Steps: 1%|▏ | 13173/1000000 [26:32<1506:48:00, 5.50s/it, lr=1e-5, step_loss=0.0836][RANK-0]: Step: [13173], local_loss=0.026214230805635452, train_loss=0.026717063039541245, time_cost=1.773697853088379
Steps: 1%|▏ | 13174/1000000 [26:39<1626:00:43, 5.93s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [13174], local_loss=0.026615776121616364, train_loss=0.07085120677947998, time_cost=2.4955170154571533
Steps: 1%|▏ | 13175/1000000 [26:44<1547:05:13, 5.64s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [13175], local_loss=0.023811236023902893, train_loss=0.06780962646007538, time_cost=2.4630703926086426
Steps: 1%|▏ | 13176/1000000 [26:51<1664:45:33, 6.07s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [13176], local_loss=0.5746315717697144, train_loss=0.09480898827314377, time_cost=2.755544900894165
Steps: 1%|▏ | 13177/1000000 [27:01<1978:20:14, 7.22s/it, lr=1e-5, step_loss=0.575][RANK-0]: Step: [13177], local_loss=0.02878577448427677, train_loss=0.06572560220956802, time_cost=3.0436317920684814
Steps: 1%|▏ | 13178/1000000 [27:12<2257:04:37, 8.23s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [13178], local_loss=0.0211885217577219, train_loss=0.047985292971134186, time_cost=4.271620273590088
Steps: 1%|▏ | 13179/1000000 [27:27<2798:47:29, 10.21s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [13179], local_loss=0.0381661057472229, train_loss=0.04049187898635864, time_cost=5.897655963897705
Steps: 1%|▏ | 13180/1000000 [27:39<2997:11:42, 10.93s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [13180], local_loss=0.012414146214723587, train_loss=0.04437246546149254, time_cost=3.7502779960632324
Steps: 1%|▏ | 13181/1000000 [27:52<3138:59:12, 11.45s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [13181], local_loss=0.014756560325622559, train_loss=0.07469622045755386, time_cost=3.6149184703826904
Steps: 1%|▏ | 13182/1000000 [28:00<2832:59:30, 10.34s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [13182], local_loss=0.0814359039068222, train_loss=0.05149794742465019, time_cost=2.059143543243408
Steps: 1%|▏ | 13183/1000000 [28:09<2778:27:12, 10.14s/it, lr=1e-5, step_loss=0.0814][RANK-0]: Step: [13183], local_loss=0.028783895075321198, train_loss=0.03758067637681961, time_cost=1.236525535583496
Steps: 1%|▏ | 13184/1000000 [28:18<2654:25:37, 9.68s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [13184], local_loss=0.08533217012882233, train_loss=0.10311339795589447, time_cost=2.45756459236145
Steps: 1%|▏ | 13185/1000000 [28:29<2768:16:40, 10.10s/it, lr=1e-5, step_loss=0.0853][RANK-0]: Step: [13185], local_loss=0.014653196558356285, train_loss=0.027267461642622948, time_cost=1.57425856590271
Steps: 1%|▏ | 13186/1000000 [28:38<2666:02:15, 9.73s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [13186], local_loss=0.06270230561494827, train_loss=0.044516000896692276, time_cost=4.802984237670898
Steps: 1%|▏ | 13187/1000000 [28:43<2297:19:47, 8.38s/it, lr=1e-5, step_loss=0.0627][RANK-0]: Step: [13187], local_loss=0.05779623985290527, train_loss=0.035368531942367554, time_cost=2.5114808082580566
Steps: 1%|▏ | 13188/1000000 [28:48<2054:48:49, 7.50s/it, lr=1e-5, step_loss=0.0578][RANK-0]: Step: [13188], local_loss=0.018115097656846046, train_loss=0.05278296768665314, time_cost=3.312152624130249
Steps: 1%|▏ | 13189/1000000 [28:58<2184:23:41, 7.97s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [13189], local_loss=0.020780378952622414, train_loss=0.17965993285179138, time_cost=1.3796124458312988
Steps: 1%|▏ | 13190/1000000 [29:03<2008:27:26, 7.33s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [13190], local_loss=0.04102002829313278, train_loss=0.04811627417802811, time_cost=1.4678480625152588
Steps: 1%|▏ | 13191/1000000 [29:08<1780:37:29, 6.50s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [13191], local_loss=1.0074646472930908, train_loss=0.16876699030399323, time_cost=1.5589632987976074
Steps: 1%|▏ | 13192/1000000 [29:15<1817:17:53, 6.63s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [13192], local_loss=0.01777707040309906, train_loss=0.042525339871644974, time_cost=2.560573101043701
Steps: 1%|▏ | 13193/1000000 [29:24<1983:30:09, 7.24s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [13193], local_loss=0.1714099645614624, train_loss=0.07798004150390625, time_cost=3.142624616622925
Steps: 1%|▏ | 13194/1000000 [29:32<2066:18:53, 7.54s/it, lr=1e-5, step_loss=0.171][RANK-0]: Step: [13194], local_loss=0.06351673603057861, train_loss=0.18569988012313843, time_cost=3.3390960693359375
Steps: 1%|▏ | 13195/1000000 [29:41<2226:03:20, 8.12s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [13195], local_loss=0.04240182787179947, train_loss=0.04030241817235947, time_cost=1.2375669479370117
Steps: 1%|▏ | 13196/1000000 [29:56<2743:48:53, 10.01s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [13196], local_loss=0.22454792261123657, train_loss=0.0648391842842102, time_cost=5.882969617843628
Steps: 1%|▏ | 13197/1000000 [30:06<2769:33:11, 10.10s/it, lr=1e-5, step_loss=0.225][RANK-0]: Step: [13197], local_loss=0.0547226220369339, train_loss=0.044726498425006866, time_cost=4.697136878967285
Steps: 1%|▏ | 13198/1000000 [30:14<2607:03:40, 9.51s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [13198], local_loss=0.08305056393146515, train_loss=0.15255725383758545, time_cost=3.922651767730713
Steps: 1%|▏ | 13199/1000000 [30:20<2293:30:29, 8.37s/it, lr=1e-5, step_loss=0.0831][RANK-0]: Step: [13199], local_loss=0.06236198544502258, train_loss=0.03913804143667221, time_cost=2.3457438945770264
Steps: 1%|▏ | 13200/1000000 [30:26<2136:56:04, 7.80s/it, lr=1e-5, step_loss=0.0624][RANK-0]: Step: [13200], local_loss=0.0108203599229455, train_loss=0.08954431116580963, time_cost=2.2190535068511963
Steps: 1%|▏ | 13201/1000000 [30:39<2506:38:49, 9.14s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [13201], local_loss=0.33779412508010864, train_loss=0.07983358949422836, time_cost=3.076204538345337
Steps: 1%|▏ | 13202/1000000 [30:51<2819:41:43, 10.29s/it, lr=1e-5, step_loss=0.338][RANK-0]: Step: [13202], local_loss=0.025400204584002495, train_loss=0.02904755435883999, time_cost=11.22696042060852
Steps: 1%|▏ | 13203/1000000 [31:06<3170:09:05, 11.57s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [13203], local_loss=0.021866198629140854, train_loss=0.034987498074769974, time_cost=5.48032808303833
Steps: 1%|▏ | 13204/1000000 [31:19<3288:50:51, 12.00s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [13204], local_loss=0.051197804510593414, train_loss=0.1069478690624237, time_cost=1.2041220664978027
Steps: 1%|▏ | 13205/1000000 [31:26<2899:42:57, 10.58s/it, lr=1e-5, step_loss=0.0512][RANK-0]: Step: [13205], local_loss=0.025320161134004593, train_loss=0.05392134189605713, time_cost=1.2643849849700928
Steps: 1%|▏ | 13206/1000000 [31:32<2515:01:13, 9.18s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [13206], local_loss=0.009441281668841839, train_loss=0.07628699392080307, time_cost=1.5151610374450684
Steps: 1%|▏ | 13207/1000000 [31:43<2637:57:20, 9.62s/it, lr=1e-5, step_loss=0.00944][RANK-0]: Step: [13207], local_loss=1.0126692056655884, train_loss=0.24244432151317596, time_cost=1.2501001358032227
Steps: 1%|▏ | 13208/1000000 [31:55<2849:37:52, 10.40s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [13208], local_loss=0.06026504561305046, train_loss=0.029073406010866165, time_cost=5.167768239974976
Steps: 1%|▏ | 13209/1000000 [32:00<2415:49:40, 8.81s/it, lr=1e-5, step_loss=0.0603][RANK-0]: Step: [13209], local_loss=0.029952233657240868, train_loss=0.04244694113731384, time_cost=1.2265324592590332
Steps: 1%|▏ | 13210/1000000 [32:17<3033:55:27, 11.07s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [13210], local_loss=0.10210392624139786, train_loss=0.17225544154644012, time_cost=7.482334136962891
Steps: 1%|▏ | 13211/1000000 [32:27<3019:23:11, 11.02s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [13211], local_loss=0.11307652294635773, train_loss=0.041888732463121414, time_cost=3.252293109893799
Steps: 1%|▏ | 13212/1000000 [32:38<3007:55:34, 10.97s/it, lr=1e-5, step_loss=0.113][RANK-0]: Step: [13212], local_loss=0.03579035401344299, train_loss=0.05269023776054382, time_cost=4.050771951675415
Steps: 1%|▏ | 13213/1000000 [32:50<3032:25:21, 11.06s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [13213], local_loss=0.0388774573802948, train_loss=0.039823777973651886, time_cost=1.2368927001953125
Steps: 1%|▏ | 13214/1000000 [32:55<2603:38:50, 9.50s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [13214], local_loss=0.022237857803702354, train_loss=0.042536478489637375, time_cost=1.5613021850585938
Steps: 1%|▏ | 13215/1000000 [33:06<2718:30:11, 9.92s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [13215], local_loss=0.052725207060575485, train_loss=0.11891829967498779, time_cost=3.307107925415039
Steps: 1%|▏ | 13216/1000000 [33:11<2269:12:03, 8.28s/it, lr=1e-5, step_loss=0.0527][RANK-0]: Step: [13216], local_loss=0.18011213839054108, train_loss=0.050204865634441376, time_cost=1.946136236190796
Steps: 1%|▏ | 13217/1000000 [33:18<2199:34:40, 8.02s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [13217], local_loss=0.01586761884391308, train_loss=0.07587067782878876, time_cost=1.5612397193908691
Steps: 1%|▏ | 13218/1000000 [33:29<2412:17:54, 8.80s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [13218], local_loss=0.026785828173160553, train_loss=0.021971937268972397, time_cost=4.026827573776245
Steps: 1%|▏ | 13219/1000000 [33:42<2789:59:37, 10.18s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [13219], local_loss=0.023232780396938324, train_loss=0.027279717847704887, time_cost=4.56596565246582
Steps: 1%|▏ | 13220/1000000 [33:54<2886:19:23, 10.53s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [13220], local_loss=0.08396926522254944, train_loss=0.041984569281339645, time_cost=1.9785830974578857
Steps: 1%|▏ | 13221/1000000 [34:02<2749:30:50, 10.03s/it, lr=1e-5, step_loss=0.084][RANK-0]: Step: [13221], local_loss=0.01659863442182541, train_loss=0.2789533734321594, time_cost=3.7492728233337402
Steps: 1%|▏ | 13222/1000000 [34:13<2805:32:45, 10.24s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [13222], local_loss=0.06205308809876442, train_loss=0.044160500168800354, time_cost=5.844446182250977
Steps: 1%|▏ | 13223/1000000 [34:19<2442:25:29, 8.91s/it, lr=1e-5, step_loss=0.0621][RANK-0]: Step: [13223], local_loss=0.022355198860168457, train_loss=0.1225549578666687, time_cost=3.1095025539398193
Steps: 1%|▏ | 13224/1000000 [34:35<2994:42:48, 10.93s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [13224], local_loss=0.03297046571969986, train_loss=12.8208646774292, time_cost=8.014299869537354
Steps: 1%|▏ | 13225/1000000 [34:41<2643:27:16, 9.64s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [13225], local_loss=0.039107177406549454, train_loss=0.030412090942263603, time_cost=2.2182440757751465
Steps: 1%|▏ | 13226/1000000 [34:46<2208:29:29, 8.06s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [13226], local_loss=0.02415456436574459, train_loss=0.10672350972890854, time_cost=1.3024578094482422
Steps: 1%|▏ | 13227/1000000 [34:56<2374:15:23, 8.66s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [13227], local_loss=0.04886593297123909, train_loss=0.04102000966668129, time_cost=5.149754762649536
/home/image_data/hxy/Open-Sora-Plan/opensora/utils/utils.py:369: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
  caption = BeautifulSoup(caption, features='html.parser').text
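The warning above is emitted by the caption-cleaning call at opensora/utils/utils.py:369 whenever a caption string merely resembles a bare filename rather than HTML. A minimal sketch of how that call could keep its tag-stripping behavior while silencing the heuristic warning; the clean_caption wrapper is illustrative, not the repo's actual code:

    import warnings

    from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

    def clean_caption(caption: str) -> str:
        # bs4 warns when the input only looks like a filename/URL; the parse
        # itself is harmless and still strips any real markup in the caption.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', MarkupResemblesLocatorWarning)
            return BeautifulSoup(caption, features='html.parser').text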
Steps: 1%|▏ | 13228/1000000 [35:10<2879:07:04, 10.50s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [13228], local_loss=0.029158135876059532, train_loss=0.043564170598983765, time_cost=7.219979763031006
Steps: 1%|▏ | 13229/1000000 [35:24<3094:31:09, 11.29s/it, lr=1e-5, step_loss=0.0292][RANK-0]: Step: [13229], local_loss=0.012530694715678692, train_loss=0.07269572466611862, time_cost=1.20662260055542
Steps: 1%|▏ | 13230/1000000 [35:30<2655:25:37, 9.69s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [13230], local_loss=0.09910750389099121, train_loss=0.035583704710006714, time_cost=1.6438374519348145
Steps: 1%|▏ | 13231/1000000 [35:37<2490:51:41, 9.09s/it, lr=1e-5, step_loss=0.0991][RANK-0]: Step: [13231], local_loss=0.028180783614516258, train_loss=0.05604290962219238, time_cost=1.2158377170562744
Steps: 1%|▏ | 13232/1000000 [35:51<2837:25:13, 10.35s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [13232], local_loss=0.12223666906356812, train_loss=0.031778931617736816, time_cost=4.323913812637329
Steps: 1%|▏ | 13233/1000000 [36:03<3036:24:55, 11.08s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [13233], local_loss=0.04655452072620392, train_loss=0.060358963906764984, time_cost=3.3398211002349854
Steps: 1%|▏ | 13234/1000000 [36:10<2657:03:51, 9.69s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [13234], local_loss=0.1595897376537323, train_loss=0.0779021829366684, time_cost=4.944026470184326
Steps: 1%|▏ | 13235/1000000 [36:21<2811:58:28, 10.26s/it, lr=1e-5, step_loss=0.16][RANK-0]: Step: [13235], local_loss=0.07774230092763901, train_loss=0.08970507979393005, time_cost=3.870039939880371
Steps: 1%|▏ | 13236/1000000 [36:28<2475:28:05, 9.03s/it, lr=1e-5, step_loss=0.0777][RANK-0]: Step: [13236], local_loss=0.04048386588692665, train_loss=0.029368672519922256, time_cost=1.55411958694458
Steps: 1%|▏ | 13237/1000000 [36:34<2295:26:05, 8.37s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [13237], local_loss=0.06867463886737823, train_loss=0.07370585203170776, time_cost=1.916755199432373
Steps: 1%|▏ | 13238/1000000 [36:42<2198:22:42, 8.02s/it, lr=1e-5, step_loss=0.0687][RANK-0]: Step: [13238], local_loss=0.03526769578456879, train_loss=0.14609640836715698, time_cost=2.5113868713378906
Steps: 1%|▏ | 13239/1000000 [36:53<2501:01:45, 9.12s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [13239], local_loss=0.039242666214704514, train_loss=0.03837670385837555, time_cost=1.303511619567871
Steps: 1%|▏ | 13240/1000000 [36:58<2178:24:53, 7.95s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [13240], local_loss=0.023738034069538116, train_loss=0.030270591378211975, time_cost=2.472184181213379
Steps: 1%|▏ | 13241/1000000 [37:04<2020:50:16, 7.37s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [13241], local_loss=0.02118365466594696, train_loss=0.049884915351867676, time_cost=1.3674225807189941
Steps: 1%|▏ | 13242/1000000 [37:10<1895:41:00, 6.92s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [13242], local_loss=0.01706778258085251, train_loss=0.0384918674826622, time_cost=1.4158086776733398
Steps: 1%|▏ | 13243/1000000 [37:24<2437:49:15, 8.89s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [13243], local_loss=0.035617999732494354, train_loss=0.0522271990776062, time_cost=1.402034044265747
Steps: 1%|▏ | 13244/1000000 [37:35<2654:30:52, 9.68s/it, lr=1e-5, step_loss=0.0356][RANK-0]: Step: [13244], local_loss=0.03445132449269295, train_loss=0.06343646347522736, time_cost=1.4917008876800537
Steps: 1%|▏ | 13245/1000000 [37:45<2615:52:52, 9.54s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [13245], local_loss=0.017564380541443825, train_loss=0.026506688445806503, time_cost=2.272190809249878
Steps: 1%|▏ | 13246/1000000 [37:56<2755:10:59, 10.05s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [13246], local_loss=0.056560490280389786, train_loss=0.09615917503833771, time_cost=1.2406349182128906
Steps: 1%|▏ | 13247/1000000 [38:05<2685:47:57, 9.80s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [13247], local_loss=0.018830012530088425, train_loss=0.08555755019187927, time_cost=3.3143081665039062
Steps: 1%|▏ | 13248/1000000 [38:14<2641:11:12, 9.64s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [13248], local_loss=0.05266211926937103, train_loss=0.06670302152633667, time_cost=1.2920453548431396
Steps: 1%|▏ | 13249/1000000 [38:19<2232:59:51, 8.15s/it, lr=1e-5, step_loss=0.0527][RANK-0]: Step: [13249], local_loss=0.09989000111818314, train_loss=0.0841955840587616, time_cost=1.7803802490234375
Steps: 1%|▏ | 13250/1000000 [38:27<2228:44:24, 8.13s/it, lr=1e-5, step_loss=0.0999][RANK-0]: Step: [13250], local_loss=0.03720015287399292, train_loss=0.053516972810029984, time_cost=6.744050741195679
Steps: 1%|▏ | 13251/1000000 [38:32<1976:44:16, 7.21s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [13251], local_loss=0.07961182296276093, train_loss=0.053351230919361115, time_cost=2.384568452835083
Steps: 1%|▏ | 13252/1000000 [38:43<2252:12:12, 8.22s/it, lr=1e-5, step_loss=0.0796][RANK-0]: Step: [13252], local_loss=0.015796620398759842, train_loss=0.17418231070041656, time_cost=2.382411241531372
Steps: 1%|▏ | 13253/1000000 [38:52<2305:01:30, 8.41s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [13253], local_loss=0.07458055764436722, train_loss=0.07889203727245331, time_cost=1.7724196910858154
Steps: 1%|▏ | 13254/1000000 [38:56<1962:17:49, 7.16s/it, lr=1e-5, step_loss=0.0746][RANK-0]: Step: [13254], local_loss=0.025701012462377548, train_loss=0.07602238655090332, time_cost=1.8163414001464844
Steps: 1%|▏ | 13255/1000000 [39:12<2676:49:30, 9.77s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [13255], local_loss=0.027635402977466583, train_loss=0.058616966009140015, time_cost=7.431949615478516
Steps: 1%|▏ | 13256/1000000 [39:16<2230:01:00, 8.14s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [13256], local_loss=0.020394328981637955, train_loss=0.021112142130732536, time_cost=1.262434959411621
Steps: 1%|▏ | 13257/1000000 [39:21<1971:11:52, 7.19s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [13257], local_loss=0.012350838631391525, train_loss=0.039548859000205994, time_cost=2.54235577583313
Steps: 1%|▏ | 13258/1000000 [39:35<2496:27:31, 9.11s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [13258], local_loss=0.020094793289899826, train_loss=0.07480413466691971, time_cost=1.309025526046753
Steps: 1%|▏ | 13259/1000000 [39:42<2357:06:28, 8.60s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [13259], local_loss=0.02606811374425888, train_loss=0.027547543868422508, time_cost=1.2466356754302979
Steps: 1%|▏ | 13260/1000000 [39:48<2133:44:29, 7.78s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [13260], local_loss=0.05412370339035988, train_loss=0.08327548950910568, time_cost=1.4100117683410645
Steps: 1%|▏ | 13261/1000000 [39:55<2074:42:38, 7.57s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [13261], local_loss=0.21133683621883392, train_loss=0.1596008688211441, time_cost=2.9798543453216553
Steps: 1%|▏ | 13262/1000000 [40:09<2640:59:46, 9.64s/it, lr=1e-5, step_loss=0.211][RANK-0]: Step: [13262], local_loss=0.017359454184770584, train_loss=0.03434324264526367, time_cost=6.912027359008789
Steps: 1%|▏ | 13263/1000000 [40:25<3155:03:49, 11.51s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [13263], local_loss=0.014002986252307892, train_loss=0.027342714369297028, time_cost=7.501801490783691
Steps: 1%|▏ | 13264/1000000 [40:35<3010:08:52, 10.98s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [13264], local_loss=0.02626928873360157, train_loss=0.05778592452406883, time_cost=1.9225330352783203
Steps: 1%|▏ | 13265/1000000 [40:46<2972:30:17, 10.84s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [13265], local_loss=0.025640571489930153, train_loss=0.02667124569416046, time_cost=1.2913858890533447
Steps: 1%|▏ | 13266/1000000 [40:51<2547:07:13, 9.29s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [13266], local_loss=0.016864506527781487, train_loss=0.11388338357210159, time_cost=3.244433879852295
Steps: 1%|▏ | 13267/1000000 [40:58<2366:47:19, 8.63s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [13267], local_loss=0.04119095206260681, train_loss=0.046285152435302734, time_cost=1.2298507690429688
+
Steps: 1%|▏ | 13267/1000000 [40:58<2366:47:19, 8.63s/it, lr=1e-5, step_loss=0.0412]
Steps: 1%|▏ | 13268/1000000 [41:12<2767:51:23, 10.10s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [13268], local_loss=0.01919611170887947, train_loss=0.035679690539836884, time_cost=1.2211289405822754
+
Steps: 1%|▏ | 13268/1000000 [41:12<2767:51:23, 10.10s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%|▏ | 13269/1000000 [41:22<2774:59:08, 10.12s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [13269], local_loss=0.011227262206375599, train_loss=0.05267580598592758, time_cost=3.1360268592834473
+
Steps: 1%|▏ | 13269/1000000 [41:22<2774:59:08, 10.12s/it, lr=1e-5, step_loss=0.0112]
Steps: 1%|▏ | 13270/1000000 [41:28<2396:19:30, 8.74s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [13270], local_loss=0.02328656241297722, train_loss=0.04988039284944534, time_cost=2.0436277389526367
+
Steps: 1%|▏ | 13270/1000000 [41:28<2396:19:30, 8.74s/it, lr=1e-5, step_loss=0.0233]
Steps: 1%|▏ | 13271/1000000 [41:45<3111:35:21, 11.35s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [13271], local_loss=0.014960705302655697, train_loss=0.04837513715028763, time_cost=8.12609052658081
+
Steps: 1%|▏ | 13271/1000000 [41:45<3111:35:21, 11.35s/it, lr=1e-5, step_loss=0.015]
Steps: 1%|▏ | 13272/1000000 [41:50<2601:40:01, 9.49s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [13272], local_loss=0.01692132279276848, train_loss=0.03109065815806389, time_cost=1.401900291442871
+
Steps: 1%|▏ | 13272/1000000 [41:50<2601:40:01, 9.49s/it, lr=1e-5, step_loss=0.0169]
Steps: 1%|▏ | 13273/1000000 [41:56<2271:15:26, 8.29s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [13273], local_loss=0.017250914126634598, train_loss=0.0342203751206398, time_cost=2.540235757827759
+
Steps: 1%|▏ | 13273/1000000 [41:56<2271:15:26, 8.29s/it, lr=1e-5, step_loss=0.0173]
Steps: 1%|▏ | 13274/1000000 [42:10<2745:50:40, 10.02s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [13274], local_loss=0.017631491646170616, train_loss=0.052391327917575836, time_cost=1.3311412334442139
+
Steps: 1%|▏ | 13274/1000000 [42:10<2745:50:40, 10.02s/it, lr=1e-5, step_loss=0.0176]
Steps: 1%|▏ | 13275/1000000 [42:26<3288:28:20, 12.00s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [13275], local_loss=0.015629563480615616, train_loss=0.18927693367004395, time_cost=8.730788469314575
+
Steps: 1%|▏ | 13275/1000000 [42:26<3288:28:20, 12.00s/it, lr=1e-5, step_loss=0.0156]
Steps: 1%|▏ | 13276/1000000 [42:38<3304:47:01, 12.06s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [13276], local_loss=0.012111738324165344, train_loss=0.07306176424026489, time_cost=2.4006383419036865
+
Steps: 1%|▏ | 13276/1000000 [42:38<3304:47:01, 12.06s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%|▏ | 13277/1000000 [42:44<2797:50:13, 10.21s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [13277], local_loss=0.02809874340891838, train_loss=0.05806947126984596, time_cost=1.7300746440887451
+
Steps: 1%|▏ | 13277/1000000 [42:44<2797:50:13, 10.21s/it, lr=1e-5, step_loss=0.0281]
Steps: 1%|▏ | 13278/1000000 [42:55<2828:11:13, 10.32s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [13278], local_loss=0.10846161097288132, train_loss=0.05084802582859993, time_cost=2.926593065261841
+
Steps: 1%|▏ | 13278/1000000 [42:55<2828:11:13, 10.32s/it, lr=1e-5, step_loss=0.108]
Steps: 1%|▏ | 13279/1000000 [43:00<2395:23:06, 8.74s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [13279], local_loss=0.04860755428671837, train_loss=0.08207790553569794, time_cost=1.291182518005371
+
Steps: 1%|▏ | 13279/1000000 [43:00<2395:23:06, 8.74s/it, lr=1e-5, step_loss=0.0486]
Steps: 1%|▏ | 13280/1000000 [43:09<2404:56:30, 8.77s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [13280], local_loss=0.04584875330328941, train_loss=0.04918181896209717, time_cost=1.3648948669433594
+
Steps: 1%|▏ | 13280/1000000 [43:09<2404:56:30, 8.77s/it, lr=1e-5, step_loss=0.0458]
Steps: 1%|▏ | 13281/1000000 [43:14<2125:05:53, 7.75s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [13281], local_loss=0.013278250582516193, train_loss=0.06731817126274109, time_cost=1.6505706310272217
+
Steps: 1%|▏ | 13281/1000000 [43:14<2125:05:53, 7.75s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%|▏ | 13282/1000000 [43:19<1911:32:59, 6.97s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [13282], local_loss=0.03920094668865204, train_loss=0.06761215627193451, time_cost=2.098390579223633
+
Steps: 1%|▏ | 13282/1000000 [43:19<1911:32:59, 6.97s/it, lr=1e-5, step_loss=0.0392]
Steps: 1%|▏ | 13283/1000000 [43:27<1974:33:52, 7.20s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [13283], local_loss=0.019182443618774414, train_loss=0.06579490005970001, time_cost=4.646694898605347
+
Steps: 1%|▏ | 13283/1000000 [43:27<1974:33:52, 7.20s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%|▏ | 13284/1000000 [43:42<2625:44:02, 9.58s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [13284], local_loss=0.043693944811820984, train_loss=0.16536925733089447, time_cost=11.187982082366943
+
Steps: 1%|▏ | 13284/1000000 [43:42<2625:44:02, 9.58s/it, lr=1e-5, step_loss=0.0437]
Steps: 1%|▏ | 13285/1000000 [43:50<2472:27:43, 9.02s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [13285], local_loss=0.06285645067691803, train_loss=0.03354477882385254, time_cost=3.6338367462158203
+
Steps: 1%|▏ | 13285/1000000 [43:50<2472:27:43, 9.02s/it, lr=1e-5, step_loss=0.0629]
Steps: 1%|▏ | 13286/1000000 [43:56<2208:07:58, 8.06s/it, lr=1e-5, step_loss=0.0629][RANK-0]: Step: [13286], local_loss=0.027101946994662285, train_loss=0.04167298600077629, time_cost=4.673206329345703
+
Steps: 1%|▏ | 13286/1000000 [43:56<2208:07:58, 8.06s/it, lr=1e-5, step_loss=0.0271]
Steps: 1%|▏ | 13287/1000000 [44:07<2476:07:04, 9.03s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [13287], local_loss=0.05175875872373581, train_loss=0.07403577119112015, time_cost=2.082552433013916
+
Steps: 1%|▏ | 13287/1000000 [44:07<2476:07:04, 9.03s/it, lr=1e-5, step_loss=0.0518]
Steps: 1%|▏ | 13288/1000000 [44:12<2159:05:29, 7.88s/it, lr=1e-5, step_loss=0.0518][RANK-0]: Step: [13288], local_loss=0.010506227612495422, train_loss=0.023325607180595398, time_cost=2.1480839252471924
+
Steps: 1%|▏ | 13288/1000000 [44:12<2159:05:29, 7.88s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%|▏ | 13289/1000000 [44:20<2118:40:20, 7.73s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [13289], local_loss=0.01630805805325508, train_loss=0.1064055860042572, time_cost=1.2874903678894043
+
Steps: 1%|▏ | 13289/1000000 [44:20<2118:40:20, 7.73s/it, lr=1e-5, step_loss=0.0163]
Steps: 1%|▏ | 13290/1000000 [44:33<2618:07:08, 9.55s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [13290], local_loss=0.05307132750749588, train_loss=0.020687682554125786, time_cost=1.232146978378296
+
Steps: 1%|▏ | 13290/1000000 [44:33<2618:07:08, 9.55s/it, lr=1e-5, step_loss=0.0531]
Steps: 1%|▏ | 13291/1000000 [44:39<2251:27:25, 8.21s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [13291], local_loss=0.022184807807207108, train_loss=0.047195594757795334, time_cost=1.2565314769744873
+
Steps: 1%|▏ | 13291/1000000 [44:39<2251:27:25, 8.21s/it, lr=1e-5, step_loss=0.0222]
Steps: 1%|▏ | 13292/1000000 [44:46<2176:48:26, 7.94s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [13292], local_loss=0.8545049428939819, train_loss=0.1453384906053543, time_cost=2.8409368991851807
+
Steps: 1%|▏ | 13292/1000000 [44:46<2176:48:26, 7.94s/it, lr=1e-5, step_loss=0.855]
Steps: 1%|▏ | 13293/1000000 [44:57<2447:03:38, 8.93s/it, lr=1e-5, step_loss=0.855][RANK-0]: Step: [13293], local_loss=0.0399441123008728, train_loss=0.10846521705389023, time_cost=2.4090678691864014
+
Steps: 1%|▏ | 13293/1000000 [44:57<2447:03:38, 8.93s/it, lr=1e-5, step_loss=0.0399]
Steps: 1%|▏ | 13294/1000000 [45:11<2833:36:56, 10.34s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [13294], local_loss=0.037796273827552795, train_loss=0.02963564731180668, time_cost=6.257978439331055
+
Steps: 1%|▏ | 13294/1000000 [45:11<2833:36:56, 10.34s/it, lr=1e-5, step_loss=0.0378]
Steps: 1%|▏ | 13295/1000000 [45:15<2340:30:37, 8.54s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [13295], local_loss=0.015184439718723297, train_loss=0.03379826992750168, time_cost=1.7011020183563232
+
Steps: 1%|▏ | 13295/1000000 [45:15<2340:30:37, 8.54s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%|▏ | 13296/1000000 [45:26<2569:56:18, 9.38s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [13296], local_loss=0.49836429953575134, train_loss=0.1246710941195488, time_cost=3.3236849308013916
+
Steps: 1%|▏ | 13296/1000000 [45:26<2569:56:18, 9.38s/it, lr=1e-5, step_loss=0.498]
Steps: 1%|▏ | 13297/1000000 [45:36<2627:00:11, 9.58s/it, lr=1e-5, step_loss=0.498][RANK-0]: Step: [13297], local_loss=0.10053510963916779, train_loss=0.06217005476355553, time_cost=4.1290624141693115
+
Steps: 1%|▏ | 13297/1000000 [45:36<2627:00:11, 9.58s/it, lr=1e-5, step_loss=0.101]
Steps: 1%|▏ | 13298/1000000 [45:41<2216:03:35, 8.09s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [13298], local_loss=0.03079104796051979, train_loss=0.027677038684487343, time_cost=1.5999464988708496
+
Steps: 1%|▏ | 13298/1000000 [45:41<2216:03:35, 8.09s/it, lr=1e-5, step_loss=0.0308]
Steps: 1%|▏ | 13299/1000000 [45:49<2174:25:35, 7.93s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [13299], local_loss=0.020261496305465698, train_loss=0.12389448285102844, time_cost=1.4455230236053467
+
Steps: 1%|▏ | 13299/1000000 [45:49<2174:25:35, 7.93s/it, lr=1e-5, step_loss=0.0203]
Steps: 1%|▏ | 13300/1000000 [45:59<2382:54:47, 8.69s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [13300], local_loss=0.020043572410941124, train_loss=0.023878712207078934, time_cost=1.8857581615447998
+
Steps: 1%|▏ | 13300/1000000 [45:59<2382:54:47, 8.69s/it, lr=1e-5, step_loss=0.02]
Steps: 1%|▏ | 13301/1000000 [46:10<2601:41:16, 9.49s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [13301], local_loss=0.014194154180586338, train_loss=0.059609610587358475, time_cost=3.2576045989990234
+
Steps: 1%|▏ | 13301/1000000 [46:10<2601:41:16, 9.49s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%|▏ | 13302/1000000 [46:16<2256:59:26, 8.23s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [13302], local_loss=0.0134323975071311, train_loss=19.1467342376709, time_cost=2.338552474975586
+
Steps: 1%|▏ | 13302/1000000 [46:16<2256:59:26, 8.23s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%|▏ | 13303/1000000 [46:22<2099:56:34, 7.66s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [13303], local_loss=0.02480911836028099, train_loss=0.03662927821278572, time_cost=5.305424451828003
+
Steps: 1%|▏ | 13303/1000000 [46:22<2099:56:34, 7.66s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%|▏ | 13304/1000000 [46:27<1876:47:50, 6.85s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [13304], local_loss=0.08034798502922058, train_loss=0.03600595146417618, time_cost=1.650475025177002
+
Steps: 1%|▏ | 13304/1000000 [46:27<1876:47:50, 6.85s/it, lr=1e-5, step_loss=0.0803]
Steps: 1%|▏ | 13305/1000000 [46:38<2224:46:47, 8.12s/it, lr=1e-5, step_loss=0.0803][RANK-0]: Step: [13305], local_loss=0.097853884100914, train_loss=0.1932350993156433, time_cost=1.2445790767669678
+
Steps: 1%|▏ | 13305/1000000 [46:38<2224:46:47, 8.12s/it, lr=1e-5, step_loss=0.0979]
Steps: 1%|▏ | 13306/1000000 [46:43<1989:01:02, 7.26s/it, lr=1e-5, step_loss=0.0979][RANK-0]: Step: [13306], local_loss=0.07187826931476593, train_loss=0.06907309591770172, time_cost=2.2842419147491455
+
Steps: 1%|▏ | 13306/1000000 [46:43<1989:01:02, 7.26s/it, lr=1e-5, step_loss=0.0719]
Steps: 1%|▏ | 13307/1000000 [46:54<2309:16:10, 8.43s/it, lr=1e-5, step_loss=0.0719][RANK-0]: Step: [13307], local_loss=281.74884033203125, train_loss=35.26844787597656, time_cost=1.2416374683380127
+
Steps: 1%|▏ | 13307/1000000 [46:54<2309:16:10, 8.43s/it, lr=1e-5, step_loss=282]
Steps: 1%|▏ | 13308/1000000 [47:07<2657:53:21, 9.70s/it, lr=1e-5, step_loss=282][RANK-0]: Step: [13308], local_loss=0.026358909904956818, train_loss=0.03124437853693962, time_cost=1.2247307300567627
+
Steps: 1%|▏ | 13308/1000000 [47:07<2657:53:21, 9.70s/it, lr=1e-5, step_loss=0.0264]
Steps: 1%|▏ | 13309/1000000 [47:13<2335:07:26, 8.52s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [13309], local_loss=170.4026641845703, train_loss=21.327056884765625, time_cost=1.4817988872528076
+
Steps: 1%|▏ | 13309/1000000 [47:13<2335:07:26, 8.52s/it, lr=1e-5, step_loss=170]
Steps: 1%|▏ | 13310/1000000 [47:24<2572:51:10, 9.39s/it, lr=1e-5, step_loss=170][RANK-0]: Step: [13310], local_loss=0.04473113268613815, train_loss=0.04614754393696785, time_cost=7.6263861656188965
+
Steps: 1%|▏ | 13310/1000000 [47:24<2572:51:10, 9.39s/it, lr=1e-5, step_loss=0.0447]
Steps: 1%|▏ | 13311/1000000 [47:33<2539:45:48, 9.27s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [13311], local_loss=0.03925153613090515, train_loss=0.03421158343553543, time_cost=1.801577091217041
+
Steps: 1%|▏ | 13311/1000000 [47:33<2539:45:48, 9.27s/it, lr=1e-5, step_loss=0.0393]
Steps: 1%|▏ | 13312/1000000 [47:45<2715:54:34, 9.91s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [13312], local_loss=0.006488011684268713, train_loss=0.08091143518686295, time_cost=2.829402208328247
+
Steps: 1%|▏ | 13312/1000000 [47:45<2715:54:34, 9.91s/it, lr=1e-5, step_loss=0.00649]
Steps: 1%|▏ | 13313/1000000 [47:50<2316:08:52, 8.45s/it, lr=1e-5, step_loss=0.00649][RANK-0]: Step: [13313], local_loss=0.005220478400588036, train_loss=0.0647137388586998, time_cost=2.2020199298858643
+
Steps: 1%|▏ | 13313/1000000 [47:50<2316:08:52, 8.45s/it, lr=1e-5, step_loss=0.00522]
Steps: 1%|▏ | 13314/1000000 [47:54<1973:57:31, 7.20s/it, lr=1e-5, step_loss=0.00522][RANK-0]: Step: [13314], local_loss=0.015673378482460976, train_loss=0.04919004067778587, time_cost=1.5104775428771973
+
Steps: 1%|▏ | 13314/1000000 [47:54<1973:57:31, 7.20s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%|▏ | 13315/1000000 [48:08<2552:43:28, 9.31s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [13315], local_loss=0.027240900322794914, train_loss=0.03357582539319992, time_cost=5.846371650695801
+
Steps: 1%|▏ | 13315/1000000 [48:08<2552:43:28, 9.31s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%|▏ | 13316/1000000 [48:17<2535:20:00, 9.25s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [13316], local_loss=0.009674720466136932, train_loss=0.022135213017463684, time_cost=1.2201831340789795
+
Steps: 1%|▏ | 13316/1000000 [48:17<2535:20:00, 9.25s/it, lr=1e-5, step_loss=0.00967]
Steps: 1%|▏ | 13317/1000000 [48:24<2341:43:02, 8.54s/it, lr=1e-5, step_loss=0.00967][RANK-0]: Step: [13317], local_loss=0.06236722320318222, train_loss=0.151855930685997, time_cost=1.2316319942474365
+
Steps: 1%|▏ | 13317/1000000 [48:24<2341:43:02, 8.54s/it, lr=1e-5, step_loss=0.0624]
Steps: 1%|▏ | 13318/1000000 [48:33<2367:05:31, 8.64s/it, lr=1e-5, step_loss=0.0624][RANK-0]: Step: [13318], local_loss=0.009712663479149342, train_loss=0.05003098398447037, time_cost=2.5188722610473633
+
Steps: 1%|▏ | 13318/1000000 [48:33<2367:05:31, 8.64s/it, lr=1e-5, step_loss=0.00971]
Steps: 1%|▏ | 13319/1000000 [48:44<2585:36:42, 9.43s/it, lr=1e-5, step_loss=0.00971][RANK-0]: Step: [13319], local_loss=0.028636811301112175, train_loss=0.10119649767875671, time_cost=3.1316163539886475
+
Steps: 1%|▏ | 13319/1000000 [48:44<2585:36:42, 9.43s/it, lr=1e-5, step_loss=0.0286]
Steps: 1%|▏ | 13320/1000000 [48:53<2508:49:33, 9.15s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [13320], local_loss=0.6056174039840698, train_loss=0.11884045600891113, time_cost=2.69834041595459
+
Steps: 1%|▏ | 13320/1000000 [48:53<2508:49:33, 9.15s/it, lr=1e-5, step_loss=0.606]
Steps: 1%|▏ | 13321/1000000 [49:04<2649:45:25, 9.67s/it, lr=1e-5, step_loss=0.606][RANK-0]: Step: [13321], local_loss=0.07807517796754837, train_loss=0.12563897669315338, time_cost=3.098313331604004
+
Steps: 1%|▏ | 13321/1000000 [49:04<2649:45:25, 9.67s/it, lr=1e-5, step_loss=0.0781]
Steps: 1%|▏ | 13322/1000000 [49:15<2775:45:15, 10.13s/it, lr=1e-5, step_loss=0.0781][RANK-0]: Step: [13322], local_loss=0.011362706311047077, train_loss=0.0292237289249897, time_cost=3.5619497299194336
+
Steps: 1%|▏ | 13322/1000000 [49:15<2775:45:15, 10.13s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%|▏ | 13323/1000000 [49:24<2689:44:37, 9.81s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [13323], local_loss=0.0324849858880043, train_loss=0.04209686070680618, time_cost=3.035335063934326
+
Steps: 1%|▏ | 13323/1000000 [49:24<2689:44:37, 9.81s/it, lr=1e-5, step_loss=0.0325]
Steps: 1%|▏ | 13324/1000000 [49:29<2328:08:54, 8.49s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [13324], local_loss=0.01688859611749649, train_loss=0.08274635672569275, time_cost=2.699368476867676
+
Steps: 1%|▏ | 13324/1000000 [49:29<2328:08:54, 8.49s/it, lr=1e-5, step_loss=0.0169]
Steps: 1%|▏ | 13325/1000000 [49:40<2458:36:07, 8.97s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [13325], local_loss=0.026844438165426254, train_loss=0.09741184115409851, time_cost=2.0248188972473145
+
Steps: 1%|▏ | 13325/1000000 [49:40<2458:36:07, 8.97s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%|▏ | 13326/1000000 [49:45<2206:02:58, 8.05s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [13326], local_loss=0.04451395571231842, train_loss=0.031061800196766853, time_cost=4.305134057998657
+
Steps: 1%|▏ | 13326/1000000 [49:45<2206:02:58, 8.05s/it, lr=1e-5, step_loss=0.0445]
Steps: 1%|▏ | 13327/1000000 [49:50<1945:37:56, 7.10s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [13327], local_loss=0.05411623790860176, train_loss=0.032151855528354645, time_cost=2.3139212131500244
+
Steps: 1%|▏ | 13327/1000000 [49:50<1945:37:56, 7.10s/it, lr=1e-5, step_loss=0.0541]
Steps: 1%|▏ | 13328/1000000 [49:57<1944:15:35, 7.09s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [13328], local_loss=0.027308663353323936, train_loss=0.06240338459610939, time_cost=2.931884527206421
+
Steps: 1%|▏ | 13328/1000000 [49:57<1944:15:35, 7.09s/it, lr=1e-5, step_loss=0.0273]
Steps: 1%|▏ | 13329/1000000 [50:05<1943:13:01, 7.09s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [13329], local_loss=0.01057969406247139, train_loss=0.09835398942232132, time_cost=2.8156468868255615
+
Steps: 1%|▏ | 13329/1000000 [50:05<1943:13:01, 7.09s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%|▏ | 13330/1000000 [50:20<2660:33:55, 9.71s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [13330], local_loss=0.032460883259773254, train_loss=0.030169906094670296, time_cost=6.8585309982299805
+
Steps: 1%|▏ | 13330/1000000 [50:20<2660:33:55, 9.71s/it, lr=1e-5, step_loss=0.0325]
Steps: 1%|▏ | 13331/1000000 [50:34<2964:10:13, 10.82s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [13331], local_loss=0.03578707203269005, train_loss=0.04667501151561737, time_cost=4.151549577713013
+
Steps: 1%|▏ | 13331/1000000 [50:34<2964:10:13, 10.82s/it, lr=1e-5, step_loss=0.0358]
Steps: 1%|▏ | 13332/1000000 [50:41<2712:41:37, 9.90s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [13332], local_loss=0.027314241975545883, train_loss=0.14391618967056274, time_cost=2.949911594390869
+
Steps: 1%|▏ | 13332/1000000 [50:41<2712:41:37, 9.90s/it, lr=1e-5, step_loss=0.0273]
Steps: 1%|▏ | 13333/1000000 [50:47<2324:03:01, 8.48s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [13333], local_loss=0.24328850209712982, train_loss=0.07507207989692688, time_cost=1.4949922561645508
+
Steps: 1%|▏ | 13333/1000000 [50:47<2324:03:01, 8.48s/it, lr=1e-5, step_loss=0.243]
Steps: 1%|▏ | 13334/1000000 [50:52<2036:59:11, 7.43s/it, lr=1e-5, step_loss=0.243][RANK-0]: Step: [13334], local_loss=0.060028694570064545, train_loss=0.05040821060538292, time_cost=1.228543758392334
+
Steps: 1%|▏ | 13334/1000000 [50:52<2036:59:11, 7.43s/it, lr=1e-5, step_loss=0.06]
Steps: 1%|▏ | 13335/1000000 [51:00<2153:09:08, 7.86s/it, lr=1e-5, step_loss=0.06][RANK-0]: Step: [13335], local_loss=0.011626871302723885, train_loss=0.034397467970848083, time_cost=2.557065010070801
+
Steps: 1%|▏ | 13335/1000000 [51:00<2153:09:08, 7.86s/it, lr=1e-5, step_loss=0.0116]
Steps: 1%|▏ | 13336/1000000 [51:15<2676:02:33, 9.76s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [13336], local_loss=0.026585901156067848, train_loss=0.04108984395861626, time_cost=3.6662821769714355
+
Steps: 1%|▏ | 13336/1000000 [51:15<2676:02:33, 9.76s/it, lr=1e-5, step_loss=0.0266]
Steps: 1%|▏ | 13337/1000000 [51:22<2462:51:45, 8.99s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [13337], local_loss=0.09263008832931519, train_loss=0.04232759028673172, time_cost=2.712800979614258
+
Steps: 1%|▏ | 13337/1000000 [51:22<2462:51:45, 8.99s/it, lr=1e-5, step_loss=0.0926]
Steps: 1%|▏ | 13338/1000000 [51:27<2142:26:02, 7.82s/it, lr=1e-5, step_loss=0.0926][RANK-0]: Step: [13338], local_loss=0.04826108738780022, train_loss=8.55874252319336, time_cost=1.4137234687805176
+
Steps: 1%|▏ | 13338/1000000 [51:27<2142:26:02, 7.82s/it, lr=1e-5, step_loss=0.0483]
Steps: 1%|▏ | 13339/1000000 [51:32<1917:53:57, 7.00s/it, lr=1e-5, step_loss=0.0483][RANK-0]: Step: [13339], local_loss=0.10751545429229736, train_loss=0.0737651139497757, time_cost=2.2693984508514404
+
Steps: 1%|▏ | 13339/1000000 [51:32<1917:53:57, 7.00s/it, lr=1e-5, step_loss=0.108]
Steps: 1%|▏ | 13340/1000000 [51:43<2212:38:05, 8.07s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [13340], local_loss=0.1352383941411972, train_loss=0.06688454002141953, time_cost=3.5050406455993652
+
Steps: 1%|▏ | 13340/1000000 [51:43<2212:38:05, 8.07s/it, lr=1e-5, step_loss=0.135]
Steps: 1%|▏ | 13341/1000000 [51:49<2045:43:13, 7.46s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [13341], local_loss=0.04132281243801117, train_loss=0.025782369077205658, time_cost=1.5158288478851318
+
Steps: 1%|▏ | 13341/1000000 [51:49<2045:43:13, 7.46s/it, lr=1e-5, step_loss=0.0413]
Steps: 1%|▏ | 13342/1000000 [52:01<2472:06:13, 9.02s/it, lr=1e-5, step_loss=0.0413][RANK-0]: Step: [13342], local_loss=0.04941233992576599, train_loss=0.0974804237484932, time_cost=5.681484699249268
+
Steps: 1%|▏ | 13342/1000000 [52:01<2472:06:13, 9.02s/it, lr=1e-5, step_loss=0.0494]
Steps: 1%|▏ | 13343/1000000 [52:10<2460:41:56, 8.98s/it, lr=1e-5, step_loss=0.0494][RANK-0]: Step: [13343], local_loss=0.020147783681750298, train_loss=0.0780140832066536, time_cost=6.692410469055176
+
Steps: 1%|▏ | 13343/1000000 [52:10<2460:41:56, 8.98s/it, lr=1e-5, step_loss=0.0201]
Steps: 1%|▏ | 13344/1000000 [52:20<2495:14:46, 9.10s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [13344], local_loss=0.049814071506261826, train_loss=0.04078809916973114, time_cost=1.2289834022521973
+
Steps: 1%|▏ | 13344/1000000 [52:20<2495:14:46, 9.10s/it, lr=1e-5, step_loss=0.0498]
Steps: 1%|▏ | 13345/1000000 [52:27<2354:37:41, 8.59s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [13345], local_loss=244.424072265625, train_loss=30.588388442993164, time_cost=2.866863489151001
+
Steps: 1%|▏ | 13345/1000000 [52:27<2354:37:41, 8.59s/it, lr=1e-5, step_loss=244]
Steps: 1%|▏ | 13346/1000000 [52:37<2449:44:40, 8.94s/it, lr=1e-5, step_loss=244][RANK-0]: Step: [13346], local_loss=0.02008160389959812, train_loss=0.08608280122280121, time_cost=3.6839919090270996
+
Steps: 1%|▏ | 13346/1000000 [52:37<2449:44:40, 8.94s/it, lr=1e-5, step_loss=0.0201]
Steps: 1%|▏ | 13347/1000000 [52:46<2461:40:49, 8.98s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [13347], local_loss=0.030062638223171234, train_loss=0.03009345941245556, time_cost=6.6511523723602295
+
Steps: 1%|▏ | 13347/1000000 [52:46<2461:40:49, 8.98s/it, lr=1e-5, step_loss=0.0301]
Steps: 1%|▏ | 13348/1000000 [52:59<2802:36:17, 10.23s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [13348], local_loss=0.026722412556409836, train_loss=0.04343476891517639, time_cost=1.234490156173706
+
Steps: 1%|▏ | 13348/1000000 [52:59<2802:36:17, 10.23s/it, lr=1e-5, step_loss=0.0267]
Steps: 1%|▏ | 13349/1000000 [53:12<3027:21:19, 11.05s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [13349], local_loss=0.05214068293571472, train_loss=0.09599217772483826, time_cost=4.44559645652771
+
Steps: 1%|▏ | 13349/1000000 [53:12<3027:21:19, 11.05s/it, lr=1e-5, step_loss=0.0521]
Steps: 1%|▏ | 13350/1000000 [53:26<3304:23:53, 12.06s/it, lr=1e-5, step_loss=0.0521][RANK-0]: Step: [13350], local_loss=0.05278506129980087, train_loss=0.03069988451898098, time_cost=6.815937519073486
+
Steps: 1%|▏ | 13350/1000000 [53:26<3304:23:53, 12.06s/it, lr=1e-5, step_loss=0.0528]
Steps: 1%|▏ | 13351/1000000 [53:32<2809:49:55, 10.25s/it, lr=1e-5, step_loss=0.0528][RANK-0]: Step: [13351], local_loss=0.06537193804979324, train_loss=0.04471161216497421, time_cost=1.9784374237060547
+
Steps: 1%|▏ | 13351/1000000 [53:32<2809:49:55, 10.25s/it, lr=1e-5, step_loss=0.0654]
Steps: 1%|▏ | 13352/1000000 [53:42<2760:52:14, 10.07s/it, lr=1e-5, step_loss=0.0654][RANK-0]: Step: [13352], local_loss=0.006810406222939491, train_loss=0.015554985962808132, time_cost=1.8631060123443604
+
Steps: 1%|▏ | 13352/1000000 [53:42<2760:52:14, 10.07s/it, lr=1e-5, step_loss=0.00681]
Steps: 1%|▏ | 13353/1000000 [53:48<2421:04:28, 8.83s/it, lr=1e-5, step_loss=0.00681][RANK-0]: Step: [13353], local_loss=0.06332708895206451, train_loss=24.08957290649414, time_cost=1.7219135761260986
+
Steps: 1%|▏ | 13353/1000000 [53:48<2421:04:28, 8.83s/it, lr=1e-5, step_loss=0.0633]
Steps: 1%|▏ | 13354/1000000 [54:03<2958:40:51, 10.80s/it, lr=1e-5, step_loss=0.0633][RANK-0]: Step: [13354], local_loss=0.01434463169425726, train_loss=0.03998997062444687, time_cost=6.565210342407227
+
Steps: 1%|▏ | 13354/1000000 [54:03<2958:40:51, 10.80s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%|▏ | 13355/1000000 [54:15<3002:53:04, 10.96s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [13355], local_loss=0.4376968741416931, train_loss=0.0967601016163826, time_cost=4.714349985122681
+
Steps: 1%|▏ | 13355/1000000 [54:15<3002:53:04, 10.96s/it, lr=1e-5, step_loss=0.438]
Steps: 1%|▏ | 13356/1000000 [54:22<2694:15:14, 9.83s/it, lr=1e-5, step_loss=0.438][RANK-0]: Step: [13356], local_loss=0.01468427013605833, train_loss=0.06606552004814148, time_cost=2.900102376937866
+
Steps: 1%|▏ | 13356/1000000 [54:22<2694:15:14, 9.83s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%|▏ | 13357/1000000 [54:37<3121:49:47, 11.39s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [13357], local_loss=0.014760911464691162, train_loss=0.02279829792678356, time_cost=7.745791912078857
+
Steps: 1%|▏ | 13357/1000000 [54:37<3121:49:47, 11.39s/it, lr=1e-5, step_loss=0.0148]
Steps: 1%|▏ | 13358/1000000 [54:46<2936:57:44, 10.72s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [13358], local_loss=0.07612963765859604, train_loss=0.031725142151117325, time_cost=6.0366010665893555
+
Steps: 1%|▏ | 13358/1000000 [54:46<2936:57:44, 10.72s/it, lr=1e-5, step_loss=0.0761]
Steps: 1%|▏ | 13359/1000000 [54:53<2587:02:06, 9.44s/it, lr=1e-5, step_loss=0.0761][RANK-0]: Step: [13359], local_loss=0.1462632268667221, train_loss=0.17476105690002441, time_cost=1.2337591648101807
+
Steps: 1%|▏ | 13359/1000000 [54:53<2587:02:06, 9.44s/it, lr=1e-5, step_loss=0.146]
Steps: 1%|▏ | 13360/1000000 [55:08<3116:46:30, 11.37s/it, lr=1e-5, step_loss=0.146][RANK-0]: Step: [13360], local_loss=0.17708313465118408, train_loss=0.16356801986694336, time_cost=2.7185311317443848
+
Steps: 1%|▏ | 13360/1000000 [55:08<3116:46:30, 11.37s/it, lr=1e-5, step_loss=0.177]
Steps: 1%|▏ | 13361/1000000 [55:13<2534:15:14, 9.25s/it, lr=1e-5, step_loss=0.177][RANK-0]: Step: [13361], local_loss=0.016999881714582443, train_loss=0.02666482701897621, time_cost=1.4073889255523682
+
Steps: 1%|▏ | 13361/1000000 [55:13<2534:15:14, 9.25s/it, lr=1e-5, step_loss=0.017]
Steps: 1%|▏ | 13362/1000000 [55:20<2336:15:59, 8.52s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [13362], local_loss=1.006155014038086, train_loss=0.16074970364570618, time_cost=2.568528413772583
+
Steps: 1%|▏ | 13362/1000000 [55:20<2336:15:59, 8.52s/it, lr=1e-5, step_loss=1.01]
Steps: 1%|▏ | 13363/1000000 [55:27<2263:05:43, 8.26s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [13363], local_loss=0.024823637679219246, train_loss=0.051574524492025375, time_cost=1.2393913269042969
+
Steps: 1%|▏ | 13363/1000000 [55:27<2263:05:43, 8.26s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%|▏ | 13364/1000000 [55:38<2478:35:34, 9.04s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [13364], local_loss=0.018226753920316696, train_loss=0.03160303086042404, time_cost=3.0222010612487793
+
Steps: 1%|▏ | 13364/1000000 [55:38<2478:35:34, 9.04s/it, lr=1e-5, step_loss=0.0182]
Steps: 1%|▏ | 13365/1000000 [55:43<2179:08:11, 7.95s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [13365], local_loss=0.010546551086008549, train_loss=0.03113425523042679, time_cost=1.3430333137512207
+
Steps: 1%|▏ | 13365/1000000 [55:43<2179:08:11, 7.95s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%|▏ | 13366/1000000 [55:48<1881:43:09, 6.87s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [13366], local_loss=0.013563291169703007, train_loss=0.14777985215187073, time_cost=1.2963736057281494
+
Steps: 1%|▏ | 13366/1000000 [55:48<1881:43:09, 6.87s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%|▏ | 13367/1000000 [55:55<1882:41:49, 6.87s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [13367], local_loss=0.07418973743915558, train_loss=0.04753166437149048, time_cost=2.1415505409240723
+
Steps: 1%|▏ | 13367/1000000 [55:55<1882:41:49, 6.87s/it, lr=1e-5, step_loss=0.0742]
Steps: 1%|▏ | 13368/1000000 [56:10<2549:56:20, 9.30s/it, lr=1e-5, step_loss=0.0742][RANK-0]: Step: [13368], local_loss=0.03207717090845108, train_loss=0.028668474406003952, time_cost=5.076571941375732
+
Steps: 1%|▏ | 13368/1000000 [56:10<2549:56:20, 9.30s/it, lr=1e-5, step_loss=0.0321]
Steps: 1%|▏ | 13369/1000000 [56:17<2389:11:40, 8.72s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [13369], local_loss=0.0369577556848526, train_loss=0.04006209969520569, time_cost=2.873807668685913
+
Steps: 1%|▏ | 13369/1000000 [56:17<2389:11:40, 8.72s/it, lr=1e-5, step_loss=0.037]
Steps: 1%|▏ | 13370/1000000 [56:31<2806:57:54, 10.24s/it, lr=1e-5, step_loss=0.037][RANK-0]: Step: [13370], local_loss=0.01985526829957962, train_loss=1.1008286476135254, time_cost=6.33524751663208
+
Steps: 1%|▏ | 13370/1000000 [56:31<2806:57:54, 10.24s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%|▏ | 13371/1000000 [56:46<3245:32:48, 11.84s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [13371], local_loss=0.0050602080300450325, train_loss=0.06471315026283264, time_cost=11.311499118804932
+
Steps: 1%|▏ | 13371/1000000 [56:46<3245:32:48, 11.84s/it, lr=1e-5, step_loss=0.00506]
Steps: 1%|▏ | 13372/1000000 [57:00<3383:54:07, 12.35s/it, lr=1e-5, step_loss=0.00506][RANK-0]: Step: [13372], local_loss=0.0350380577147007, train_loss=0.025568882003426552, time_cost=10.719223022460938
+
Steps: 1%|▏ | 13372/1000000 [57:00<3383:54:07, 12.35s/it, lr=1e-5, step_loss=0.035]
Steps: 1%|▏ | 13373/1000000 [57:14<3558:46:17, 12.99s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [13373], local_loss=0.01822029799222946, train_loss=0.0225354116410017, time_cost=4.9205896854400635
+
Steps: 1%|▏ | 13373/1000000 [57:14<3558:46:17, 12.99s/it, lr=1e-5, step_loss=0.0182]
Steps: 1%|▏ | 13374/1000000 [57:23<3220:37:00, 11.75s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [13374], local_loss=0.017108505591750145, train_loss=0.04516957700252533, time_cost=2.7040252685546875
+
Steps: 1%|▏ | 13374/1000000 [57:23<3220:37:00, 11.75s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%|▏ | 13375/1000000 [57:32<2994:20:29, 10.93s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [13375], local_loss=0.03642949461936951, train_loss=0.023724323138594627, time_cost=2.377298593521118
+
Steps: 1%|▏ | 13375/1000000 [57:32<2994:20:29, 10.93s/it, lr=1e-5, step_loss=0.0364]
Steps: 1%|▏ | 13376/1000000 [57:38<2543:38:19, 9.28s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [13376], local_loss=0.05480866879224777, train_loss=0.11001627892255783, time_cost=1.69140625
+
Steps: 1%|▏ | 13376/1000000 [57:38<2543:38:19, 9.28s/it, lr=1e-5, step_loss=0.0548]
Steps: 1%|▏ | 13377/1000000 [57:44<2288:05:42, 8.35s/it, lr=1e-5, step_loss=0.0548][RANK-0]: Step: [13377], local_loss=0.0655045285820961, train_loss=0.038618750870227814, time_cost=1.9694020748138428
+
Steps: 1%|▏ | 13377/1000000 [57:44<2288:05:42, 8.35s/it, lr=1e-5, step_loss=0.0655]
Steps: 1%|▏ | 13378/1000000 [57:56<2573:50:44, 9.39s/it, lr=1e-5, step_loss=0.0655][RANK-0]: Step: [13378], local_loss=0.014439061284065247, train_loss=0.10573671013116837, time_cost=3.7614026069641113
+
Steps: 1%|▏ | 13378/1000000 [57:56<2573:50:44, 9.39s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%|▏ | 13379/1000000 [58:09<2868:22:02, 10.47s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [13379], local_loss=0.13863369822502136, train_loss=18.48619842529297, time_cost=7.390361785888672
+
Steps: 1%|▏ | 13379/1000000 [58:09<2868:22:02, 10.47s/it, lr=1e-5, step_loss=0.139]
Steps: 1%|▏ | 13380/1000000 [58:16<2608:19:24, 9.52s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [13380], local_loss=0.013151640072464943, train_loss=0.024674540385603905, time_cost=3.335901975631714
+
Steps: 1%|▏ | 13380/1000000 [58:16<2608:19:24, 9.52s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%|▏ | 13381/1000000 [58:24<2501:12:26, 9.13s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [13381], local_loss=0.0209711492061615, train_loss=0.09135176241397858, time_cost=1.3142693042755127
+
Steps: 1%|▏ | 13381/1000000 [58:24<2501:12:26, 9.13s/it, lr=1e-5, step_loss=0.021]
Steps: 1%|▏ | 13382/1000000 [58:29<2176:04:16, 7.94s/it, lr=1e-5, step_loss=0.021][RANK-0]: Step: [13382], local_loss=0.024780811741948128, train_loss=0.05301811918616295, time_cost=2.263522148132324
+
Steps: 1%|▏ | 13382/1000000 [58:29<2176:04:16, 7.94s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%|▏ | 13383/1000000 [58:38<2242:30:54, 8.18s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [13383], local_loss=0.014243857935070992, train_loss=0.025038016960024834, time_cost=2.737236499786377
+
Steps: 1%|▏ | 13383/1000000 [58:38<2242:30:54, 8.18s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%|▏ | 13384/1000000 [58:52<2677:05:14, 9.77s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [13384], local_loss=0.091873399913311, train_loss=0.0508151650428772, time_cost=1.3255279064178467
+
Steps: 1%|▏ | 13384/1000000 [58:52<2677:05:14, 9.77s/it, lr=1e-5, step_loss=0.0919]
Steps: 1%|▏ | 13385/1000000 [59:00<2531:35:36, 9.24s/it, lr=1e-5, step_loss=0.0919][RANK-0]: Step: [13385], local_loss=0.04976430535316467, train_loss=0.03199118375778198, time_cost=4.015051603317261
+
Steps: 1%|▏ | 13385/1000000 [59:00<2531:35:36, 9.24s/it, lr=1e-5, step_loss=0.0498]
Steps: 1%|▏ | 13386/1000000 [59:13<2896:57:57, 10.57s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [13386], local_loss=0.07710816711187363, train_loss=0.03868410736322403, time_cost=4.471138000488281
+
Steps: 1%|▏ | 13386/1000000 [59:13<2896:57:57, 10.57s/it, lr=1e-5, step_loss=0.0771]
Steps: 1%|▏ | 13387/1000000 [59:19<2469:16:38, 9.01s/it, lr=1e-5, step_loss=0.0771][RANK-0]: Step: [13387], local_loss=0.043587006628513336, train_loss=0.06240995228290558, time_cost=2.843569040298462
+
Steps: 1%|▏ | 13387/1000000 [59:19<2469:16:38, 9.01s/it, lr=1e-5, step_loss=0.0436]
Steps: 1%|▏ | 13388/1000000 [59:28<2504:12:59, 9.14s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [13388], local_loss=0.4195859134197235, train_loss=0.08260181546211243, time_cost=3.278897523880005
+
Steps: 1%|▏ | 13388/1000000 [59:28<2504:12:59, 9.14s/it, lr=1e-5, step_loss=0.42]
Steps: 1%|▏ | 13389/1000000 [59:34<2211:39:00, 8.07s/it, lr=1e-5, step_loss=0.42][RANK-0]: Step: [13389], local_loss=0.033519960939884186, train_loss=0.0991690456867218, time_cost=2.9549012184143066
+
Steps: 1%|▏ | 13389/1000000 [59:34<2211:39:00, 8.07s/it, lr=1e-5, step_loss=0.0335]
Steps: 1%|▏ | 13390/1000000 [59:39<1975:12:14, 7.21s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [13390], local_loss=0.0353257916867733, train_loss=0.03446975350379944, time_cost=1.210618019104004
+
Steps: 1%|▏ | 13390/1000000 [59:39<1975:12:14, 7.21s/it, lr=1e-5, step_loss=0.0353]
Steps: 1%|▏ | 13391/1000000 [59:48<2121:49:28, 7.74s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [13391], local_loss=0.03633086010813713, train_loss=0.07132652401924133, time_cost=2.788881778717041
+
Steps: 1%|▏ | 13391/1000000 [59:48<2121:49:28, 7.74s/it, lr=1e-5, step_loss=0.0363]
Steps: 1%|▏ | 13392/1000000 [59:54<1983:08:40, 7.24s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [13392], local_loss=0.010344509035348892, train_loss=0.03688613325357437, time_cost=1.763657808303833
+
Steps: 1%|▏ | 13392/1000000 [59:54<1983:08:40, 7.24s/it, lr=1e-5, step_loss=0.0103]
Steps: 1%|▏ | 13393/1000000 [1:00:10<2722:59:46, 9.94s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [13393], local_loss=0.03756529465317726, train_loss=35.9518928527832, time_cost=4.62352180480957
+
Steps: 1%|▏ | 13393/1000000 [1:00:10<2722:59:46, 9.94s/it, lr=1e-5, step_loss=0.0376]
Steps: 1%|▏ | 13394/1000000 [1:00:16<2412:15:24, 8.80s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [13394], local_loss=0.008721912279725075, train_loss=0.06465691328048706, time_cost=1.7620854377746582
+
Steps: 1%|▏ | 13394/1000000 [1:00:16<2412:15:24, 8.80s/it, lr=1e-5, step_loss=0.00872]
Steps: 1%|▏ | 13395/1000000 [1:00:21<2112:15:11, 7.71s/it, lr=1e-5, step_loss=0.00872][RANK-0]: Step: [13395], local_loss=0.016787080094218254, train_loss=0.026608088985085487, time_cost=3.070619821548462
+
Steps: 1%|▏ | 13395/1000000 [1:00:21<2112:15:11, 7.71s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%|▏ | 13396/1000000 [1:00:26<1880:02:26, 6.86s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [13396], local_loss=0.04872794821858406, train_loss=0.1839793622493744, time_cost=1.2264814376831055
+
Steps: 1%|▏ | 13396/1000000 [1:00:26<1880:02:26, 6.86s/it, lr=1e-5, step_loss=0.0487]
Steps: 1%|▏ | 13397/1000000 [1:00:36<2082:32:02, 7.60s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [13397], local_loss=0.05830727145075798, train_loss=0.16490785777568817, time_cost=3.180774688720703
+
Steps: 1%|▏ | 13397/1000000 [1:00:36<2082:32:02, 7.60s/it, lr=1e-5, step_loss=0.0583]
Steps: 1%|▏ | 13398/1000000 [1:00:49<2574:00:15, 9.39s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [13398], local_loss=0.01830042526125908, train_loss=0.04398070275783539, time_cost=4.361066579818726
+
Steps: 1%|▏ | 13398/1000000 [1:00:49<2574:00:15, 9.39s/it, lr=1e-5, step_loss=0.0183]
Steps: 1%|▏ | 13399/1000000 [1:00:57<2450:28:59, 8.94s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [13399], local_loss=0.014929658733308315, train_loss=0.22068187594413757, time_cost=2.3745977878570557
+
Steps: 1%|▏ | 13399/1000000 [1:00:57<2450:28:59, 8.94s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%|▏ | 13400/1000000 [1:01:03<2223:34:54, 8.11s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [13400], local_loss=0.04520266875624657, train_loss=0.029418999329209328, time_cost=1.5569522380828857
+
Steps: 1%|▏ | 13400/1000000 [1:01:03<2223:34:54, 8.11s/it, lr=1e-5, step_loss=0.0452]
Steps: 1%|▏ | 13401/1000000 [1:01:13<2372:46:14, 8.66s/it, lr=1e-5, step_loss=0.0452][RANK-0]: Step: [13401], local_loss=0.01987704634666443, train_loss=0.04367604851722717, time_cost=1.5681984424591064
+
Steps: 1%|▏ | 13401/1000000 [1:01:13<2372:46:14, 8.66s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%|▏ | 13402/1000000 [1:01:24<2559:50:44, 9.34s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [13402], local_loss=0.03814798966050148, train_loss=0.021018000319600105, time_cost=7.819523096084595
+
Steps: 1%|▏ | 13402/1000000 [1:01:24<2559:50:44, 9.34s/it, lr=1e-5, step_loss=0.0381]
Steps: 1%|▏ | 13403/1000000 [1:01:31<2381:50:43, 8.69s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [13403], local_loss=0.040169019252061844, train_loss=0.06764864176511765, time_cost=1.411731481552124
+
Steps: 1%|▏ | 13403/1000000 [1:01:31<2381:50:43, 8.69s/it, lr=1e-5, step_loss=0.0402]
Steps: 1%|▏ | 13404/1000000 [1:01:40<2362:23:40, 8.62s/it, lr=1e-5, step_loss=0.0402][RANK-0]: Step: [13404], local_loss=0.02446884661912918, train_loss=0.10064812004566193, time_cost=3.2313289642333984
+
Steps: 1%|▏ | 13404/1000000 [1:01:40<2362:23:40, 8.62s/it, lr=1e-5, step_loss=0.0245]
Steps: 1%|▏ | 13405/1000000 [1:01:47<2214:24:23, 8.08s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [13405], local_loss=0.0499863363802433, train_loss=0.03861695155501366, time_cost=4.092482089996338
+
Steps: 1%|▏ | 13405/1000000 [1:01:47<2214:24:23, 8.08s/it, lr=1e-5, step_loss=0.05]
Steps: 1%|▏ | 13406/1000000 [1:01:58<2483:52:03, 9.06s/it, lr=1e-5, step_loss=0.05][RANK-0]: Step: [13406], local_loss=0.06053977087140083, train_loss=0.028553897514939308, time_cost=1.4095499515533447
+
Steps: 1%|▏ | 13406/1000000 [1:01:58<2483:52:03, 9.06s/it, lr=1e-5, step_loss=0.0605]
Steps: 1%|▏ | 13407/1000000 [1:02:02<2109:47:09, 7.70s/it, lr=1e-5, step_loss=0.0605][RANK-0]: Step: [13407], local_loss=0.02082095853984356, train_loss=0.017203906551003456, time_cost=1.7036778926849365
+
Steps: 1%|▏ | 13407/1000000 [1:02:02<2109:47:09, 7.70s/it, lr=1e-5, step_loss=0.0208]
Steps: 1%|▏ | 13408/1000000 [1:02:13<2349:27:44, 8.57s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [13408], local_loss=0.12504081428050995, train_loss=0.04624281823635101, time_cost=1.2161426544189453
+
Steps: 1%|▏ | 13408/1000000 [1:02:13<2349:27:44, 8.57s/it, lr=1e-5, step_loss=0.125]
Steps: 1%|▏ | 13409/1000000 [1:02:26<2721:16:38, 9.93s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [13409], local_loss=0.007445784285664558, train_loss=0.01663006655871868, time_cost=3.7761425971984863
+
Steps: 1%|▏ | 13409/1000000 [1:02:26<2721:16:38, 9.93s/it, lr=1e-5, step_loss=0.00745]
Steps: 1%|▏ | 13410/1000000 [1:02:35<2598:25:14, 9.48s/it, lr=1e-5, step_loss=0.00745][RANK-0]: Step: [13410], local_loss=0.014580187387764454, train_loss=0.2640576958656311, time_cost=1.6262142658233643
+
Steps: 1%|▏ | 13410/1000000 [1:02:35<2598:25:14, 9.48s/it, lr=1e-5, step_loss=0.0146]
Steps: 1%|▏ | 13411/1000000 [1:02:42<2394:48:07, 8.74s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [13411], local_loss=0.0389036163687706, train_loss=0.019887376576662064, time_cost=2.7340242862701416
+
Steps: 1%|▏ | 13411/1000000 [1:02:42<2394:48:07, 8.74s/it, lr=1e-5, step_loss=0.0389]
Steps: 1%|▏ | 13412/1000000 [1:02:56<2834:09:11, 10.34s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [13412], local_loss=0.03187441825866699, train_loss=0.15637657046318054, time_cost=4.434034824371338
+
Steps: 1%|▏ | 13412/1000000 [1:02:56<2834:09:11, 10.34s/it, lr=1e-5, step_loss=0.0319]
Steps: 1%|▏ | 13413/1000000 [1:03:07<2886:16:16, 10.53s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [13413], local_loss=0.01521142665296793, train_loss=0.03691333532333374, time_cost=2.227590322494507
+
Steps: 1%|▏ | 13413/1000000 [1:03:07<2886:16:16, 10.53s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%|▏ | 13414/1000000 [1:03:18<2975:25:01, 10.86s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [13414], local_loss=0.025149138644337654, train_loss=0.018655460327863693, time_cost=2.8614368438720703
+
Steps: 1%|▏ | 13414/1000000 [1:03:18<2975:25:01, 10.86s/it, lr=1e-5, step_loss=0.0251]
Steps: 1%|▏ | 13415/1000000 [1:03:29<3003:42:43, 10.96s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [13415], local_loss=0.007679829839617014, train_loss=0.04139535501599312, time_cost=1.2413816452026367
+
Steps: 1%|▏ | 13415/1000000 [1:03:29<3003:42:43, 10.96s/it, lr=1e-5, step_loss=0.00768]
Steps: 1%|▏ | 13416/1000000 [1:03:40<2941:13:59, 10.73s/it, lr=1e-5, step_loss=0.00768][RANK-0]: Step: [13416], local_loss=0.3926008343696594, train_loss=0.07707295566797256, time_cost=4.5992913246154785
+
Steps: 1%|▏ | 13416/1000000 [1:03:40<2941:13:59, 10.73s/it, lr=1e-5, step_loss=0.393]
Steps: 1%|▏ | 13417/1000000 [1:03:45<2489:38:42, 9.08s/it, lr=1e-5, step_loss=0.393][RANK-0]: Step: [13417], local_loss=0.013426109217107296, train_loss=0.0401192270219326, time_cost=2.1638379096984863
+
Steps: 1%|▏ | 13417/1000000 [1:03:45<2489:38:42, 9.08s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%|▏ | 13418/1000000 [1:03:54<2460:21:01, 8.98s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [13418], local_loss=0.03932974487543106, train_loss=0.01973990723490715, time_cost=1.5112485885620117
+
Steps: 1%|▏ | 13418/1000000 [1:03:54<2460:21:01, 8.98s/it, lr=1e-5, step_loss=0.0393]
Steps: 1%|▏ | 13419/1000000 [1:04:08<2894:41:24, 10.56s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [13419], local_loss=0.012399469502270222, train_loss=0.04070045053958893, time_cost=4.944052219390869
+
Steps: 1%|▏ | 13419/1000000 [1:04:08<2894:41:24, 10.56s/it, lr=1e-5, step_loss=0.0124]
Steps: 1%|▏ | 13420/1000000 [1:04:26<3544:45:09, 12.93s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [13420], local_loss=0.0083867646753788, train_loss=0.0622248612344265, time_cost=9.73266077041626
+
Steps: 1%|▏ | 13420/1000000 [1:04:26<3544:45:09, 12.93s/it, lr=1e-5, step_loss=0.00839]
Steps: 1%|▏ | 13421/1000000 [1:04:32<2956:36:09, 10.79s/it, lr=1e-5, step_loss=0.00839][RANK-0]: Step: [13421], local_loss=0.028450682759284973, train_loss=0.0609433688223362, time_cost=2.4317891597747803
+
Steps: 1%|▏ | 13421/1000000 [1:04:32<2956:36:09, 10.79s/it, lr=1e-5, step_loss=0.0285]
Steps: 1%|▏ | 13422/1000000 [1:04:47<3319:36:15, 12.11s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [13422], local_loss=0.030676597729325294, train_loss=0.035663001239299774, time_cost=6.660227298736572
+
Steps: 1%|▏ | 13422/1000000 [1:04:47<3319:36:15, 12.11s/it, lr=1e-5, step_loss=0.0307]
Steps: 1%|▏ | 13423/1000000 [1:05:00<3328:46:37, 12.15s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [13423], local_loss=0.007248851470649242, train_loss=0.07775641977787018, time_cost=3.1849217414855957
+
Steps: 1%|▏ | 13423/1000000 [1:05:00<3328:46:37, 12.15s/it, lr=1e-5, step_loss=0.00725]
Steps: 1%|▏ | 13424/1000000 [1:05:05<2807:59:42, 10.25s/it, lr=1e-5, step_loss=0.00725][RANK-0]: Step: [13424], local_loss=0.017919132485985756, train_loss=0.015421522781252861, time_cost=1.4149558544158936
+
Steps: 1%|▏ | 13424/1000000 [1:05:05<2807:59:42, 10.25s/it, lr=1e-5, step_loss=0.0179]
Steps: 1%|▏ | 13425/1000000 [1:05:11<2412:29:05, 8.80s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [13425], local_loss=0.032874319702386856, train_loss=0.037880443036556244, time_cost=2.9946932792663574
+
Steps: 1%|▏ | 13425/1000000 [1:05:11<2412:29:05, 8.80s/it, lr=1e-5, step_loss=0.0329]
Steps: 1%|▏ | 13426/1000000 [1:05:25<2821:19:06, 10.29s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [13426], local_loss=0.2733602523803711, train_loss=0.1749114990234375, time_cost=5.941072702407837
+
Steps: 1%|▏ | 13426/1000000 [1:05:25<2821:19:06, 10.29s/it, lr=1e-5, step_loss=0.273]
Steps: 1%|▏ | 13427/1000000 [1:05:40<3265:56:13, 11.92s/it, lr=1e-5, step_loss=0.273][RANK-0]: Step: [13427], local_loss=0.018469523638486862, train_loss=0.01541188731789589, time_cost=7.081875562667847
+
Steps: 1%|▏ | 13427/1000000 [1:05:40<3265:56:13, 11.92s/it, lr=1e-5, step_loss=0.0185]
Steps: 1%|▏ | 13428/1000000 [1:05:48<2897:18:34, 10.57s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [13428], local_loss=0.062484100461006165, train_loss=0.03922051563858986, time_cost=1.3582870960235596
+
Steps: 1%|▏ | 13428/1000000 [1:05:48<2897:18:34, 10.57s/it, lr=1e-5, step_loss=0.0625]
Steps: 1%|▏ | 13429/1000000 [1:05:56<2733:36:03, 9.97s/it, lr=1e-5, step_loss=0.0625][RANK-0]: Step: [13429], local_loss=0.14889773726463318, train_loss=0.0936734527349472, time_cost=1.480560541152954
+
Steps: 1%|▏ | 13429/1000000 [1:05:56<2733:36:03, 9.97s/it, lr=1e-5, step_loss=0.149]
Steps: 1%|▏ | 13430/1000000 [1:06:13<3296:13:04, 12.03s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [13430], local_loss=0.022045768797397614, train_loss=0.024541834369301796, time_cost=14.242629766464233
+
Steps: 1%|▏ | 13430/1000000 [1:06:13<3296:13:04, 12.03s/it, lr=1e-5, step_loss=0.022]
Steps: 1%|▏ | 13431/1000000 [1:06:18<2723:09:33, 9.94s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [13431], local_loss=0.07407911866903305, train_loss=0.037950240075588226, time_cost=1.789142370223999
+
Steps: 1%|▏ | 13431/1000000 [1:06:18<2723:09:33, 9.94s/it, lr=1e-5, step_loss=0.0741]
Steps: 1%|▏ | 13432/1000000 [1:06:25<2490:03:54, 9.09s/it, lr=1e-5, step_loss=0.0741][RANK-0]: Step: [13432], local_loss=0.18828965723514557, train_loss=0.06866196542978287, time_cost=2.923959732055664
+
Steps: 1%|▏ | 13432/1000000 [1:06:25<2490:03:54, 9.09s/it, lr=1e-5, step_loss=0.188]
Steps: 1%|▏ | 13433/1000000 [1:06:32<2304:34:15, 8.41s/it, lr=1e-5, step_loss=0.188][RANK-0]: Step: [13433], local_loss=0.03707019239664078, train_loss=0.05139297991991043, time_cost=2.082620859146118
+
Steps: 1%|▏ | 13433/1000000 [1:06:32<2304:34:15, 8.41s/it, lr=1e-5, step_loss=0.0371]
Steps: 1%|▏ | 13434/1000000 [1:06:40<2291:45:10, 8.36s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [13434], local_loss=0.016176434233784676, train_loss=32.80952453613281, time_cost=7.181689739227295
+
Steps: 1%|▏ | 13434/1000000 [1:06:40<2291:45:10, 8.36s/it, lr=1e-5, step_loss=0.0162]
Steps: 1%|▏ | 13435/1000000 [1:06:48<2192:42:23, 8.00s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [13435], local_loss=0.05516984686255455, train_loss=0.05498719960451126, time_cost=1.4438543319702148
+
Steps: 1%|▏ | 13435/1000000 [1:06:48<2192:42:23, 8.00s/it, lr=1e-5, step_loss=0.0552]
Steps: 1%|▏ | 13436/1000000 [1:06:55<2132:14:08, 7.78s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [13436], local_loss=0.01519696693867445, train_loss=0.04781854897737503, time_cost=2.5932843685150146
+
Steps: 1%|▏ | 13436/1000000 [1:06:55<2132:14:08, 7.78s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%|▏ | 13437/1000000 [1:07:06<2438:55:55, 8.90s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [13437], local_loss=0.10526491701602936, train_loss=0.16375082731246948, time_cost=2.04752779006958
+
Steps: 1%|▏ | 13437/1000000 [1:07:06<2438:55:55, 8.90s/it, lr=1e-5, step_loss=0.105]
Steps: 1%|▏ | 13438/1000000 [1:07:15<2436:19:05, 8.89s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [13438], local_loss=0.006354304030537605, train_loss=0.04539300128817558, time_cost=1.2842450141906738
+
Steps: 1%|▏ | 13438/1000000 [1:07:15<2436:19:05, 8.89s/it, lr=1e-5, step_loss=0.00635]
Steps: 1%|▏ | 13439/1000000 [1:07:20<2125:44:30, 7.76s/it, lr=1e-5, step_loss=0.00635][RANK-0]: Step: [13439], local_loss=0.06425725668668747, train_loss=0.028269357979297638, time_cost=4.378046274185181
+
Steps: 1%|▏ | 13439/1000000 [1:07:20<2125:44:30, 7.76s/it, lr=1e-5, step_loss=0.0643]
Steps: 1%|▏ | 13440/1000000 [1:07:26<1975:43:29, 7.21s/it, lr=1e-5, step_loss=0.0643][RANK-0]: Step: [13440], local_loss=0.024249164387583733, train_loss=0.025916853919625282, time_cost=1.5114312171936035
+
Steps: 1%|▏ | 13440/1000000 [1:07:26<1975:43:29, 7.21s/it, lr=1e-5, step_loss=0.0242]
Steps: 1%|▏ | 13441/1000000 [1:07:40<2527:43:14, 9.22s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [13441], local_loss=0.172204852104187, train_loss=0.0772193893790245, time_cost=2.7295024394989014
+
Steps: 1%|▏ | 13441/1000000 [1:07:40<2527:43:14, 9.22s/it, lr=1e-5, step_loss=0.172]
Steps: 1%|▏ | 13442/1000000 [1:07:51<2688:15:18, 9.81s/it, lr=1e-5, step_loss=0.172][RANK-0]: Step: [13442], local_loss=0.024368636310100555, train_loss=0.027115536853671074, time_cost=2.54191255569458
+
Steps: 1%|▏ | 13442/1000000 [1:07:51<2688:15:18, 9.81s/it, lr=1e-5, step_loss=0.0244]
Steps: 1%|▏ | 13443/1000000 [1:07:57<2329:52:39, 8.50s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [13443], local_loss=0.004216600209474564, train_loss=0.017986886203289032, time_cost=2.0832302570343018
+
Steps: 1%|▏ | 13443/1000000 [1:07:57<2329:52:39, 8.50s/it, lr=1e-5, step_loss=0.00422]
Steps: 1%|▏ | 13444/1000000 [1:08:11<2786:12:59, 10.17s/it, lr=1e-5, step_loss=0.00422][RANK-0]: Step: [13444], local_loss=0.025516433641314507, train_loss=0.07759125530719757, time_cost=1.3235478401184082
+
Steps: 1%|▏ | 13444/1000000 [1:08:11<2786:12:59, 10.17s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%|▏ | 13445/1000000 [1:08:21<2774:00:56, 10.12s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [13445], local_loss=0.0274177398532629, train_loss=0.061729416251182556, time_cost=3.270811080932617
+
Steps: 1%|▏ | 13445/1000000 [1:08:21<2774:00:56, 10.12s/it, lr=1e-5, step_loss=0.0274]
Steps: 1%|▏ | 13446/1000000 [1:08:30<2707:12:18, 9.88s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [13446], local_loss=0.029917854815721512, train_loss=0.06421303749084473, time_cost=2.016934394836426
+
Steps: 1%|▏ | 13446/1000000 [1:08:30<2707:12:18, 9.88s/it, lr=1e-5, step_loss=0.0299]
Steps: 1%|▏ | 13447/1000000 [1:08:36<2401:39:49, 8.76s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [13447], local_loss=0.31477823853492737, train_loss=0.10013383626937866, time_cost=1.3234570026397705
+
Steps: 1%|▏ | 13447/1000000 [1:08:36<2401:39:49, 8.76s/it, lr=1e-5, step_loss=0.315]
Steps: 1%|▏ | 13448/1000000 [1:08:47<2599:14:58, 9.48s/it, lr=1e-5, step_loss=0.315][RANK-0]: Step: [13448], local_loss=0.08251605182886124, train_loss=0.10633190721273422, time_cost=2.8150486946105957
+
Steps: 1%|▏ | 13448/1000000 [1:08:47<2599:14:58, 9.48s/it, lr=1e-5, step_loss=0.0825]
Steps: 1%|▏ | 13449/1000000 [1:08:59<2753:02:49, 10.05s/it, lr=1e-5, step_loss=0.0825][RANK-0]: Step: [13449], local_loss=0.006495151203125715, train_loss=0.01513778604567051, time_cost=1.9726784229278564
+
Steps: 1%|▏ | 13449/1000000 [1:08:59<2753:02:49, 10.05s/it, lr=1e-5, step_loss=0.0065]
Steps: 1%|▏ | 13450/1000000 [1:09:04<2342:56:58, 8.55s/it, lr=1e-5, step_loss=0.0065][RANK-0]: Step: [13450], local_loss=0.03894970566034317, train_loss=0.04564206302165985, time_cost=1.8264243602752686
+
Steps: 1%|▏ | 13450/1000000 [1:09:04<2342:56:58, 8.55s/it, lr=1e-5, step_loss=0.0389]
Steps: 1%|▏ | 13451/1000000 [1:09:11<2196:30:22, 8.02s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [13451], local_loss=0.3929636478424072, train_loss=0.13453862071037292, time_cost=4.89240574836731
+
Steps: 1%|▏ | 13451/1000000 [1:09:11<2196:30:22, 8.02s/it, lr=1e-5, step_loss=0.393]
Steps: 1%|▏ | 13452/1000000 [1:09:22<2473:44:14, 9.03s/it, lr=1e-5, step_loss=0.393][RANK-0]: Step: [13452], local_loss=0.08991958945989609, train_loss=0.04613839089870453, time_cost=2.65274715423584
+
[Training-log excerpt, condensed: steps 13452–13672 of 1,000,000 (~1%), wall time 1:09:22 → 1:42:31, lr=1e-5, ~6–12.4 s/it. Representative entry:

Steps: 1%|▏ | 13453/1000000 [1:09:34<2747:06:12, 10.02s/it, lr=1e-5, step_loss=0.0899][RANK-0]: Step: [13453], local_loss=0.02387641742825508, train_loss=0.01821312867105007, time_cost=4.851022720336914

Across the excerpt, local_loss stays mostly in 0.005–0.30 (occasionally up to ~0.54), with isolated spikes at steps 13461 (145.45), 13487 (1.00), and 13586 (1.00); train_loss is typically 0.01–0.35, with rare outliers at steps 13467 (63.01), 13618 (36.40), 13483 (26.54), 13605 (24.52), 13511 (18.82), 13461 (18.21), and 13538 (16.44). Duplicated tqdm refresh lines, stray '+' markers, and progress-spinner characters (|, /, -) removed.]
Steps: 1%|▏ | 13673/1000000 [1:42:37<2064:36:13, 7.54s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [13673], local_loss=0.09015616774559021, train_loss=0.04787886515259743, time_cost=1.4015872478485107
+
Steps: 1%|▏ | 13673/1000000 [1:42:37<2064:36:13, 7.54s/it, lr=1e-5, step_loss=0.0902]
Steps: 1%|▏ | 13674/1000000 [1:42:45<2049:05:49, 7.48s/it, lr=1e-5, step_loss=0.0902][RANK-0]: Step: [13674], local_loss=0.019054582342505455, train_loss=0.06035039573907852, time_cost=3.22062611579895
+
Steps: 1%|▏ | 13674/1000000 [1:42:45<2049:05:49, 7.48s/it, lr=1e-5, step_loss=0.0191]
Steps: 1%|▏ | 13675/1000000 [1:42:52<2027:31:53, 7.40s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [13675], local_loss=0.18316175043582916, train_loss=0.06220048666000366, time_cost=2.722135305404663
+
Steps: 1%|▏ | 13675/1000000 [1:42:52<2027:31:53, 7.40s/it, lr=1e-5, step_loss=0.183]
Steps: 1%|▏ | 13676/1000000 [1:42:59<1992:50:49, 7.27s/it, lr=1e-5, step_loss=0.183][RANK-0]: Step: [13676], local_loss=0.012906844727694988, train_loss=0.03942825645208359, time_cost=2.816640615463257
+
Steps: 1%|▏ | 13676/1000000 [1:42:59<1992:50:49, 7.27s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%|▏ | 13677/1000000 [1:43:10<2323:28:31, 8.48s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [13677], local_loss=0.009973116219043732, train_loss=0.04687696322798729, time_cost=2.028817892074585
+
Steps: 1%|▏ | 13677/1000000 [1:43:10<2323:28:31, 8.48s/it, lr=1e-5, step_loss=0.00997]
Steps: 1%|▏ | 13678/1000000 [1:43:27<3001:21:02, 10.95s/it, lr=1e-5, step_loss=0.00997][RANK-0]: Step: [13678], local_loss=0.0849977508187294, train_loss=0.07331053912639618, time_cost=8.22209644317627
+
Steps: 1%|▏ | 13678/1000000 [1:43:27<3001:21:02, 10.95s/it, lr=1e-5, step_loss=0.085]
Steps: 1%|▏ | 13679/1000000 [1:43:33<2612:29:17, 9.54s/it, lr=1e-5, step_loss=0.085][RANK-0]: Step: [13679], local_loss=0.008660225197672844, train_loss=0.0793282687664032, time_cost=2.1498262882232666
+
Steps: 1%|▏ | 13679/1000000 [1:43:33<2612:29:17, 9.54s/it, lr=1e-5, step_loss=0.00866]
Steps: 1%|▏ | 13680/1000000 [1:43:47<2937:45:21, 10.72s/it, lr=1e-5, step_loss=0.00866][RANK-0]: Step: [13680], local_loss=0.02579498291015625, train_loss=13.556774139404297, time_cost=3.49575138092041
+
Steps: 1%|▏ | 13680/1000000 [1:43:47<2937:45:21, 10.72s/it, lr=1e-5, step_loss=0.0258]
Steps: 1%|▏ | 13681/1000000 [1:43:52<2498:52:04, 9.12s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [13681], local_loss=0.013093356043100357, train_loss=0.09753067791461945, time_cost=2.0787463188171387
+
Steps: 1%|▏ | 13681/1000000 [1:43:52<2498:52:04, 9.12s/it, lr=1e-5, step_loss=0.0131]
Steps: 1%|▏ | 13682/1000000 [1:43:59<2288:21:12, 8.35s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [13682], local_loss=0.009913425892591476, train_loss=0.036715492606163025, time_cost=1.8794424533843994
+
Steps: 1%|▏ | 13682/1000000 [1:43:59<2288:21:12, 8.35s/it, lr=1e-5, step_loss=0.00991]
Steps: 1%|▏ | 13683/1000000 [1:44:16<2994:00:01, 10.93s/it, lr=1e-5, step_loss=0.00991][RANK-0]: Step: [13683], local_loss=0.02382991462945938, train_loss=0.036759309470653534, time_cost=1.2548470497131348
+
Steps: 1%|▏ | 13683/1000000 [1:44:16<2994:00:01, 10.93s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%|▏ | 13684/1000000 [1:44:20<2460:43:55, 8.98s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [13684], local_loss=0.06357035040855408, train_loss=0.0458163283765316, time_cost=1.4800879955291748
+
Steps: 1%|▏ | 13684/1000000 [1:44:20<2460:43:55, 8.98s/it, lr=1e-5, step_loss=0.0636]
Steps: 1%|▏ | 13685/1000000 [1:44:25<2138:44:55, 7.81s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [13685], local_loss=0.05293998122215271, train_loss=0.2320273071527481, time_cost=2.20798397064209
+
Steps: 1%|▏ | 13685/1000000 [1:44:25<2138:44:55, 7.81s/it, lr=1e-5, step_loss=0.0529]
Steps: 1%|▏ | 13686/1000000 [1:44:37<2442:38:28, 8.92s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [13686], local_loss=0.0051379078067839146, train_loss=0.04023822396993637, time_cost=1.9959726333618164
+
Steps: 1%|▏ | 13686/1000000 [1:44:37<2442:38:28, 8.92s/it, lr=1e-5, step_loss=0.00514]
Steps: 1%|▏ | 13687/1000000 [1:44:44<2292:56:16, 8.37s/it, lr=1e-5, step_loss=0.00514][RANK-0]: Step: [13687], local_loss=0.017663555219769478, train_loss=0.016414053738117218, time_cost=2.120128631591797
+
Steps: 1%|▏ | 13687/1000000 [1:44:44<2292:56:16, 8.37s/it, lr=1e-5, step_loss=0.0177]
Steps: 1%|▏ | 13688/1000000 [1:44:56<2591:58:28, 9.46s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [13688], local_loss=0.017187874764204025, train_loss=0.13866320252418518, time_cost=4.449982404708862
+
Steps: 1%|▏ | 13688/1000000 [1:44:56<2591:58:28, 9.46s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%|▏ | 13689/1000000 [1:45:10<2991:24:54, 10.92s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [13689], local_loss=0.021112162619829178, train_loss=0.195975199341774, time_cost=6.280687093734741
+
Steps: 1%|▏ | 13689/1000000 [1:45:10<2991:24:54, 10.92s/it, lr=1e-5, step_loss=0.0211]
Steps: 1%|▏ | 13690/1000000 [1:45:18<2730:36:54, 9.97s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [13690], local_loss=0.014057999476790428, train_loss=0.01723787933588028, time_cost=3.910188674926758
+
Steps: 1%|▏ | 13690/1000000 [1:45:18<2730:36:54, 9.97s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%|▏ | 13691/1000000 [1:45:32<3098:38:10, 11.31s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [13691], local_loss=0.01550926361232996, train_loss=0.030586693435907364, time_cost=4.940930604934692
+
Steps: 1%|▏ | 13691/1000000 [1:45:32<3098:38:10, 11.31s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%|▏ | 13692/1000000 [1:45:41<2916:21:42, 10.64s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [13692], local_loss=0.012512005865573883, train_loss=0.04458954185247421, time_cost=2.9600582122802734
+
Steps: 1%|▏ | 13692/1000000 [1:45:41<2916:21:42, 10.64s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%|▏ | 13693/1000000 [1:45:48<2618:44:01, 9.56s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [13693], local_loss=0.03683013468980789, train_loss=0.04502797871828079, time_cost=1.95859956741333
+
Steps: 1%|▏ | 13693/1000000 [1:45:48<2618:44:01, 9.56s/it, lr=1e-5, step_loss=0.0368]
Steps: 1%|▏ | 13694/1000000 [1:46:00<2802:09:34, 10.23s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [13694], local_loss=0.06914003193378448, train_loss=0.037463679909706116, time_cost=1.2526881694793701
+
Steps: 1%|▏ | 13694/1000000 [1:46:00<2802:09:34, 10.23s/it, lr=1e-5, step_loss=0.0691]
Steps: 1%|▏ | 13695/1000000 [1:46:05<2387:27:34, 8.71s/it, lr=1e-5, step_loss=0.0691][RANK-0]: Step: [13695], local_loss=0.08578052371740341, train_loss=0.07684270292520523, time_cost=2.07165265083313
+
Steps: 1%|▏ | 13695/1000000 [1:46:05<2387:27:34, 8.71s/it, lr=1e-5, step_loss=0.0858]
Steps: 1%|▏ | 13696/1000000 [1:46:10<2033:52:56, 7.42s/it, lr=1e-5, step_loss=0.0858][RANK-0]: Step: [13696], local_loss=0.013842469081282616, train_loss=0.025684330612421036, time_cost=1.3901774883270264
+
Steps: 1%|▏ | 13696/1000000 [1:46:10<2033:52:56, 7.42s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%|▏ | 13697/1000000 [1:46:20<2264:37:26, 8.27s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [13697], local_loss=0.051931221038103104, train_loss=5.059032440185547, time_cost=7.100661039352417
+
Steps: 1%|▏ | 13697/1000000 [1:46:20<2264:37:26, 8.27s/it, lr=1e-5, step_loss=0.0519]
Steps: 1%|▏ | 13698/1000000 [1:46:32<2537:50:02, 9.26s/it, lr=1e-5, step_loss=0.0519][RANK-0]: Step: [13698], local_loss=0.10318129509687424, train_loss=0.033564385026693344, time_cost=2.6615445613861084
+
Steps: 1%|▏ | 13698/1000000 [1:46:32<2537:50:02, 9.26s/it, lr=1e-5, step_loss=0.103]
Steps: 1%|▏ | 13699/1000000 [1:46:43<2675:48:34, 9.77s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [13699], local_loss=0.048107098788022995, train_loss=0.06670451909303665, time_cost=1.5867247581481934
+
Steps: 1%|▏ | 13699/1000000 [1:46:43<2675:48:34, 9.77s/it, lr=1e-5, step_loss=0.0481]
Steps: 1%|▏ | 13700/1000000 [1:46:51<2603:01:39, 9.50s/it, lr=1e-5, step_loss=0.0481][RANK-0]: Step: [13700], local_loss=0.03585526719689369, train_loss=0.02040174975991249, time_cost=1.610973596572876
+
Steps: 1%|▏ | 13700/1000000 [1:46:51<2603:01:39, 9.50s/it, lr=1e-5, step_loss=0.0359]
Steps: 1%|▏ | 13701/1000000 [1:46:59<2439:06:22, 8.90s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [13701], local_loss=0.016830680891871452, train_loss=0.028272081166505814, time_cost=1.2496554851531982
+
Steps: 1%|▏ | 13701/1000000 [1:46:59<2439:06:22, 8.90s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%|▏ | 13702/1000000 [1:47:08<2464:03:42, 8.99s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [13702], local_loss=0.027774110436439514, train_loss=0.02818339690566063, time_cost=3.713304281234741
+
Steps: 1%|▏ | 13702/1000000 [1:47:08<2464:03:42, 8.99s/it, lr=1e-5, step_loss=0.0278]
Steps: 1%|▏ | 13703/1000000 [1:47:16<2358:08:39, 8.61s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [13703], local_loss=0.08215081691741943, train_loss=0.06158149614930153, time_cost=6.20256495475769
+
Steps: 1%|▏ | 13703/1000000 [1:47:16<2358:08:39, 8.61s/it, lr=1e-5, step_loss=0.0822]
Steps: 1%|▏ | 13704/1000000 [1:47:23<2277:32:10, 8.31s/it, lr=1e-5, step_loss=0.0822][RANK-0]: Step: [13704], local_loss=0.027288828045129776, train_loss=0.017419878393411636, time_cost=1.2343072891235352
+
Steps: 1%|▏ | 13704/1000000 [1:47:23<2277:32:10, 8.31s/it, lr=1e-5, step_loss=0.0273]
Steps: 1%|▏ | 13705/1000000 [1:47:40<2944:45:04, 10.75s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [13705], local_loss=0.09543313831090927, train_loss=0.07571326196193695, time_cost=7.199016809463501
+
Steps: 1%|▏ | 13705/1000000 [1:47:40<2944:45:04, 10.75s/it, lr=1e-5, step_loss=0.0954]
Steps: 1%|▏ | 13706/1000000 [1:47:53<3163:20:41, 11.55s/it, lr=1e-5, step_loss=0.0954][RANK-0]: Step: [13706], local_loss=0.07298309355974197, train_loss=0.031843505799770355, time_cost=5.4444520473480225
+
Steps: 1%|▏ | 13706/1000000 [1:47:53<3163:20:41, 11.55s/it, lr=1e-5, step_loss=0.073]
Steps: 1%|▏ | 13707/1000000 [1:48:01<2847:36:07, 10.39s/it, lr=1e-5, step_loss=0.073][RANK-0]: Step: [13707], local_loss=0.022299204021692276, train_loss=0.06147013232111931, time_cost=1.3164875507354736
+
Steps: 1%|▏ | 13707/1000000 [1:48:01<2847:36:07, 10.39s/it, lr=1e-5, step_loss=0.0223]
Steps: 1%|▏ | 13708/1000000 [1:48:16<3192:50:42, 11.65s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [13708], local_loss=0.008765583857893944, train_loss=0.1305309385061264, time_cost=6.076997518539429
+
Steps: 1%|▏ | 13708/1000000 [1:48:16<3192:50:42, 11.65s/it, lr=1e-5, step_loss=0.00877]
Steps: 1%|▏ | 13709/1000000 [1:48:24<2924:42:39, 10.68s/it, lr=1e-5, step_loss=0.00877][RANK-0]: Step: [13709], local_loss=0.026156209409236908, train_loss=0.05020046979188919, time_cost=1.2251124382019043
+
Steps: 1%|▏ | 13709/1000000 [1:48:24<2924:42:39, 10.68s/it, lr=1e-5, step_loss=0.0262]
Steps: 1%|▏ | 13710/1000000 [1:48:37<3147:21:10, 11.49s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [13710], local_loss=0.5090155005455017, train_loss=0.09470922499895096, time_cost=7.221095561981201
+
Steps: 1%|▏ | 13710/1000000 [1:48:37<3147:21:10, 11.49s/it, lr=1e-5, step_loss=0.509]
Steps: 1%|▏ | 13711/1000000 [1:48:43<2686:03:29, 9.80s/it, lr=1e-5, step_loss=0.509][RANK-0]: Step: [13711], local_loss=0.05346790328621864, train_loss=0.046264585107564926, time_cost=1.277698278427124
+
Steps: 1%|▏ | 13711/1000000 [1:48:43<2686:03:29, 9.80s/it, lr=1e-5, step_loss=0.0535]
Steps: 1%|▏ | 13712/1000000 [1:49:00<3250:57:37, 11.87s/it, lr=1e-5, step_loss=0.0535][RANK-0]: Step: [13712], local_loss=0.012692458927631378, train_loss=0.027796011418104172, time_cost=7.856192588806152
+
Steps: 1%|▏ | 13712/1000000 [1:49:00<3250:57:37, 11.87s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%|▏ | 13713/1000000 [1:49:15<3501:28:20, 12.78s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [13713], local_loss=0.021344201639294624, train_loss=0.01999146118760109, time_cost=3.018245220184326
+
Steps: 1%|▏ | 13713/1000000 [1:49:15<3501:28:20, 12.78s/it, lr=1e-5, step_loss=0.0213]
Steps: 1%|▏ | 13714/1000000 [1:49:31<3819:47:05, 13.94s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [13714], local_loss=0.01386092696338892, train_loss=0.02408764697611332, time_cost=7.3370466232299805
+
Steps: 1%|▏ | 13714/1000000 [1:49:31<3819:47:05, 13.94s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%|▏ | 13715/1000000 [1:49:39<3282:20:34, 11.98s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [13715], local_loss=1.0228506326675415, train_loss=0.2627677917480469, time_cost=1.463663101196289
+
Steps: 1%|▏ | 13715/1000000 [1:49:39<3282:20:34, 11.98s/it, lr=1e-5, step_loss=1.02]
Steps: 1%|▏ | 13716/1000000 [1:49:53<3431:41:17, 12.53s/it, lr=1e-5, step_loss=1.02][RANK-0]: Step: [13716], local_loss=0.028099866583943367, train_loss=0.05084552988409996, time_cost=4.248777389526367
+
Steps: 1%|▏ | 13716/1000000 [1:49:53<3431:41:17, 12.53s/it, lr=1e-5, step_loss=0.0281]
Steps: 1%|▏ | 13717/1000000 [1:49:57<2773:18:54, 10.12s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [13717], local_loss=0.01745014637708664, train_loss=0.05214826390147209, time_cost=1.581099033355713
+
Steps: 1%|▏ | 13717/1000000 [1:49:57<2773:18:54, 10.12s/it, lr=1e-5, step_loss=0.0175]
Steps: 1%|▏ | 13718/1000000 [1:50:02<2354:49:37, 8.60s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [13718], local_loss=0.08713176846504211, train_loss=0.05638888105750084, time_cost=2.1076788902282715
+
Steps: 1%|▏ | 13718/1000000 [1:50:02<2354:49:37, 8.60s/it, lr=1e-5, step_loss=0.0871]
Steps: 1%|▏ | 13719/1000000 [1:50:15<2693:29:35, 9.83s/it, lr=1e-5, step_loss=0.0871][RANK-0]: Step: [13719], local_loss=0.043118055909872055, train_loss=0.15380170941352844, time_cost=1.249208688735962
+
Steps: 1%|▏ | 13719/1000000 [1:50:15<2693:29:35, 9.83s/it, lr=1e-5, step_loss=0.0431]
Steps: 1%|▏ | 13720/1000000 [1:50:21<2349:39:12, 8.58s/it, lr=1e-5, step_loss=0.0431][RANK-0]: Step: [13720], local_loss=0.028360342606902122, train_loss=0.01805197447538376, time_cost=1.4757487773895264
+
Steps: 1%|▏ | 13720/1000000 [1:50:21<2349:39:12, 8.58s/it, lr=1e-5, step_loss=0.0284]
Steps: 1%|▏ | 13721/1000000 [1:50:26<2086:43:48, 7.62s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [13721], local_loss=0.025459831580519676, train_loss=0.18876436352729797, time_cost=2.429863214492798
+
Steps: 1%|▏ | 13721/1000000 [1:50:26<2086:43:48, 7.62s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%|▏ | 13722/1000000 [1:50:31<1877:26:24, 6.85s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [13722], local_loss=0.07753288745880127, train_loss=0.09439054876565933, time_cost=1.9368929862976074
+
Steps: 1%|▏ | 13722/1000000 [1:50:31<1877:26:24, 6.85s/it, lr=1e-5, step_loss=0.0775]
Steps: 1%|▏ | 13723/1000000 [1:50:43<2266:42:47, 8.27s/it, lr=1e-5, step_loss=0.0775][RANK-0]: Step: [13723], local_loss=0.10086341202259064, train_loss=0.05346358194947243, time_cost=2.8926146030426025
+
Steps: 1%|▏ | 13723/1000000 [1:50:43<2266:42:47, 8.27s/it, lr=1e-5, step_loss=0.101]
Steps: 1%|▏ | 13724/1000000 [1:50:57<2754:27:15, 10.05s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [13724], local_loss=0.006982251536101103, train_loss=0.11767801642417908, time_cost=5.031375169754028
+
Steps: 1%|▏ | 13724/1000000 [1:50:57<2754:27:15, 10.05s/it, lr=1e-5, step_loss=0.00698]
Steps: 1%|▏ | 13725/1000000 [1:51:06<2674:39:09, 9.76s/it, lr=1e-5, step_loss=0.00698][RANK-0]: Step: [13725], local_loss=0.008107045665383339, train_loss=0.01728663593530655, time_cost=1.2458784580230713
+
Steps: 1%|▏ | 13725/1000000 [1:51:06<2674:39:09, 9.76s/it, lr=1e-5, step_loss=0.00811]
Steps: 1%|▏ | 13726/1000000 [1:51:21<3126:33:39, 11.41s/it, lr=1e-5, step_loss=0.00811][RANK-0]: Step: [13726], local_loss=0.3182503879070282, train_loss=0.07334840297698975, time_cost=7.434555292129517
+
Steps: 1%|▏ | 13726/1000000 [1:51:21<3126:33:39, 11.41s/it, lr=1e-5, step_loss=0.318]
Steps: 1%|▏ | 13727/1000000 [1:51:26<2614:32:14, 9.54s/it, lr=1e-5, step_loss=0.318][RANK-0]: Step: [13727], local_loss=0.03905155509710312, train_loss=0.022839562967419624, time_cost=1.217684030532837
+
Steps: 1%|▏ | 13727/1000000 [1:51:26<2614:32:14, 9.54s/it, lr=1e-5, step_loss=0.0391]
Steps: 1%|▏ | 13728/1000000 [1:51:35<2498:44:50, 9.12s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [13728], local_loss=0.1305934637784958, train_loss=0.0676615834236145, time_cost=3.93928861618042
+
Steps: 1%|▏ | 13728/1000000 [1:51:35<2498:44:50, 9.12s/it, lr=1e-5, step_loss=0.131]
Steps: 1%|▏ | 13729/1000000 [1:51:50<3042:56:57, 11.11s/it, lr=1e-5, step_loss=0.131][RANK-0]: Step: [13729], local_loss=0.052403323352336884, train_loss=0.025885650888085365, time_cost=7.172762632369995
+
Steps: 1%|▏ | 13729/1000000 [1:51:50<3042:56:57, 11.11s/it, lr=1e-5, step_loss=0.0524]
Steps: 1%|▏ | 13730/1000000 [1:51:56<2584:25:48, 9.43s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [13730], local_loss=0.012493932619690895, train_loss=0.05225686728954315, time_cost=4.12087345123291
+
Steps: 1%|▏ | 13730/1000000 [1:51:56<2584:25:48, 9.43s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%|▏ | 13731/1000000 [1:52:00<2141:24:31, 7.82s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [13731], local_loss=0.02884313091635704, train_loss=0.07853444665670395, time_cost=1.6217706203460693
+
Steps: 1%|▏ | 13731/1000000 [1:52:00<2141:24:31, 7.82s/it, lr=1e-5, step_loss=0.0288]
Steps: 1%|▏ | 13732/1000000 [1:52:10<2358:36:26, 8.61s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [13732], local_loss=0.05493869259953499, train_loss=0.034181877970695496, time_cost=4.0377373695373535
+
Steps: 1%|▏ | 13732/1000000 [1:52:10<2358:36:26, 8.61s/it, lr=1e-5, step_loss=0.0549]
Steps: 1%|▏ | 13733/1000000 [1:52:19<2403:41:58, 8.77s/it, lr=1e-5, step_loss=0.0549][RANK-0]: Step: [13733], local_loss=0.018395522609353065, train_loss=0.16638712584972382, time_cost=2.6659302711486816
+
Steps: 1%|▏ | 13733/1000000 [1:52:19<2403:41:58, 8.77s/it, lr=1e-5, step_loss=0.0184]
Steps: 1%|▏ | 13734/1000000 [1:52:30<2588:49:13, 9.45s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [13734], local_loss=0.006518612615764141, train_loss=19.935794830322266, time_cost=6.103301763534546
+
Steps: 1%|▏ | 13734/1000000 [1:52:30<2588:49:13, 9.45s/it, lr=1e-5, step_loss=0.00652]
Steps: 1%|▏ | 13735/1000000 [1:52:36<2299:07:03, 8.39s/it, lr=1e-5, step_loss=0.00652][RANK-0]: Step: [13735], local_loss=0.05092192068696022, train_loss=0.03509654104709625, time_cost=2.4574167728424072
+
Steps: 1%|▏ | 13735/1000000 [1:52:36<2299:07:03, 8.39s/it, lr=1e-5, step_loss=0.0509]
Steps: 1%|▏ | 13736/1000000 [1:52:42<2073:04:59, 7.57s/it, lr=1e-5, step_loss=0.0509][RANK-0]: Step: [13736], local_loss=0.030472809448838234, train_loss=0.1501321643590927, time_cost=2.842550754547119
+
Steps: 1%|▏ | 13736/1000000 [1:52:42<2073:04:59, 7.57s/it, lr=1e-5, step_loss=0.0305]
Steps: 1%|▏ | 13737/1000000 [1:52:46<1796:02:08, 6.56s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [13737], local_loss=0.013423732481896877, train_loss=0.22838006913661957, time_cost=1.3215301036834717
+
Steps: 1%|▏ | 13737/1000000 [1:52:46<1796:02:08, 6.56s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%|▏ | 13738/1000000 [1:52:51<1674:26:49, 6.11s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [13738], local_loss=0.014004727825522423, train_loss=0.0546131432056427, time_cost=2.0614171028137207
+
Steps: 1%|▏ | 13738/1000000 [1:52:51<1674:26:49, 6.11s/it, lr=1e-5, step_loss=0.014]
Steps: 1%|▏ | 13739/1000000 [1:53:00<1903:38:05, 6.95s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [13739], local_loss=0.011097675189375877, train_loss=0.06859361380338669, time_cost=6.227291822433472
+
Steps: 1%|▏ | 13739/1000000 [1:53:00<1903:38:05, 6.95s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%|▏ | 13740/1000000 [1:53:10<2177:43:10, 7.95s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [13740], local_loss=0.05148248001933098, train_loss=0.08320917934179306, time_cost=7.628913879394531
+
Steps: 1%|▏ | 13740/1000000 [1:53:10<2177:43:10, 7.95s/it, lr=1e-5, step_loss=0.0515]
Steps: 1%|▏ | 13741/1000000 [1:53:16<1965:34:05, 7.17s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [13741], local_loss=0.07269264757633209, train_loss=0.027351275086402893, time_cost=2.279588222503662
+
Steps: 1%|▏ | 13741/1000000 [1:53:16<1965:34:05, 7.17s/it, lr=1e-5, step_loss=0.0727]
Steps: 1%|▏ | 13742/1000000 [1:53:24<2066:43:51, 7.54s/it, lr=1e-5, step_loss=0.0727][RANK-0]: Step: [13742], local_loss=0.02361937053501606, train_loss=0.04765481501817703, time_cost=3.101846933364868
+
Steps: 1%|▏ | 13742/1000000 [1:53:24<2066:43:51, 7.54s/it, lr=1e-5, step_loss=0.0236]
Steps: 1%|▏ | 13743/1000000 [1:53:32<2068:55:06, 7.55s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [13743], local_loss=0.024487268179655075, train_loss=0.04053555428981781, time_cost=2.1359386444091797
+
Steps: 1%|▏ | 13743/1000000 [1:53:32<2068:55:06, 7.55s/it, lr=1e-5, step_loss=0.0245]
Steps: 1%|▏ | 13744/1000000 [1:53:39<2041:15:06, 7.45s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [13744], local_loss=0.015016269870102406, train_loss=0.0677599310874939, time_cost=1.5804667472839355
+
Steps: 1%|▏ | 13744/1000000 [1:53:39<2041:15:06, 7.45s/it, lr=1e-5, step_loss=0.015]
Steps: 1%|▏ | 13745/1000000 [1:53:50<2317:30:54, 8.46s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [13745], local_loss=0.07836823910474777, train_loss=0.04083532467484474, time_cost=2.9783084392547607
+
Steps: 1%|▏ | 13745/1000000 [1:53:50<2317:30:54, 8.46s/it, lr=1e-5, step_loss=0.0784]
Steps: 1%|▏ | 13746/1000000 [1:53:59<2395:19:49, 8.74s/it, lr=1e-5, step_loss=0.0784][RANK-0]: Step: [13746], local_loss=0.034018535166978836, train_loss=0.017051417380571365, time_cost=3.120349407196045
+
Steps: 1%|▏ | 13746/1000000 [1:53:59<2395:19:49, 8.74s/it, lr=1e-5, step_loss=0.034]
Steps: 1%|▏ | 13747/1000000 [1:54:15<2966:35:12, 10.83s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [13747], local_loss=0.020058318972587585, train_loss=0.04578378051519394, time_cost=7.295003175735474
+
Steps: 1%|▏ | 13747/1000000 [1:54:15<2966:35:12, 10.83s/it, lr=1e-5, step_loss=0.0201]
Steps: 1%|▏ | 13748/1000000 [1:54:28<3158:19:10, 11.53s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [13748], local_loss=0.03189845383167267, train_loss=0.0266916174441576, time_cost=5.374443292617798
+
Steps: 1%|▏ | 13748/1000000 [1:54:28<3158:19:10, 11.53s/it, lr=1e-5, step_loss=0.0319]
Steps: 1%|▏ | 13749/1000000 [1:54:34<2705:46:45, 9.88s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [13749], local_loss=0.013549907132983208, train_loss=0.020133908838033676, time_cost=1.730161190032959
+
Steps: 1%|▏ | 13749/1000000 [1:54:34<2705:46:45, 9.88s/it, lr=1e-5, step_loss=0.0135]
Steps: 1%|▏ | 13750/1000000 [1:54:41<2473:44:05, 9.03s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [13750], local_loss=0.01967829279601574, train_loss=0.06663002818822861, time_cost=2.343904972076416
+
Steps: 1%|▏ | 13750/1000000 [1:54:41<2473:44:05, 9.03s/it, lr=1e-5, step_loss=0.0197]
Steps: 1%|▏ | 13751/1000000 [1:54:52<2653:02:32, 9.68s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [13751], local_loss=0.04740433394908905, train_loss=0.044074125587940216, time_cost=2.606153964996338
+
Steps: 1%|▏ | 13751/1000000 [1:54:52<2653:02:32, 9.68s/it, lr=1e-5, step_loss=0.0474]
Steps: 1%|▏ | 13752/1000000 [1:55:02<2678:25:12, 9.78s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [13752], local_loss=0.03627992421388626, train_loss=0.0558759830892086, time_cost=1.227248191833496
+
Steps: 1%|▏ | 13752/1000000 [1:55:02<2678:25:12, 9.78s/it, lr=1e-5, step_loss=0.0363]
Steps: 1%|▏ | 13753/1000000 [1:55:07<2233:31:34, 8.15s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [13753], local_loss=0.0199672132730484, train_loss=0.05150136351585388, time_cost=1.2467725276947021
+
Steps: 1%|▏ | 13753/1000000 [1:55:07<2233:31:34, 8.15s/it, lr=1e-5, step_loss=0.02]
Steps: 1%|▏ | 13754/1000000 [1:55:21<2734:53:37, 9.98s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [13754], local_loss=0.027242157608270645, train_loss=0.05922781303524971, time_cost=5.722615003585815
+
Steps: 1%|▏ | 13754/1000000 [1:55:21<2734:53:37, 9.98s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%|▏ | 13755/1000000 [1:55:35<3049:58:44, 11.13s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [13755], local_loss=0.022647012025117874, train_loss=0.05003056675195694, time_cost=1.2376437187194824
+
Steps: 1%|▏ | 13755/1000000 [1:55:35<3049:58:44, 11.13s/it, lr=1e-5, step_loss=0.0226]
Steps: 1%|▏ | 13756/1000000 [1:55:42<2754:24:42, 10.05s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [13756], local_loss=0.023785125464200974, train_loss=0.08615999668836594, time_cost=2.556518793106079
+
Steps: 1%|▏ | 13756/1000000 [1:55:42<2754:24:42, 10.05s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%|▏ | 13757/1000000 [1:55:49<2498:47:22, 9.12s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [13757], local_loss=0.009426325559616089, train_loss=0.017838096246123314, time_cost=5.20929741859436
+
Steps: 1%|▏ | 13757/1000000 [1:55:49<2498:47:22, 9.12s/it, lr=1e-5, step_loss=0.00943]
Steps: 1%|▏ | 13758/1000000 [1:56:02<2803:59:20, 10.24s/it, lr=1e-5, step_loss=0.00943][RANK-0]: Step: [13758], local_loss=0.04887557774782181, train_loss=0.07279001176357269, time_cost=6.127429246902466
+
Steps: 1%|▏ | 13758/1000000 [1:56:02<2803:59:20, 10.24s/it, lr=1e-5, step_loss=0.0489]
Steps: 1%|▏ | 13759/1000000 [1:56:08<2449:24:43, 8.94s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [13759], local_loss=0.049442559480667114, train_loss=0.026954293251037598, time_cost=4.60748028755188
+
Steps: 1%|▏ | 13759/1000000 [1:56:08<2449:24:43, 8.94s/it, lr=1e-5, step_loss=0.0494]
Steps: 1%|▏ | 13760/1000000 [1:56:15<2267:27:37, 8.28s/it, lr=1e-5, step_loss=0.0494][RANK-0]: Step: [13760], local_loss=0.05783134698867798, train_loss=0.04905251786112785, time_cost=2.9501078128814697
+
Steps: 1%|▏ | 13760/1000000 [1:56:15<2267:27:37, 8.28s/it, lr=1e-5, step_loss=0.0578]
Steps: 1%|▏ | 13761/1000000 [1:56:20<2019:50:33, 7.37s/it, lr=1e-5, step_loss=0.0578][RANK-0]: Step: [13761], local_loss=0.0584285631775856, train_loss=0.053340282291173935, time_cost=2.4021639823913574
+
Steps: 1%|▏ | 13761/1000000 [1:56:20<2019:50:33, 7.37s/it, lr=1e-5, step_loss=0.0584]
Steps: 1%|▏ | 13762/1000000 [1:56:29<2180:02:50, 7.96s/it, lr=1e-5, step_loss=0.0584][RANK-0]: Step: [13762], local_loss=0.06389649957418442, train_loss=0.041636791080236435, time_cost=3.2257590293884277
+
Steps: 1%|▏ | 13762/1000000 [1:56:29<2180:02:50, 7.96s/it, lr=1e-5, step_loss=0.0639]
Steps: 1%|▏ | 13763/1000000 [1:56:35<2011:16:28, 7.34s/it, lr=1e-5, step_loss=0.0639][RANK-0]: Step: [13763], local_loss=0.03853623941540718, train_loss=0.07785142213106155, time_cost=4.11813497543335
+
Steps: 1%|▏ | 13763/1000000 [1:56:35<2011:16:28, 7.34s/it, lr=1e-5, step_loss=0.0385]
Steps: 1%|▏ | 13764/1000000 [1:56:45<2166:25:40, 7.91s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [13764], local_loss=0.06049160286784172, train_loss=0.036250390112400055, time_cost=2.1750476360321045
+
Steps: 1%|▏ | 13764/1000000 [1:56:45<2166:25:40, 7.91s/it, lr=1e-5, step_loss=0.0605]
Steps: 1%|▏ | 13765/1000000 [1:56:54<2329:01:31, 8.50s/it, lr=1e-5, step_loss=0.0605][RANK-0]: Step: [13765], local_loss=0.09326007217168808, train_loss=0.08064396679401398, time_cost=3.92952823638916
+
Steps: 1%|▏ | 13765/1000000 [1:56:54<2329:01:31, 8.50s/it, lr=1e-5, step_loss=0.0933]
Steps: 1%|▏ | 13766/1000000 [1:57:06<2592:07:44, 9.46s/it, lr=1e-5, step_loss=0.0933][RANK-0]: Step: [13766], local_loss=0.01035093143582344, train_loss=0.030351150780916214, time_cost=2.1537163257598877
+
Steps: 1%|▏ | 13766/1000000 [1:57:06<2592:07:44, 9.46s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%|▏ | 13767/1000000 [1:57:17<2741:38:05, 10.01s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [13767], local_loss=0.029198303818702698, train_loss=0.018988903611898422, time_cost=4.517977237701416
+
Steps: 1%|▏ | 13767/1000000 [1:57:17<2741:38:05, 10.01s/it, lr=1e-5, step_loss=0.0292]
Steps: 1%|▏ | 13768/1000000 [1:57:24<2498:12:20, 9.12s/it, lr=1e-5, step_loss=0.0292][RANK-0]: Step: [13768], local_loss=0.01396426185965538, train_loss=0.03870946913957596, time_cost=1.2637689113616943
+
Steps: 1%|▏ | 13768/1000000 [1:57:24<2498:12:20, 9.12s/it, lr=1e-5, step_loss=0.014]
Steps: 1%|▏ | 13769/1000000 [1:57:39<2983:49:55, 10.89s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [13769], local_loss=0.03754132241010666, train_loss=0.03535902500152588, time_cost=1.2764294147491455
+
Steps: 1%|▏ | 13769/1000000 [1:57:39<2983:49:55, 10.89s/it, lr=1e-5, step_loss=0.0375]
Steps: 1%|▏ | 13770/1000000 [1:57:45<2505:39:02, 9.15s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [13770], local_loss=0.02807035483419895, train_loss=0.019101910293102264, time_cost=1.206165075302124
+
Steps: 1%|▏ | 13770/1000000 [1:57:45<2505:39:02, 9.15s/it, lr=1e-5, step_loss=0.0281]
Steps: 1%|▏ | 13771/1000000 [1:57:53<2483:44:32, 9.07s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [13771], local_loss=0.013347203843295574, train_loss=0.02857615239918232, time_cost=1.236379623413086
+
Steps: 1%|▏ | 13771/1000000 [1:57:53<2483:44:32, 9.07s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%|▏ | 13772/1000000 [1:58:04<2601:27:29, 9.50s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [13772], local_loss=0.012800326570868492, train_loss=0.02341354638338089, time_cost=7.55884575843811
+
Steps: 1%|▏ | 13772/1000000 [1:58:04<2601:27:29, 9.50s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%|▏ | 13773/1000000 [1:58:09<2271:39:06, 8.29s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [13773], local_loss=0.026254579424858093, train_loss=0.021691977977752686, time_cost=2.747889995574951
+
Steps: 1%|▏ | 13773/1000000 [1:58:09<2271:39:06, 8.29s/it, lr=1e-5, step_loss=0.0263]
Steps: 1%|▏ | 13774/1000000 [1:58:21<2535:02:00, 9.25s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [13774], local_loss=0.02357030101120472, train_loss=0.0315299890935421, time_cost=1.2170884609222412
+
Steps: 1%|▏ | 13774/1000000 [1:58:21<2535:02:00, 9.25s/it, lr=1e-5, step_loss=0.0236]
Steps: 1%|▏ | 13775/1000000 [1:58:26<2227:44:08, 8.13s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [13775], local_loss=0.12535220384597778, train_loss=0.038658320903778076, time_cost=2.2896475791931152
+
Steps: 1%|▏ | 13775/1000000 [1:58:26<2227:44:08, 8.13s/it, lr=1e-5, step_loss=0.125]
Steps: 1%|▏ | 13776/1000000 [1:58:39<2634:34:36, 9.62s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [13776], local_loss=0.04370423033833504, train_loss=0.14299467206001282, time_cost=1.222435712814331
+
Steps: 1%|▏ | 13776/1000000 [1:58:39<2634:34:36, 9.62s/it, lr=1e-5, step_loss=0.0437]
Steps: 1%|▏ | 13777/1000000 [1:58:47<2460:25:52, 8.98s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [13777], local_loss=0.012253125198185444, train_loss=0.14283335208892822, time_cost=2.499677896499634
+
Steps: 1%|▏ | 13777/1000000 [1:58:47<2460:25:52, 8.98s/it, lr=1e-5, step_loss=0.0123]
Steps: 1%|▏ | 13778/1000000 [1:58:54<2300:49:34, 8.40s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [13778], local_loss=0.02018783986568451, train_loss=0.057089388370513916, time_cost=2.4332189559936523
+
Steps: 1%|▏ | 13778/1000000 [1:58:54<2300:49:34, 8.40s/it, lr=1e-5, step_loss=0.0202]
Steps: 1%|▏ | 13779/1000000 [1:58:59<2048:37:05, 7.48s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [13779], local_loss=0.008456987328827381, train_loss=0.022286906838417053, time_cost=4.247040510177612
+
Steps: 1%|▏ | 13779/1000000 [1:58:59<2048:37:05, 7.48s/it, lr=1e-5, step_loss=0.00846]
Steps: 1%|▏ | 13780/1000000 [1:59:10<2337:06:51, 8.53s/it, lr=1e-5, step_loss=0.00846][RANK-0]: Step: [13780], local_loss=0.07376515120267868, train_loss=0.1262854039669037, time_cost=2.3442838191986084
+
Steps: 1%|▏ | 13780/1000000 [1:59:10<2337:06:51, 8.53s/it, lr=1e-5, step_loss=0.0738]
Steps: 1%|▏ | 13781/1000000 [1:59:21<2522:21:57, 9.21s/it, lr=1e-5, step_loss=0.0738][RANK-0]: Step: [13781], local_loss=0.024893922731280327, train_loss=0.013536830432713032, time_cost=2.990837812423706
+
Steps: 1%|▏ | 13781/1000000 [1:59:21<2522:21:57, 9.21s/it, lr=1e-5, step_loss=0.0249]
Steps: 1%|▏ | 13782/1000000 [1:59:26<2186:35:48, 7.98s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [13782], local_loss=0.007212746888399124, train_loss=0.03156011551618576, time_cost=2.0999550819396973
+
Steps: 1%|▏ | 13782/1000000 [1:59:26<2186:35:48, 7.98s/it, lr=1e-5, step_loss=0.00721]
Steps: 1%|▏ | 13783/1000000 [1:59:41<2767:11:12, 10.10s/it, lr=1e-5, step_loss=0.00721][RANK-0]: Step: [13783], local_loss=0.029612435027956963, train_loss=0.037726521492004395, time_cost=10.998513460159302
+
Steps: 1%|▏ | 13783/1000000 [1:59:41<2767:11:12, 10.10s/it, lr=1e-5, step_loss=0.0296]
Steps: 1%|▏ | 13784/1000000 [1:59:54<3007:42:14, 10.98s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [13784], local_loss=0.016788361594080925, train_loss=7.800343990325928, time_cost=3.2731261253356934
+
Steps: 1%|▏ | 13784/1000000 [1:59:54<3007:42:14, 10.98s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%|▏ | 13785/1000000 [2:00:10<3407:15:21, 12.44s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [13785], local_loss=0.01148716639727354, train_loss=0.019653234630823135, time_cost=1.2507407665252686
+
Steps: 1%|▏ | 13785/1000000 [2:00:10<3407:15:21, 12.44s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%|▏ | 13786/1000000 [2:00:18<2996:51:12, 10.94s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [13786], local_loss=0.017197290435433388, train_loss=0.024600911885499954, time_cost=1.2941608428955078
+
Steps: 1%|▏ | 13786/1000000 [2:00:18<2996:51:12, 10.94s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%|▏ | 13787/1000000 [2:00:29<3006:21:41, 10.97s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [13787], local_loss=0.17986111342906952, train_loss=0.11112084984779358, time_cost=8.463845252990723
+
Steps: 1%|▏ | 13787/1000000 [2:00:29<3006:21:41, 10.97s/it, lr=1e-5, step_loss=0.18]
Steps: 1%|▏ | 13788/1000000 [2:00:38<2877:52:19, 10.51s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [13788], local_loss=0.00830318033695221, train_loss=0.022066015750169754, time_cost=2.6133875846862793
+
Steps: 1%|▏ | 13788/1000000 [2:00:38<2877:52:19, 10.51s/it, lr=1e-5, step_loss=0.0083]
Steps: 1%|▏ | 13789/1000000 [2:00:44<2509:37:25, 9.16s/it, lr=1e-5, step_loss=0.0083][RANK-0]: Step: [13789], local_loss=0.055674947798252106, train_loss=0.027808986604213715, time_cost=1.7779419422149658
+
Steps: 1%|▏ | 13789/1000000 [2:00:44<2509:37:25, 9.16s/it, lr=1e-5, step_loss=0.0557]
Steps: 1%|▏ | 13790/1000000 [2:00:51<2309:50:02, 8.43s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [13790], local_loss=0.32173386216163635, train_loss=0.07918243855237961, time_cost=2.795339345932007
+
Steps: 1%|▏ | 13790/1000000 [2:00:51<2309:50:02, 8.43s/it, lr=1e-5, step_loss=0.322]
Steps: 1%|▏ | 13791/1000000 [2:01:05<2762:33:40, 10.08s/it, lr=1e-5, step_loss=0.322][RANK-0]: Step: [13791], local_loss=0.03501340001821518, train_loss=0.031049787998199463, time_cost=5.2560553550720215
+
Steps: 1%|▏ | 13791/1000000 [2:01:05<2762:33:40, 10.08s/it, lr=1e-5, step_loss=0.035]
Steps: 1%|▏ | 13792/1000000 [2:01:18<3018:23:38, 11.02s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [13792], local_loss=0.015331710688769817, train_loss=0.044368259608745575, time_cost=1.3792614936828613
+
Steps: 1%|▏ | 13792/1000000 [2:01:18<3018:23:38, 11.02s/it, lr=1e-5, step_loss=0.0153]
Steps: 1%|▏ | 13793/1000000 [2:01:32<3244:52:13, 11.84s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [13793], local_loss=0.11916784197092056, train_loss=0.03222552686929703, time_cost=4.08215069770813
+
Steps: 1%|▏ | 13793/1000000 [2:01:32<3244:52:13, 11.84s/it, lr=1e-5, step_loss=0.119]
Steps: 1%|▏ | 13794/1000000 [2:01:36<2659:14:54, 9.71s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [13794], local_loss=0.008388273417949677, train_loss=0.01681290566921234, time_cost=2.1446852684020996
+
Steps: 1%|▏ | 13794/1000000 [2:01:36<2659:14:54, 9.71s/it, lr=1e-5, step_loss=0.00839]
Steps: 1%|▏ | 13795/1000000 [2:01:44<2444:44:32, 8.92s/it, lr=1e-5, step_loss=0.00839][RANK-0]: Step: [13795], local_loss=0.08678115159273148, train_loss=0.1725326031446457, time_cost=2.659148693084717
+
Steps: 1%|▏ | 13795/1000000 [2:01:44<2444:44:32, 8.92s/it, lr=1e-5, step_loss=0.0868]
Steps: 1%|▏ | 13796/1000000 [2:01:57<2856:49:28, 10.43s/it, lr=1e-5, step_loss=0.0868][RANK-0]: Step: [13796], local_loss=0.006482580676674843, train_loss=0.038838762789964676, time_cost=5.441224813461304
+
Steps: 1%|▏ | 13796/1000000 [2:01:57<2856:49:28, 10.43s/it, lr=1e-5, step_loss=0.00648]
Steps: 1%|▏ | 13797/1000000 [2:02:08<2888:48:50, 10.55s/it, lr=1e-5, step_loss=0.00648][RANK-0]: Step: [13797], local_loss=0.011343698017299175, train_loss=0.1847434639930725, time_cost=1.6508674621582031
+
Steps: 1%|▏ | 13797/1000000 [2:02:08<2888:48:50, 10.55s/it, lr=1e-5, step_loss=0.0113]
Steps: 1%|▏ | 13798/1000000 [2:02:18<2818:38:09, 10.29s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [13798], local_loss=0.007663396652787924, train_loss=0.020789382979273796, time_cost=1.5514578819274902
+
Steps: 1%|▏ | 13798/1000000 [2:02:18<2818:38:09, 10.29s/it, lr=1e-5, step_loss=0.00766]
Steps: 1%|▏ | 13799/1000000 [2:02:24<2494:00:43, 9.10s/it, lr=1e-5, step_loss=0.00766][RANK-0]: Step: [13799], local_loss=0.020045896992087364, train_loss=0.025070423260331154, time_cost=1.644887924194336
+
Steps: 1%|▏ | 13799/1000000 [2:02:24<2494:00:43, 9.10s/it, lr=1e-5, step_loss=0.02]
Steps: 1%|▏ | 13800/1000000 [2:02:29<2151:48:42, 7.85s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [13800], local_loss=0.02968878298997879, train_loss=0.16264888644218445, time_cost=1.807671070098877
+
Steps: 1%|▏ | 13800/1000000 [2:02:29<2151:48:42, 7.85s/it, lr=1e-5, step_loss=0.0297]
Steps: 1%|▏ | 13801/1000000 [2:02:43<2666:57:01, 9.74s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [13801], local_loss=0.008464176207780838, train_loss=0.04822885990142822, time_cost=4.174352407455444
+
Steps: 1%|▏ | 13801/1000000 [2:02:43<2666:57:01, 9.74s/it, lr=1e-5, step_loss=0.00846]
Steps: 1%|▏ | 13802/1000000 [2:02:51<2497:30:11, 9.12s/it, lr=1e-5, step_loss=0.00846][RANK-0]: Step: [13802], local_loss=0.009772680699825287, train_loss=0.07192634046077728, time_cost=5.893240690231323
+
Steps: 1%|▏ | 13802/1000000 [2:02:51<2497:30:11, 9.12s/it, lr=1e-5, step_loss=0.00977]
Steps: 1%|▏ | 13803/1000000 [2:02:59<2427:36:53, 8.86s/it, lr=1e-5, step_loss=0.00977][RANK-0]: Step: [13803], local_loss=0.1644420027732849, train_loss=0.06045621633529663, time_cost=3.775536298751831
+
Steps: 1%|▏ | 13803/1000000 [2:02:59<2427:36:53, 8.86s/it, lr=1e-5, step_loss=0.164]
Steps: 1%|▏ | 13804/1000000 [2:03:05<2179:49:45, 7.96s/it, lr=1e-5, step_loss=0.164][RANK-0]: Step: [13804], local_loss=0.2920246422290802, train_loss=0.07436949014663696, time_cost=1.5408987998962402
+
Steps: 1%|▏ | 13804/1000000 [2:03:05<2179:49:45, 7.96s/it, lr=1e-5, step_loss=0.292]
Steps: 1%|▏ | 13805/1000000 [2:03:17<2516:06:48, 9.18s/it, lr=1e-5, step_loss=0.292][RANK-0]: Step: [13805], local_loss=0.9947744011878967, train_loss=0.16220705211162567, time_cost=3.897627830505371
+
Steps: 1%|▏ | 13805/1000000 [2:03:17<2516:06:48, 9.18s/it, lr=1e-5, step_loss=0.995]
Steps: 1%|▏ | 13806/1000000 [2:03:22<2142:07:45, 7.82s/it, lr=1e-5, step_loss=0.995][RANK-0]: Step: [13806], local_loss=0.023390837013721466, train_loss=0.04057478532195091, time_cost=1.4600343704223633
+
Steps: 1%|▏ | 13806/1000000 [2:03:22<2142:07:45, 7.82s/it, lr=1e-5, step_loss=0.0234]
Steps: 1%|▏ | 13807/1000000 [2:03:31<2265:18:58, 8.27s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [13807], local_loss=0.02294100821018219, train_loss=0.032400280237197876, time_cost=3.229797124862671
+
Steps: 1%|▏ | 13807/1000000 [2:03:31<2265:18:58, 8.27s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%|▏ | 13808/1000000 [2:03:36<1965:24:06, 7.17s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [13808], local_loss=0.026901111006736755, train_loss=0.05834632366895676, time_cost=3.6748411655426025
+
Steps: 1%|▏ | 13808/1000000 [2:03:36<1965:24:06, 7.17s/it, lr=1e-5, step_loss=0.0269]
Steps: 1%|▏ | 13809/1000000 [2:03:43<1949:37:06, 7.12s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [13809], local_loss=0.011293169111013412, train_loss=0.05990799143910408, time_cost=2.5258865356445312
+
Steps: 1%|▏ | 13809/1000000 [2:03:43<1949:37:06, 7.12s/it, lr=1e-5, step_loss=0.0113]
Steps: 1%|▏ | 13810/1000000 [2:03:47<1722:09:44, 6.29s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [13810], local_loss=0.11841903626918793, train_loss=0.16979852318763733, time_cost=1.8974909782409668
+
Steps: 1%|▏ | 13810/1000000 [2:03:47<1722:09:44, 6.29s/it, lr=1e-5, step_loss=0.118]
Steps: 1%|▏ | 13811/1000000 [2:03:53<1695:44:09, 6.19s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [13811], local_loss=0.010000551119446754, train_loss=0.05137188732624054, time_cost=1.3697521686553955
+
Steps: 1%|▏ | 13811/1000000 [2:03:53<1695:44:09, 6.19s/it, lr=1e-5, step_loss=0.01]
Steps: 1%|▏ | 13812/1000000 [2:04:05<2138:59:34, 7.81s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [13812], local_loss=0.008556229062378407, train_loss=0.07726245373487473, time_cost=4.496408224105835
+
Steps: 1%|▏ | 13812/1000000 [2:04:05<2138:59:34, 7.81s/it, lr=1e-5, step_loss=0.00856]
Steps: 1%|▏ | 13813/1000000 [2:04:09<1846:34:25, 6.74s/it, lr=1e-5, step_loss=0.00856][RANK-0]: Step: [13813], local_loss=0.013431882485747337, train_loss=0.03226893022656441, time_cost=1.3919029235839844
+
Steps: 1%|▏ | 13813/1000000 [2:04:09<1846:34:25, 6.74s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%|▏ | 13814/1000000 [2:04:14<1697:48:01, 6.20s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [13814], local_loss=1.0087385177612305, train_loss=0.15109333395957947, time_cost=2.293173313140869
+
Steps: 1%|▏ | 13814/1000000 [2:04:14<1697:48:01, 6.20s/it, lr=1e-5, step_loss=1.01]
Steps: 1%|▏ | 13815/1000000 [2:04:25<2071:00:07, 7.56s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [13815], local_loss=0.043529145419597626, train_loss=0.0210406631231308, time_cost=2.887620449066162
+
Steps: 1%|▏ | 13815/1000000 [2:04:25<2071:00:07, 7.56s/it, lr=1e-5, step_loss=0.0435]
Steps: 1%|▏ | 13816/1000000 [2:04:32<2069:39:22, 7.56s/it, lr=1e-5, step_loss=0.0435][RANK-0]: Step: [13816], local_loss=0.010192747227847576, train_loss=19.08844566345215, time_cost=2.6159071922302246
+
Steps: 1%|▏ | 13816/1000000 [2:04:32<2069:39:22, 7.56s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%|▏ | 13817/1000000 [2:04:38<1932:46:21, 7.06s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [13817], local_loss=0.011801824904978275, train_loss=0.038722194731235504, time_cost=4.39163064956665
+
Steps: 1%|▏ | 13817/1000000 [2:04:38<1932:46:21, 7.06s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 13818/1000000 [2:04:46<1982:30:48, 7.24s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [13818], local_loss=0.019985077902674675, train_loss=0.11274802684783936, time_cost=3.3976330757141113
+
Steps: 1%|▏ | 13818/1000000 [2:04:46<1982:30:48, 7.24s/it, lr=1e-5, step_loss=0.02]
Steps: 1%|▏ | 13819/1000000 [2:04:56<2221:36:14, 8.11s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [13819], local_loss=0.0066056689247488976, train_loss=0.07271447777748108, time_cost=1.3243021965026855
+
Steps: 1%|▏ | 13819/1000000 [2:04:56<2221:36:14, 8.11s/it, lr=1e-5, step_loss=0.00661]
Steps: 1%|▏ | 13820/1000000 [2:05:02<2074:56:01, 7.57s/it, lr=1e-5, step_loss=0.00661][RANK-0]: Step: [13820], local_loss=0.026438988745212555, train_loss=0.017764899879693985, time_cost=1.7676663398742676
+
Steps: 1%|▏ | 13820/1000000 [2:05:02<2074:56:01, 7.57s/it, lr=1e-5, step_loss=0.0264]
Steps: 1%|▏ | 13821/1000000 [2:05:08<1892:20:26, 6.91s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [13821], local_loss=0.07752404361963272, train_loss=0.03817079961299896, time_cost=3.049163818359375
+
Steps: 1%|▏ | 13821/1000000 [2:05:08<1892:20:26, 6.91s/it, lr=1e-5, step_loss=0.0775]
Steps: 1%|▏ | 13822/1000000 [2:05:12<1684:10:08, 6.15s/it, lr=1e-5, step_loss=0.0775][RANK-0]: Step: [13822], local_loss=0.05661178007721901, train_loss=0.04511518403887749, time_cost=1.3764605522155762
+
Steps: 1%|▏ | 13822/1000000 [2:05:12<1684:10:08, 6.15s/it, lr=1e-5, step_loss=0.0566]
Steps: 1%|▏ | 13823/1000000 [2:05:26<2323:47:03, 8.48s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [13823], local_loss=0.0939435213804245, train_loss=0.10303159058094025, time_cost=4.0408806800842285
+
Steps: 1%|▏ | 13823/1000000 [2:05:26<2323:47:03, 8.48s/it, lr=1e-5, step_loss=0.0939]
Steps: 1%|▏ | 13824/1000000 [2:05:32<2115:45:40, 7.72s/it, lr=1e-5, step_loss=0.0939][RANK-0]: Step: [13824], local_loss=0.03683236986398697, train_loss=0.0561857670545578, time_cost=1.4516165256500244
+
Steps: 1%|▏ | 13824/1000000 [2:05:32<2115:45:40, 7.72s/it, lr=1e-5, step_loss=0.0368]
Steps: 1%|▏ | 13825/1000000 [2:05:37<1919:20:16, 7.01s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [13825], local_loss=0.16109347343444824, train_loss=0.054147593677043915, time_cost=1.478870153427124
+
Steps: 1%|▏ | 13825/1000000 [2:05:37<1919:20:16, 7.01s/it, lr=1e-5, step_loss=0.161]
Steps: 1%|▏ | 13826/1000000 [2:05:50<2383:08:35, 8.70s/it, lr=1e-5, step_loss=0.161][RANK-0]: Step: [13826], local_loss=0.020477110520005226, train_loss=0.03605804219841957, time_cost=1.461850881576538
+
Steps: 1%|▏ | 13826/1000000 [2:05:50<2383:08:35, 8.70s/it, lr=1e-5, step_loss=0.0205]
Steps: 1%|▏ | 13827/1000000 [2:05:56<2159:27:27, 7.88s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [13827], local_loss=0.017959659919142723, train_loss=0.025373198091983795, time_cost=4.104544162750244
+
Steps: 1%|▏ | 13827/1000000 [2:05:56<2159:27:27, 7.88s/it, lr=1e-5, step_loss=0.018]
Steps: 1%|▏ | 13828/1000000 [2:06:02<2000:35:56, 7.30s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [13828], local_loss=0.12320956587791443, train_loss=0.0887274444103241, time_cost=1.7873198986053467
+
Steps: 1%|▏ | 13828/1000000 [2:06:02<2000:35:56, 7.30s/it, lr=1e-5, step_loss=0.123]
Steps: 1%|▏ | 13829/1000000 [2:06:16<2553:36:38, 9.32s/it, lr=1e-5, step_loss=0.123][RANK-0]: Step: [13829], local_loss=0.03654790297150612, train_loss=0.036825746297836304, time_cost=1.7449991703033447
+
Steps: 1%|▏ | 13829/1000000 [2:06:16<2553:36:38, 9.32s/it, lr=1e-5, step_loss=0.0365]
Steps: 1%|▏ | 13830/1000000 [2:06:27<2735:26:13, 9.99s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [13830], local_loss=0.008188759908080101, train_loss=0.023237798362970352, time_cost=3.0570664405822754
+
Steps: 1%|▏ | 13830/1000000 [2:06:27<2735:26:13, 9.99s/it, lr=1e-5, step_loss=0.00819]
Steps: 1%|▏ | 13831/1000000 [2:06:33<2423:31:42, 8.85s/it, lr=1e-5, step_loss=0.00819][RANK-0]: Step: [13831], local_loss=0.018800335004925728, train_loss=0.02115127444267273, time_cost=2.3377745151519775
+
Steps: 1%|▏ | 13831/1000000 [2:06:33<2423:31:42, 8.85s/it, lr=1e-5, step_loss=0.0188]
Steps: 1%|▏ | 13832/1000000 [2:06:39<2159:03:32, 7.88s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [13832], local_loss=0.04178622364997864, train_loss=0.05622434616088867, time_cost=2.2203822135925293
+
Steps: 1%|▏ | 13832/1000000 [2:06:39<2159:03:32, 7.88s/it, lr=1e-5, step_loss=0.0418]
Steps: 1%|▏ | 13833/1000000 [2:06:47<2129:19:13, 7.77s/it, lr=1e-5, step_loss=0.0418][RANK-0]: Step: [13833], local_loss=0.01181013323366642, train_loss=0.07340825349092484, time_cost=1.7070865631103516
+
Steps: 1%|▏ | 13833/1000000 [2:06:47<2129:19:13, 7.77s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 13834/1000000 [2:06:52<1900:11:49, 6.94s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [13834], local_loss=0.0068417564034461975, train_loss=0.04117502272129059, time_cost=1.9736263751983643
+
Steps: 1%|▏ | 13834/1000000 [2:06:52<1900:11:49, 6.94s/it, lr=1e-5, step_loss=0.00684]
Steps: 1%|▏ | 13835/1000000 [2:06:56<1678:36:25, 6.13s/it, lr=1e-5, step_loss=0.00684][RANK-0]: Step: [13835], local_loss=0.01665802113711834, train_loss=0.03885398060083389, time_cost=1.3670740127563477
+
Steps: 1%|▏ | 13835/1000000 [2:06:56<1678:36:25, 6.13s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%|▏ | 13836/1000000 [2:07:04<1810:19:23, 6.61s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [13836], local_loss=0.022335203364491463, train_loss=0.032632797956466675, time_cost=1.3064801692962646
+
Steps: 1%|▏ | 13836/1000000 [2:07:04<1810:19:23, 6.61s/it, lr=1e-5, step_loss=0.0223]
Steps: 1%|▏ | 13837/1000000 [2:07:13<2074:33:36, 7.57s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [13837], local_loss=109.45993041992188, train_loss=13.734854698181152, time_cost=1.2364747524261475
+
Steps: 1%|▏ | 13837/1000000 [2:07:13<2074:33:36, 7.57s/it, lr=1e-5, step_loss=109]
Steps: 1%|▏ | 13838/1000000 [2:07:22<2171:20:35, 7.93s/it, lr=1e-5, step_loss=109][RANK-0]: Step: [13838], local_loss=0.12617036700248718, train_loss=0.03782140463590622, time_cost=2.6331090927124023
+
Steps: 1%|▏ | 13838/1000000 [2:07:22<2171:20:35, 7.93s/it, lr=1e-5, step_loss=0.126]
Steps: 1%|▏ | 13839/1000000 [2:07:27<1885:31:52, 6.88s/it, lr=1e-5, step_loss=0.126][RANK-0]: Step: [13839], local_loss=0.03739407658576965, train_loss=0.05616656318306923, time_cost=1.6136260032653809
+
Steps: 1%|▏ | 13839/1000000 [2:07:27<1885:31:52, 6.88s/it, lr=1e-5, step_loss=0.0374]
Steps: 1%|▏ | 13840/1000000 [2:07:34<1964:02:14, 7.17s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [13840], local_loss=0.052571430802345276, train_loss=0.05845554918050766, time_cost=6.991995334625244
+
Steps: 1%|▏ | 13840/1000000 [2:07:34<1964:02:14, 7.17s/it, lr=1e-5, step_loss=0.0526]
Steps: 1%|▏ | 13841/1000000 [2:07:47<2403:52:11, 8.78s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [13841], local_loss=0.028045887127518654, train_loss=0.08888570219278336, time_cost=3.1274781227111816
+
Steps: 1%|▏ | 13841/1000000 [2:07:47<2403:52:11, 8.78s/it, lr=1e-5, step_loss=0.028]
Steps: 1%|▏ | 13842/1000000 [2:07:57<2527:42:09, 9.23s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [13842], local_loss=1.0001674890518188, train_loss=0.14846689999103546, time_cost=4.670159339904785
+
Steps: 1%|▏ | 13842/1000000 [2:07:57<2527:42:09, 9.23s/it, lr=1e-5, step_loss=1]
Steps: 1%|▏ | 13843/1000000 [2:08:02<2143:33:39, 7.83s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [13843], local_loss=0.016554085537791252, train_loss=0.04416333884000778, time_cost=1.7419970035552979
+
Steps: 1%|▏ | 13843/1000000 [2:08:02<2143:33:39, 7.83s/it, lr=1e-5, step_loss=0.0166]
Steps: 1%|▏ | 13844/1000000 [2:08:07<1925:23:54, 7.03s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [13844], local_loss=0.019228626042604446, train_loss=0.041390560567379, time_cost=1.208695888519287
+
Steps: 1%|▏ | 13844/1000000 [2:08:07<1925:23:54, 7.03s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%|▏ | 13845/1000000 [2:08:14<1924:57:48, 7.03s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [13845], local_loss=0.0801176056265831, train_loss=0.037220750004053116, time_cost=3.1437814235687256
+
Steps: 1%|▏ | 13845/1000000 [2:08:14<1924:57:48, 7.03s/it, lr=1e-5, step_loss=0.0801]
Steps: 1%|▏ | 13846/1000000 [2:08:24<2162:31:17, 7.89s/it, lr=1e-5, step_loss=0.0801][RANK-0]: Step: [13846], local_loss=0.0051470231264829636, train_loss=0.09390126168727875, time_cost=3.406994342803955
+
Steps: 1%|▏ | 13846/1000000 [2:08:24<2162:31:17, 7.89s/it, lr=1e-5, step_loss=0.00515]
Steps: 1%|▏ | 13847/1000000 [2:08:29<1927:32:39, 7.04s/it, lr=1e-5, step_loss=0.00515][RANK-0]: Step: [13847], local_loss=0.05416449159383774, train_loss=0.18442606925964355, time_cost=3.805711030960083
+
Steps: 1%|▏ | 13847/1000000 [2:08:29<1927:32:39, 7.04s/it, lr=1e-5, step_loss=0.0542]
Steps: 1%|▏ | 13848/1000000 [2:08:37<1980:40:25, 7.23s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [13848], local_loss=0.026795798912644386, train_loss=0.09426717460155487, time_cost=2.438530683517456
+
Steps: 1%|▏ | 13848/1000000 [2:08:37<1980:40:25, 7.23s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%|▏ | 13849/1000000 [2:08:45<2079:28:44, 7.59s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [13849], local_loss=0.01784420572221279, train_loss=0.03766180947422981, time_cost=2.072024345397949
+
Steps: 1%|▏ | 13849/1000000 [2:08:45<2079:28:44, 7.59s/it, lr=1e-5, step_loss=0.0178]
Steps: 1%|▏ | 13850/1000000 [2:09:00<2671:25:48, 9.75s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [13850], local_loss=0.0198868028819561, train_loss=0.043455325067043304, time_cost=4.988530397415161
+
Steps: 1%|▏ | 13850/1000000 [2:09:00<2671:25:48, 9.75s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%|▏ | 13851/1000000 [2:09:07<2462:51:26, 8.99s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [13851], local_loss=0.0322418287396431, train_loss=0.020449725911021233, time_cost=5.319761753082275
+
Steps: 1%|▏ | 13851/1000000 [2:09:07<2462:51:26, 8.99s/it, lr=1e-5, step_loss=0.0322]
Steps: 1%|▏ | 13852/1000000 [2:09:12<2134:54:58, 7.79s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [13852], local_loss=0.05308991298079491, train_loss=0.057096514850854874, time_cost=4.14263653755188
+
Steps: 1%|▏ | 13852/1000000 [2:09:12<2134:54:58, 7.79s/it, lr=1e-5, step_loss=0.0531]
Steps: 1%|▏ | 13853/1000000 [2:09:19<2061:24:34, 7.53s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [13853], local_loss=0.018268758431077003, train_loss=0.020015774294734, time_cost=1.2313225269317627
+
Steps: 1%|▏ | 13853/1000000 [2:09:19<2061:24:34, 7.53s/it, lr=1e-5, step_loss=0.0183]
Steps: 1%|▏ | 13854/1000000 [2:09:24<1834:59:35, 6.70s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [13854], local_loss=0.05821481719613075, train_loss=0.021261397749185562, time_cost=1.5390491485595703
+
Steps: 1%|▏ | 13854/1000000 [2:09:24<1834:59:35, 6.70s/it, lr=1e-5, step_loss=0.0582]
Steps: 1%|▏ | 13855/1000000 [2:09:32<1941:13:45, 7.09s/it, lr=1e-5, step_loss=0.0582][RANK-0]: Step: [13855], local_loss=0.02266460470855236, train_loss=0.05170807987451553, time_cost=2.0036540031433105
+
Steps: 1%|▏ | 13855/1000000 [2:09:32<1941:13:45, 7.09s/it, lr=1e-5, step_loss=0.0227]
Steps: 1%|▏ | 13856/1000000 [2:09:36<1727:40:11, 6.31s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [13856], local_loss=0.010957816615700722, train_loss=0.06896577030420303, time_cost=1.2279486656188965
Steps: 1%|▏ | 13857/1000000 [2:09:47<2105:49:17, 7.69s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [13857], local_loss=0.024071821942925453, train_loss=0.041665732860565186, time_cost=7.711802959442139
Steps: 1%|▏ | 13858/1000000 [2:09:52<1904:58:33, 6.95s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [13858], local_loss=0.08740631490945816, train_loss=0.040989577770233154, time_cost=1.286937952041626
Steps: 1%|▏ | 13859/1000000 [2:10:02<2093:53:24, 7.64s/it, lr=1e-5, step_loss=0.0874][RANK-0]: Step: [13859], local_loss=0.015213481150567532, train_loss=0.03914865478873253, time_cost=1.2692253589630127
Steps: 1%|▏ | 13860/1000000 [2:10:11<2225:54:17, 8.13s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [13860], local_loss=0.025295868515968323, train_loss=0.16131621599197388, time_cost=1.2334403991699219
Steps: 1%|▏ | 13861/1000000 [2:10:26<2798:54:55, 10.22s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [13861], local_loss=0.03297623619437218, train_loss=0.02758730761706829, time_cost=6.726881265640259
Steps: 1%|▏ | 13862/1000000 [2:10:30<2301:58:16, 8.40s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [13862], local_loss=0.012415433302521706, train_loss=0.019963882863521576, time_cost=1.207275390625
Steps: 1%|▏ | 13863/1000000 [2:10:38<2236:53:35, 8.17s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [13863], local_loss=0.2448018193244934, train_loss=0.07514717429876328, time_cost=1.6914172172546387
Steps: 1%|▏ | 13864/1000000 [2:10:43<1994:03:59, 7.28s/it, lr=1e-5, step_loss=0.245][RANK-0]: Step: [13864], local_loss=0.015314222313463688, train_loss=0.0265482347458601, time_cost=1.3166608810424805
Steps: 1%|▏ | 13865/1000000 [2:10:54<2314:23:36, 8.45s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [13865], local_loss=0.18777614831924438, train_loss=0.07124517858028412, time_cost=2.12221360206604
Steps: 1%|▏ | 13866/1000000 [2:11:00<2080:28:25, 7.60s/it, lr=1e-5, step_loss=0.188][RANK-0]: Step: [13866], local_loss=0.016927897930145264, train_loss=0.14038041234016418, time_cost=1.3847870826721191
Steps: 1%|▏ | 13867/1000000 [2:11:09<2206:51:26, 8.06s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [13867], local_loss=0.09148107469081879, train_loss=0.033564403653144836, time_cost=1.9432055950164795
Steps: 1%|▏ | 13868/1000000 [2:11:17<2190:58:20, 8.00s/it, lr=1e-5, step_loss=0.0915][RANK-0]: Step: [13868], local_loss=0.02790810540318489, train_loss=0.07836422324180603, time_cost=1.756593942642212
Steps: 1%|▏ | 13869/1000000 [2:11:24<2114:13:22, 7.72s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [13869], local_loss=0.015135164372622967, train_loss=0.07367558032274246, time_cost=3.159792423248291
Steps: 1%|▏ | 13870/1000000 [2:11:31<2043:36:01, 7.46s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [13870], local_loss=0.0602445974946022, train_loss=0.07646411657333374, time_cost=2.1619813442230225
Steps: 1%|▏ | 13871/1000000 [2:11:44<2537:51:35, 9.26s/it, lr=1e-5, step_loss=0.0602][RANK-0]: Step: [13871], local_loss=0.07982586324214935, train_loss=0.06614190340042114, time_cost=2.5119211673736572
Steps: 1%|▏ | 13872/1000000 [2:11:59<2959:54:11, 10.81s/it, lr=1e-5, step_loss=0.0798][RANK-0]: Step: [13872], local_loss=0.3292354345321655, train_loss=0.07624556124210358, time_cost=5.180332183837891
Steps: 1%|▏ | 13873/1000000 [2:12:12<3194:58:46, 11.66s/it, lr=1e-5, step_loss=0.329][RANK-0]: Step: [13873], local_loss=0.02547931857407093, train_loss=0.3057534694671631, time_cost=4.616315126419067
Steps: 1%|▏ | 13874/1000000 [2:12:17<2598:41:58, 9.49s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [13874], local_loss=0.07866640388965607, train_loss=0.03896919637918472, time_cost=3.4342994689941406
Steps: 1%|▏ | 13875/1000000 [2:12:31<2981:39:46, 10.89s/it, lr=1e-5, step_loss=0.0787][RANK-0]: Step: [13875], local_loss=0.282065212726593, train_loss=0.048223309218883514, time_cost=5.817871570587158
Steps: 1%|▏ | 13876/1000000 [2:12:36<2509:27:06, 9.16s/it, lr=1e-5, step_loss=0.282][RANK-0]: Step: [13876], local_loss=0.017664402723312378, train_loss=0.02323583886027336, time_cost=2.2671959400177
Steps: 1%|▏ | 13877/1000000 [2:12:51<3006:48:16, 10.98s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [13877], local_loss=0.05337027460336685, train_loss=0.05740255117416382, time_cost=2.887951135635376
Steps: 1%|▏ | 13878/1000000 [2:13:00<2863:37:43, 10.45s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [13878], local_loss=0.008501541800796986, train_loss=0.09292720258235931, time_cost=2.749316692352295
Steps: 1%|▏ | 13879/1000000 [2:13:06<2465:23:08, 9.00s/it, lr=1e-5, step_loss=0.0085][RANK-0]: Step: [13879], local_loss=137.98504638671875, train_loss=17.262380599975586, time_cost=2.9271836280822754
Steps: 1%|▏ | 13880/1000000 [2:13:11<2141:17:23, 7.82s/it, lr=1e-5, step_loss=138][RANK-0]: Step: [13880], local_loss=0.013639401644468307, train_loss=0.079152412712574, time_cost=2.034773826599121
Steps: 1%|▏ | 13881/1000000 [2:13:15<1837:00:50, 6.71s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [13881], local_loss=0.03381253778934479, train_loss=0.029882457107305527, time_cost=1.2533504962921143
Steps: 1%|▏ | 13882/1000000 [2:13:31<2609:00:04, 9.52s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [13882], local_loss=0.02272954396903515, train_loss=0.03817672282457352, time_cost=14.214639663696289
Steps: 1%|▏ | 13883/1000000 [2:13:37<2305:35:48, 8.42s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [13883], local_loss=0.058827709406614304, train_loss=0.02618541568517685, time_cost=5.13472580909729
Steps: 1%|▏ | 13884/1000000 [2:13:53<2912:29:20, 10.63s/it, lr=1e-5, step_loss=0.0588][RANK-0]: Step: [13884], local_loss=0.03866958990693092, train_loss=0.06692193448543549, time_cost=7.796393871307373
Steps: 1%|▏ | 13885/1000000 [2:14:03<2846:06:03, 10.39s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [13885], local_loss=0.007940756157040596, train_loss=0.034746333956718445, time_cost=2.9574666023254395
Steps: 1%|▏ | 13886/1000000 [2:14:08<2420:11:55, 8.84s/it, lr=1e-5, step_loss=0.00794][RANK-0]: Step: [13886], local_loss=0.06849949806928635, train_loss=0.018286649137735367, time_cost=2.5636422634124756
Steps: 1%|▏ | 13887/1000000 [2:14:19<2607:53:15, 9.52s/it, lr=1e-5, step_loss=0.0685][RANK-0]: Step: [13887], local_loss=0.020759060978889465, train_loss=0.07153967767953873, time_cost=2.972344398498535
Steps: 1%|▏ | 13888/1000000 [2:14:35<3101:27:51, 11.32s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [13888], local_loss=0.019300853833556175, train_loss=0.047153666615486145, time_cost=7.060593128204346
Steps: 1%|▏ | 13889/1000000 [2:14:43<2829:11:37, 10.33s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [13889], local_loss=0.017948372289538383, train_loss=0.018652353435754776, time_cost=1.9310123920440674
Steps: 1%|▏ | 13890/1000000 [2:14:48<2397:58:04, 8.75s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [13890], local_loss=0.13352519273757935, train_loss=0.05521285533905029, time_cost=2.491014003753662
Steps: 1%|▏ | 13891/1000000 [2:14:53<2126:33:17, 7.76s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [13891], local_loss=0.029502354562282562, train_loss=0.09944569319486618, time_cost=4.138398885726929
Steps: 1%|▏ | 13892/1000000 [2:15:02<2210:17:54, 8.07s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [13892], local_loss=0.020410973578691483, train_loss=0.05641202628612518, time_cost=6.914560079574585
Steps: 1%|▏ | 13893/1000000 [2:15:17<2787:00:33, 10.17s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [13893], local_loss=0.030811110511422157, train_loss=0.029052481055259705, time_cost=4.447373867034912
Steps: 1%|▏ | 13894/1000000 [2:15:28<2878:13:52, 10.51s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [13894], local_loss=0.04327933117747307, train_loss=0.17968237400054932, time_cost=1.2694616317749023
Steps: 1%|▏ | 13895/1000000 [2:15:38<2790:19:58, 10.19s/it, lr=1e-5, step_loss=0.0433][RANK-0]: Step: [13895], local_loss=0.018199849873781204, train_loss=6.274853229522705, time_cost=4.227587938308716
Steps: 1%|▏ | 13896/1000000 [2:15:49<2908:11:58, 10.62s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [13896], local_loss=0.022860553115606308, train_loss=0.02364087849855423, time_cost=3.436598300933838
Steps: 1%|▏ | 13897/1000000 [2:16:04<3209:14:35, 11.72s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [13897], local_loss=0.03625190258026123, train_loss=29.111547470092773, time_cost=6.377869606018066
Steps: 1%|▏ | 13898/1000000 [2:16:13<3059:24:24, 11.17s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [13898], local_loss=0.00406180415302515, train_loss=0.04863304644823074, time_cost=2.210803270339966
Steps: 1%|▏ | 13899/1000000 [2:16:18<2513:27:19, 9.18s/it, lr=1e-5, step_loss=0.00406][RANK-0]: Step: [13899], local_loss=0.007759891450405121, train_loss=0.04133741557598114, time_cost=1.7333984375
Steps: 1%|▏ | 13900/1000000 [2:16:27<2511:13:38, 9.17s/it, lr=1e-5, step_loss=0.00776][RANK-0]: Step: [13900], local_loss=0.017490746453404427, train_loss=0.03364087641239166, time_cost=3.2666492462158203
Steps: 1%|▏ | 13901/1000000 [2:16:39<2720:16:45, 9.93s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [13901], local_loss=0.04960639029741287, train_loss=12.8719482421875, time_cost=3.058393955230713
Steps: 1%|▏ | 13902/1000000 [2:16:44<2336:26:40, 8.53s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [13902], local_loss=0.014973461627960205, train_loss=0.1716831624507904, time_cost=2.43762469291687
Steps: 1%|▏ | 13903/1000000 [2:16:58<2749:19:04, 10.04s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [13903], local_loss=0.012381907552480698, train_loss=0.13449986279010773, time_cost=4.799593448638916
Steps: 1%|▏ | 13904/1000000 [2:17:10<2978:30:57, 10.87s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [13904], local_loss=0.01584695093333721, train_loss=0.15375354886054993, time_cost=3.496612310409546
Steps: 1%|▏ | 13905/1000000 [2:17:18<2704:27:15, 9.87s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [13905], local_loss=0.0049604312516748905, train_loss=0.028662294149398804, time_cost=2.475137233734131
Steps: 1%|▏ | 13906/1000000 [2:17:34<3194:59:55, 11.66s/it, lr=1e-5, step_loss=0.00496][RANK-0]: Step: [13906], local_loss=0.007046704180538654, train_loss=0.0666106715798378, time_cost=7.391105890274048
Steps: 1%|▏ | 13907/1000000 [2:17:45<3169:11:56, 11.57s/it, lr=1e-5, step_loss=0.00705][RANK-0]: Step: [13907], local_loss=0.02351759746670723, train_loss=0.0245673730969429, time_cost=6.265113353729248
Steps: 1%|▏ | 13908/1000000 [2:17:59<3344:50:00, 12.21s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [13908], local_loss=0.010520227253437042, train_loss=0.04114904999732971, time_cost=5.7039806842803955
Steps: 1%|▏ | 13909/1000000 [2:18:10<3262:31:10, 11.91s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [13909], local_loss=0.008358615450561047, train_loss=0.036994025111198425, time_cost=4.492232799530029
Steps: 1%|▏ | 13910/1000000 [2:18:17<2829:45:50, 10.33s/it, lr=1e-5, step_loss=0.00836][RANK-0]: Step: [13910], local_loss=0.014594546519219875, train_loss=0.1316487342119217, time_cost=1.925140619277954
Steps: 1%|▏ | 13911/1000000 [2:18:31<3125:26:17, 11.41s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [13911], local_loss=0.025302739813923836, train_loss=0.020870326086878777, time_cost=5.891962051391602
Steps: 1%|▏ | 13912/1000000 [2:18:44<3257:54:08, 11.89s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [13912], local_loss=0.005653458181768656, train_loss=0.015636926516890526, time_cost=3.8079254627227783
Steps: 1%|▏ | 13913/1000000 [2:18:58<3437:17:53, 12.55s/it, lr=1e-5, step_loss=0.00565][RANK-0]: Step: [13913], local_loss=0.019125167280435562, train_loss=0.046647973358631134, time_cost=3.7597596645355225
Steps: 1%|▏ | 13914/1000000 [2:19:03<2844:27:34, 10.38s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [13914], local_loss=0.017023442313075066, train_loss=0.011412972584366798, time_cost=2.510150194168091
Steps: 1%|▏ | 13915/1000000 [2:19:16<3080:35:37, 11.25s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [13915], local_loss=0.007311233319342136, train_loss=0.08817507326602936, time_cost=5.440118074417114
Steps: 1%|▏ | 13916/1000000 [2:19:25<2834:16:33, 10.35s/it, lr=1e-5, step_loss=0.00731][RANK-0]: Step: [13916], local_loss=0.07247822731733322, train_loss=0.022885460406541824, time_cost=2.511455535888672
Steps: 1%|▏ | 13917/1000000 [2:19:32<2574:48:01, 9.40s/it, lr=1e-5, step_loss=0.0725][RANK-0]: Step: [13917], local_loss=0.10436239838600159, train_loss=0.14092755317687988, time_cost=2.625957727432251
Steps: 1%|▏ | 13918/1000000 [2:19:39<2387:44:30, 8.72s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [13918], local_loss=0.0092082554474473, train_loss=0.050282664597034454, time_cost=2.4341037273406982
Steps: 1%|▏ | 13919/1000000 [2:19:47<2292:29:04, 8.37s/it, lr=1e-5, step_loss=0.00921][RANK-0]: Step: [13919], local_loss=0.05799794942140579, train_loss=0.040855519473552704, time_cost=1.2557168006896973
Steps: 1%|▏ | 13920/1000000 [2:19:53<2097:39:36, 7.66s/it, lr=1e-5, step_loss=0.058][RANK-0]: Step: [13920], local_loss=0.034368351101875305, train_loss=0.05998283997178078, time_cost=2.428287982940674
Steps: 1%|▏ | 13921/1000000 [2:20:03<2299:06:46, 8.39s/it, lr=1e-5, step_loss=0.0344][RANK-0]: Step: [13921], local_loss=0.39221975207328796, train_loss=0.07539951056241989, time_cost=4.531379222869873
Steps: 1%|▏ | 13922/1000000 [2:20:17<2814:58:48, 10.28s/it, lr=1e-5, step_loss=0.392][RANK-0]: Step: [13922], local_loss=0.026608899235725403, train_loss=0.03598642349243164, time_cost=5.426732540130615
Steps: 1%|▏ | 13923/1000000 [2:20:29<2911:27:33, 10.63s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [13923], local_loss=0.06915146112442017, train_loss=0.03253408521413803, time_cost=1.2375202178955078
Steps: 1%|▏ | 13924/1000000 [2:20:40<2934:36:50, 10.71s/it, lr=1e-5, step_loss=0.0692][RANK-0]: Step: [13924], local_loss=0.015040562488138676, train_loss=0.04291630536317825, time_cost=1.7795090675354004
Steps: 1%|▏ | 13925/1000000 [2:20:50<2905:58:52, 10.61s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [13925], local_loss=0.018820425495505333, train_loss=0.012712251394987106, time_cost=1.303809404373169
Steps: 1%|▏ | 13926/1000000 [2:20:57<2590:26:11, 9.46s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [13926], local_loss=0.027885891497135162, train_loss=0.06036653369665146, time_cost=2.9848849773406982
Steps: 1%|▏ | 13927/1000000 [2:21:02<2236:18:47, 8.16s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [13927], local_loss=0.06363893300294876, train_loss=0.04548349231481552, time_cost=2.4938571453094482
Steps: 1%|▏ | 13928/1000000 [2:21:08<2066:21:46, 7.54s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [13928], local_loss=0.6179894804954529, train_loss=0.11044694483280182, time_cost=1.272446632385254
Steps: 1%|▏ | 13929/1000000 [2:21:13<1870:26:14, 6.83s/it, lr=1e-5, step_loss=0.618][RANK-0]: Step: [13929], local_loss=0.9307019114494324, train_loss=0.14755649864673615, time_cost=2.7457354068756104
Steps: 1%|▏ | 13930/1000000 [2:21:29<2602:09:58, 9.50s/it, lr=1e-5, step_loss=0.931][RANK-0]: Step: [13930], local_loss=0.03207172825932503, train_loss=0.024637602269649506, time_cost=1.384033203125
Steps: 1%|▏ | 13931/1000000 [2:21:35<2300:42:52, 8.40s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [13931], local_loss=0.009503500536084175, train_loss=0.020275013521313667, time_cost=1.6447393894195557
Steps: 1%|▏ | 13932/1000000 [2:21:49<2799:53:50, 10.22s/it, lr=1e-5, step_loss=0.0095][RANK-0]: Step: [13932], local_loss=0.00820943620055914, train_loss=0.0400080643594265, time_cost=6.265672922134399
Steps: 1%|▏ | 13933/1000000 [2:21:56<2531:51:10, 9.24s/it, lr=1e-5, step_loss=0.00821][RANK-0]: Step: [13933], local_loss=0.020440611988306046, train_loss=0.015506023541092873, time_cost=2.652947425842285
Steps: 1%|▏ | 13934/1000000 [2:22:04<2381:37:45, 8.70s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [13934], local_loss=0.01798976957798004, train_loss=0.16993479430675507, time_cost=2.1064813137054443
Steps: 1%|▏ | 13935/1000000 [2:22:11<2311:07:01, 8.44s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [13935], local_loss=0.014571252278983593, train_loss=0.16037383675575256, time_cost=2.8578882217407227
Steps: 1%|▏ | 13936/1000000 [2:22:19<2223:46:44, 8.12s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [13936], local_loss=0.751811146736145, train_loss=0.13086704909801483, time_cost=2.702004909515381
Steps: 1%|▏ | 13937/1000000 [2:22:32<2677:41:53, 9.78s/it, lr=1e-5, step_loss=0.752][RANK-0]: Step: [13937], local_loss=0.033621106296777725, train_loss=0.04718807339668274, time_cost=9.603124856948853
Steps: 1%|▏ | 13938/1000000 [2:22:45<2867:00:08, 10.47s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [13938], local_loss=0.05445283651351929, train_loss=0.023268036544322968, time_cost=2.1428513526916504
Steps: 1%|▏ | 13939/1000000 [2:22:56<2915:07:32, 10.64s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [13939], local_loss=0.0195587407797575, train_loss=0.04647393524646759, time_cost=3.4864261150360107
Steps: 1%|▏ | 13940/1000000 [2:23:02<2526:25:04, 9.22s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [13940], local_loss=0.023413460701704025, train_loss=0.03864247351884842, time_cost=2.037198305130005
Steps: 1%|▏ | 13941/1000000 [2:23:18<3104:26:27, 11.33s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [13941], local_loss=0.006927527952939272, train_loss=0.1925804316997528, time_cost=5.8729088306427
Steps: 1%|▏ | 13942/1000000 [2:23:24<2675:54:37, 9.77s/it, lr=1e-5, step_loss=0.00693][RANK-0]: Step: [13942], local_loss=0.03280700743198395, train_loss=0.023341583088040352, time_cost=1.5756101608276367
Steps: 1%|▏ | 13943/1000000 [2:23:29<2327:26:10, 8.50s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [13943], local_loss=0.006327126640826464, train_loss=0.014665849506855011, time_cost=2.161663293838501
Steps: 1%|▏ | 13944/1000000 [2:23:37<2219:49:59, 8.10s/it, lr=1e-5, step_loss=0.00633][RANK-0]: Step: [13944], local_loss=0.27946409583091736, train_loss=0.07712849229574203, time_cost=2.4548537731170654
Steps: 1%|▏ | 13945/1000000 [2:23:41<1916:09:29, 7.00s/it, lr=1e-5, step_loss=0.279][RANK-0]: Step: [13945], local_loss=0.02360927313566208, train_loss=0.03076978772878647, time_cost=1.5767269134521484
Steps: 1%|▏ | 13946/1000000 [2:23:46<1766:43:56, 6.45s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [13946], local_loss=0.009728390723466873, train_loss=0.04921375960111618, time_cost=2.5946812629699707
Steps: 1%|▏ | 13947/1000000 [2:23:52<1711:24:40, 6.25s/it, lr=1e-5, step_loss=0.00973][RANK-0]: Step: [13947], local_loss=0.05671807378530502, train_loss=0.17912614345550537, time_cost=1.384352207183838
Steps: 1%|▏ | 13948/1000000 [2:24:02<2032:07:51, 7.42s/it, lr=1e-5, step_loss=0.0567][RANK-0]: Step: [13948], local_loss=0.025005053728818893, train_loss=0.02004118822515011, time_cost=1.2267508506774902
Steps: 1%|▏ | 13949/1000000 [2:24:10<2084:22:24, 7.61s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [13949], local_loss=0.01177520863711834, train_loss=0.021179014816880226, time_cost=3.6953203678131104
Steps: 1%|▏ | 13950/1000000 [2:24:19<2181:51:35, 7.97s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [13950], local_loss=0.11974762380123138, train_loss=5.383446216583252, time_cost=3.256124973297119
Steps: 1%|▏ | 13951/1000000 [2:24:25<2002:57:08, 7.31s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [13951], local_loss=0.011026802472770214, train_loss=0.022808171808719635, time_cost=1.326512336730957
Steps: 1%|▏ | 13952/1000000 [2:24:34<2174:04:48, 7.94s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [13952], local_loss=0.12453821301460266, train_loss=0.08164021372795105, time_cost=2.1870803833007812
Steps: 1%|▏ | 13953/1000000 [2:24:40<2014:02:27, 7.35s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [13953], local_loss=0.006952469237148762, train_loss=0.15395976603031158, time_cost=1.5749890804290771
Steps: 1%|▏ | 13954/1000000 [2:24:48<2076:01:22, 7.58s/it, lr=1e-5, step_loss=0.00695][RANK-0]: Step: [13954], local_loss=0.025180287659168243, train_loss=0.033955734223127365, time_cost=2.4232864379882812
Steps: 1%|▏ | 13955/1000000 [2:24:52<1789:48:18, 6.53s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [13955], local_loss=0.008918399922549725, train_loss=0.03162591531872749, time_cost=1.3970322608947754
Steps: 1%|▏ | 13956/1000000 [2:25:00<1861:26:40, 6.80s/it, lr=1e-5, step_loss=0.00892][RANK-0]: Step: [13956], local_loss=0.016955072060227394, train_loss=0.07833538949489594, time_cost=1.57674241065979
Steps: 1%|▏ | 13957/1000000 [2:25:10<2127:30:17, 7.77s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [13957], local_loss=0.017556335777044296, train_loss=0.024320276454091072, time_cost=1.3465261459350586
Steps: 1%|▏ | 13958/1000000 [2:25:23<2541:37:21, 9.28s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [13958], local_loss=0.00851104874163866, train_loss=0.08335734903812408, time_cost=6.220107316970825
Steps: 1%|▏ | 13959/1000000 [2:25:29<2312:28:51, 8.44s/it, lr=1e-5, step_loss=0.00851][RANK-0]: Step: [13959], local_loss=0.02202175185084343, train_loss=0.05121270939707756, time_cost=2.2076518535614014
Steps: 1%|▏ | 13960/1000000 [2:25:36<2226:23:31, 8.13s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [13960], local_loss=0.047231726348400116, train_loss=0.03610435873270035, time_cost=2.4333231449127197
Steps: 1%|▏ | 13961/1000000 [2:25:41<1929:37:53, 7.05s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [13961], local_loss=0.01548255980014801, train_loss=0.014152261428534985, time_cost=1.4067423343658447
Steps: 1%|▏ | 13962/1000000 [2:25:53<2320:11:10, 8.47s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [13962], local_loss=0.010553432628512383, train_loss=0.026952732354402542, time_cost=2.7561769485473633
Steps: 1%|▏ | 13963/1000000 [2:26:01<2269:54:27, 8.29s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [13963], local_loss=0.043959129601716995, train_loss=0.03193560987710953, time_cost=2.2001888751983643
Steps: 1%|▏ | 13964/1000000 [2:26:08<2184:59:18, 7.98s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [13964], local_loss=0.05161320045590401, train_loss=0.04864216968417168, time_cost=1.9983220100402832
Steps: 1%|▏ | 13965/1000000 [2:26:16<2159:35:22, 7.88s/it, lr=1e-5, step_loss=0.0516][RANK-0]: Step: [13965], local_loss=0.024854497984051704, train_loss=0.0963478833436966, time_cost=1.3177671432495117
Steps: 1%|▏ | 13966/1000000 [2:26:21<1990:04:02, 7.27s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [13966], local_loss=0.023268623277544975, train_loss=0.03925035521388054, time_cost=2.991204023361206
Steps: 1%|▏ | 13967/1000000 [2:26:27<1847:26:14, 6.74s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [13967], local_loss=0.014862669631838799, train_loss=0.06506142020225525, time_cost=2.7646970748901367
Steps: 1%|▏ | 13968/1000000 [2:26:34<1869:36:13, 6.83s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [13968], local_loss=0.09115111827850342, train_loss=4.409208297729492, time_cost=2.926248788833618
Steps: 1%|▏ | 13969/1000000 [2:26:45<2234:13:15, 8.16s/it, lr=1e-5, step_loss=0.0912][RANK-0]: Step: [13969], local_loss=0.010378670878708363, train_loss=0.034856460988521576, time_cost=1.3711802959442139
Steps: 1%|▏ | 13970/1000000 [2:26:58<2598:08:38, 9.49s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [13970], local_loss=0.034327432513237, train_loss=0.022683870047330856, time_cost=1.7969965934753418
Steps: 1%|▏ | 13971/1000000 [2:27:04<2307:54:58, 8.43s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [13971], local_loss=0.011122757568955421, train_loss=0.025292446836829185, time_cost=4.001748323440552
Steps: 1%|▏ | 13972/1000000 [2:27:11<2247:20:55, 8.21s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [13972], local_loss=0.04392024502158165, train_loss=0.025246955454349518, time_cost=1.6592438220977783
Steps: 1%|▏ | 13973/1000000 [2:27:17<2060:15:47, 7.52s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [13973], local_loss=0.012984773144125938, train_loss=0.08351240307092667, time_cost=1.6640992164611816
Steps: 1%|▏ | 13974/1000000 [2:27:26<2178:06:45, 7.95s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [13974], local_loss=0.05240388959646225, train_loss=0.049498144537210464, time_cost=1.5968680381774902
Steps: 1%|▏ | 13975/1000000 [2:27:34<2135:09:32, 7.80s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [13975], local_loss=0.04559936001896858, train_loss=0.03361104801297188, time_cost=3.544879198074341
Steps: 1%|▏ | 13976/1000000 [2:27:41<2060:10:43, 7.52s/it, lr=1e-5, step_loss=0.0456][RANK-0]: Step: [13976], local_loss=0.006344487890601158, train_loss=0.02791912481188774, time_cost=2.460648536682129
Steps: 1%|▏ | 13977/1000000 [2:27:48<2026:21:36, 7.40s/it, lr=1e-5, step_loss=0.00634][RANK-0]: Step: [13977], local_loss=0.006711649242788553, train_loss=0.013874052092432976, time_cost=2.9021012783050537
Steps: 1%|▏ | 13978/1000000 [2:28:01<2517:26:59, 9.19s/it, lr=1e-5, step_loss=0.00671][RANK-0]: Step: [13978], local_loss=0.023772597312927246, train_loss=0.027825193479657173, time_cost=4.778449773788452
Steps: 1%|▏ | 13979/1000000 [2:28:15<2884:31:15, 10.53s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [13979], local_loss=0.007514447905123234, train_loss=0.07027260959148407, time_cost=11.522871971130371
Steps: 1%|▏ | 13980/1000000 [2:28:21<2528:42:52, 9.23s/it, lr=1e-5, step_loss=0.00751][RANK-0]: Step: [13980], local_loss=0.07115288823843002, train_loss=0.02760211005806923, time_cost=1.841858148574829
Steps: 1%|▏ | 13981/1000000 [2:28:26<2190:08:54, 8.00s/it, lr=1e-5, step_loss=0.0712][RANK-0]: Step: [13981], local_loss=0.026065412908792496, train_loss=0.03686344251036644, time_cost=2.0738718509674072
Steps: 1%|▏ | 13982/1000000 [2:28:31<1972:25:45, 7.20s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [13982], local_loss=0.010800600983202457, train_loss=0.05672840029001236, time_cost=1.2393317222595215
Steps: 1%|▏ | 13983/1000000 [2:28:39<2014:49:42, 7.36s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [13983], local_loss=0.0555277056992054, train_loss=0.03445447236299515, time_cost=5.498882293701172
Steps: 1%|▏ | 13984/1000000 [2:28:46<1994:21:39, 7.28s/it, lr=1e-5, step_loss=0.0555][RANK-0]: Step: [13984], local_loss=0.010202717036008835, train_loss=0.01998875103890896, time_cost=2.846552848815918
Steps: 1%|▏ | 13985/1000000 [2:28:57<2279:06:17, 8.32s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [13985], local_loss=0.03283822536468506, train_loss=0.037929367274045944, time_cost=3.5303213596343994
Steps: 1%|▏ | 13986/1000000 [2:29:02<2015:36:12, 7.36s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [13986], local_loss=0.009224774315953255, train_loss=0.07233825325965881, time_cost=1.2703444957733154
Steps: 1%|▏ | 13987/1000000 [2:29:13<2305:44:19, 8.42s/it, lr=1e-5, step_loss=0.00922][RANK-0]: Step: [13987], local_loss=0.024857046082615852, train_loss=0.06949225068092346, time_cost=1.6513848304748535
Steps: 1%|▏ | 13988/1000000 [2:29:20<2172:10:12, 7.93s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [13988], local_loss=0.014689575880765915, train_loss=0.07368277758359909, time_cost=2.5369935035705566
Steps: 1%|▏ | 13989/1000000 [2:29:33<2639:51:22, 9.64s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [13989], local_loss=0.0064382851123809814, train_loss=0.029706507921218872, time_cost=1.2342910766601562
Steps: 1%|▏ | 13990/1000000 [2:29:48<3042:29:17, 11.11s/it, lr=1e-5, step_loss=0.00644][RANK-0]: Step: [13990], local_loss=0.03113647922873497, train_loss=0.02742885984480381, time_cost=2.962455987930298
Steps: 1%|▏ | 13991/1000000 [2:29:58<2922:23:06, 10.67s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [13991], local_loss=0.018779471516609192, train_loss=0.04580472409725189, time_cost=1.930626630783081
Steps: 1%|▏ | 13992/1000000 [2:30:13<3277:54:50, 11.97s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [13992], local_loss=0.019261160865426064, train_loss=0.07657039910554886, time_cost=6.088449954986572
Steps: 1%|▏ | 13993/1000000 [2:30:18<2723:31:48, 9.94s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [13993], local_loss=0.03947850689291954, train_loss=0.022101106122136116, time_cost=2.3462367057800293
Steps: 1%|▏ | 13994/1000000 [2:30:25<2505:19:08, 9.15s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [13994], local_loss=0.03572425618767738, train_loss=0.07372488081455231, time_cost=3.272230386734009
Steps: 1%|▏ | 13995/1000000 [2:30:30<2166:55:54, 7.91s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [13995], local_loss=0.06723524630069733, train_loss=0.030074341222643852, time_cost=2.0409889221191406
Steps: 1%|▏ | 13996/1000000 [2:30:35<1935:12:00, 7.07s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [13996], local_loss=0.06738539785146713, train_loss=0.04798044636845589, time_cost=1.949411153793335
Steps: 1%|▏ | 13997/1000000 [2:30:44<2085:52:07, 7.62s/it, lr=1e-5, step_loss=0.0674][RANK-0]: Step: [13997], local_loss=0.016040772199630737, train_loss=0.028906619176268578, time_cost=2.9294395446777344
Steps: 1%|▏ | 13998/1000000 [2:30:48<1812:05:18, 6.62s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [13998], local_loss=0.05557297170162201, train_loss=21.07402992248535, time_cost=1.2301440238952637
Steps: 1%|▏ | 13999/1000000 [2:31:04<2515:22:33, 9.18s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [13999], local_loss=0.33451735973358154, train_loss=0.0618312731385231, time_cost=6.531261205673218
Steps: 1%|▏ | 14000/1000000 [2:31:17<2879:35:27, 10.51s/it, lr=1e-5, step_loss=0.335][RANK-0]: Step: [14000], local_loss=0.006224147044122219, train_loss=0.14634402096271515, time_cost=1.2015371322631836
+09/18/2024 11:55:20 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1/checkpoint-14000
+09/18/2024 11:55:20 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-18 11:55:20,639] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
+ warnings.warn(
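This UserWarning appears once per data-parallel rank during checkpointing; it is raised because something in the save path calls `torch.nn.Module.state_dict` with positional arguments. A minimal sketch of the keyword-argument form the warning asks for, using a placeholder module:

```python
import torch

model = torch.nn.Linear(4, 4)  # placeholder for any nn.Module

# Deprecated: positional arguments trigger the UserWarning above
#   sd = model.state_dict({}, "", False)

# Preferred: keyword arguments only
sd = model.state_dict(prefix="", keep_vars=False)
```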
+[2024-09-18 11:55:20,682] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-18 11:55:20,683] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 11:55:38,884] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 11:55:38,923] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-18 11:55:38,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-18 11:56:11,112] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:11,112] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:11,112] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:13,138] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:13,138] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:13,139] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:13,304] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:13,304] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:13,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:14,424] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:14,424] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:14,425] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:14,667] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:14,667] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:14,667] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:15,082] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:15,129] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:15,130] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:15,281] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:15,281] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:15,281] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 11:56:15,298] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-18 11:56:15,298] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-18 11:56:15,298] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/18/2024 11:56:15 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/pytorch_model
+{'norm_num_groups', 'dropout', 'use_additional_conditions'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/model/diffusion_pytorch_model.safetensors
+09/18/2024 11:57:22 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/scheduler.bin
+09/18/2024 11:57:22 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/sampler.bin
+09/18/2024 11:57:22 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-14000/random_states_0.pkl
+09/18/2024 11:57:22 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1/checkpoint-14000
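For context, a minimal sketch (not the repository's exact code) of the call that produces a checkpoint block like the one above, assuming the run drives DeepSpeed through Hugging Face Accelerate; the step number and directory mirror the log, while the save interval and variable names are assumptions:

```python
from accelerate import Accelerator

accelerator = Accelerator()  # model/optimizer/dataloaders prepared earlier

global_step = 14000
checkpoint_interval = 2000   # assumption: the log only shows step 14000
run_dir = "/home/save_dir/runs/allinpaint_stage1"

if global_step % checkpoint_interval == 0:
    # Writes pytorch_model/mp_rank_00_model_states.pt, the per-rank
    # bf16_zero_pp_rank_*_optim_states.pt shards, scheduler.bin,
    # sampler.bin and random_states_0.pkl, as logged above.
    accelerator.save_state(f"{run_dir}/checkpoint-{global_step}")
```

The model/ and model_ema/ directories (config.json plus diffusion_pytorch_model.safetensors) are written separately, in the save_pretrained layout used by diffusers models.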
Steps: 1%|▏ | 14001/1000000 [2:33:28<12787:31:19, 46.69s/it, lr=1e-5, step_loss=0.00622][RANK-0]: Step: [14001], local_loss=0.07989844679832458, train_loss=0.05981217324733734, time_cost=1.2455110549926758
Steps: 1%|▏ | 14002/1000000 [2:33:40<9896:21:05, 36.13s/it, lr=1e-5, step_loss=0.0799] [RANK-0]: Step: [14002], local_loss=0.015900254249572754, train_loss=0.018778041005134583, time_cost=2.002653121948242
Steps: 1%|▏ | 14003/1000000 [2:33:51<7863:24:02, 28.71s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [14003], local_loss=0.012002825736999512, train_loss=0.03958243504166603, time_cost=3.741004228591919
Steps: 1%|▏ | 14004/1000000 [2:34:01<6275:19:09, 22.91s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [14004], local_loss=0.008784138597548008, train_loss=0.03567611426115036, time_cost=6.967105150222778
Steps: 1%|▏ | 14005/1000000 [2:34:07<4887:10:39, 17.84s/it, lr=1e-5, step_loss=0.00878][RANK-0]: Step: [14005], local_loss=0.027099423110485077, train_loss=0.027840010821819305, time_cost=3.325361728668213
Steps: 1%|▏ | 14006/1000000 [2:34:18<4343:01:14, 15.86s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [14006], local_loss=0.0059271203354001045, train_loss=0.023155905306339264, time_cost=1.2164111137390137
Steps: 1%|▏ | 14007/1000000 [2:34:23<3472:18:20, 12.68s/it, lr=1e-5, step_loss=0.00593][RANK-0]: Step: [14007], local_loss=0.05257201939821243, train_loss=0.048712924122810364, time_cost=2.133803129196167
Steps: 1%|▏ | 14008/1000000 [2:34:39<3713:47:25, 13.56s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [14008], local_loss=0.07322961837053299, train_loss=0.03357638418674469, time_cost=4.985657453536987
Steps: 1%|▏ | 14009/1000000 [2:34:45<3100:38:53, 11.32s/it, lr=1e-5, step_loss=0.0732][RANK-0]: Step: [14009], local_loss=0.03354329988360405, train_loss=0.09775905311107635, time_cost=1.4411523342132568
Steps: 1%|▏ | 14010/1000000 [2:34:53<2857:16:24, 10.43s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [14010], local_loss=0.05029839277267456, train_loss=0.06393792480230331, time_cost=4.512723207473755
Steps: 1%|▏ | 14011/1000000 [2:35:04<2884:11:25, 10.53s/it, lr=1e-5, step_loss=0.0503][RANK-0]: Step: [14011], local_loss=0.011801090091466904, train_loss=0.05689328536391258, time_cost=2.87722110748291
Steps: 1%|▏ | 14012/1000000 [2:35:15<2954:12:00, 10.79s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14012], local_loss=0.03199613094329834, train_loss=0.06366808712482452, time_cost=3.30664324760437
Steps: 1%|▏ | 14013/1000000 [2:35:27<3007:25:55, 10.98s/it, lr=1e-5, step_loss=0.032][RANK-0]: Step: [14013], local_loss=0.3194861114025116, train_loss=0.058337219059467316, time_cost=1.2994256019592285
Steps: 1%|▏ | 14014/1000000 [2:35:38<3026:34:23, 11.05s/it, lr=1e-5, step_loss=0.319][RANK-0]: Step: [14014], local_loss=0.09385903179645538, train_loss=0.07490015774965286, time_cost=2.1572065353393555
Steps: 1%|▏ | 14015/1000000 [2:35:43<2536:55:09, 9.26s/it, lr=1e-5, step_loss=0.0939][RANK-0]: Step: [14015], local_loss=0.0453365258872509, train_loss=0.026098299771547318, time_cost=1.289269208908081
+
Steps: 1%|▏ | 14015/1000000 [2:35:43<2536:55:09, 9.26s/it, lr=1e-5, step_loss=0.0453]
Steps: 1%|▏ | 14016/1000000 [2:35:49<2237:10:46, 8.17s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [14016], local_loss=0.010922832414507866, train_loss=0.022948505356907845, time_cost=2.507361650466919
+
Steps: 1%|▏ | 14016/1000000 [2:35:49<2237:10:46, 8.17s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%|▏ | 14017/1000000 [2:36:02<2664:06:14, 9.73s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [14017], local_loss=0.009350785054266453, train_loss=0.1852826178073883, time_cost=4.174278974533081
+
Steps: 1%|▏ | 14017/1000000 [2:36:02<2664:06:14, 9.73s/it, lr=1e-5, step_loss=0.00935]
Steps: 1%|▏ | 14018/1000000 [2:36:10<2504:17:36, 9.14s/it, lr=1e-5, step_loss=0.00935][RANK-0]: Step: [14018], local_loss=0.04300945997238159, train_loss=0.031155237928032875, time_cost=5.956053256988525
+
Steps: 1%|▏ | 14018/1000000 [2:36:10<2504:17:36, 9.14s/it, lr=1e-5, step_loss=0.043]
Steps: 1%|▏ | 14019/1000000 [2:36:17<2345:30:34, 8.56s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [14019], local_loss=0.06447087228298187, train_loss=0.07194830477237701, time_cost=2.7870471477508545
+
Steps: 1%|▏ | 14019/1000000 [2:36:17<2345:30:34, 8.56s/it, lr=1e-5, step_loss=0.0645]
Steps: 1%|▏ | 14020/1000000 [2:36:24<2227:30:52, 8.13s/it, lr=1e-5, step_loss=0.0645][RANK-0]: Step: [14020], local_loss=0.01711214892566204, train_loss=0.053419265896081924, time_cost=1.305546760559082
+
Steps: 1%|▏ | 14020/1000000 [2:36:24<2227:30:52, 8.13s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%|▏ | 14021/1000000 [2:36:38<2679:49:19, 9.78s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [14021], local_loss=0.5394738912582397, train_loss=0.1360926479101181, time_cost=4.419754505157471
+
Steps: 1%|▏ | 14021/1000000 [2:36:38<2679:49:19, 9.78s/it, lr=1e-5, step_loss=0.539]
Steps: 1%|▏ | 14022/1000000 [2:36:47<2660:28:32, 9.71s/it, lr=1e-5, step_loss=0.539][RANK-0]: Step: [14022], local_loss=0.022953148931264877, train_loss=0.02039181813597679, time_cost=1.377488613128662
+
Steps: 1%|▏ | 14022/1000000 [2:36:47<2660:28:32, 9.71s/it, lr=1e-5, step_loss=0.023]
Steps: 1%|▏ | 14023/1000000 [2:37:00<2900:07:48, 10.59s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [14023], local_loss=0.036654189229011536, train_loss=0.03481290489435196, time_cost=6.483529090881348
+
Steps: 1%|▏ | 14023/1000000 [2:37:00<2900:07:48, 10.59s/it, lr=1e-5, step_loss=0.0367]
Steps: 1%|▏ | 14024/1000000 [2:37:11<2952:46:01, 10.78s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [14024], local_loss=0.011803166940808296, train_loss=0.1473097801208496, time_cost=6.499017000198364
+
Steps: 1%|▏ | 14024/1000000 [2:37:11<2952:46:01, 10.78s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 14025/1000000 [2:37:19<2717:09:31, 9.92s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14025], local_loss=0.28522181510925293, train_loss=0.08613809943199158, time_cost=2.0926058292388916
+
Steps: 1%|▏ | 14025/1000000 [2:37:19<2717:09:31, 9.92s/it, lr=1e-5, step_loss=0.285]
Steps: 1%|▏ | 14026/1000000 [2:37:33<3037:43:46, 11.09s/it, lr=1e-5, step_loss=0.285][RANK-0]: Step: [14026], local_loss=0.027110302820801735, train_loss=0.028254196047782898, time_cost=4.1821911334991455
+
Steps: 1%|▏ | 14026/1000000 [2:37:33<3037:43:46, 11.09s/it, lr=1e-5, step_loss=0.0271]
Steps: 1%|▏ | 14027/1000000 [2:37:38<2544:52:07, 9.29s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [14027], local_loss=0.015118220821022987, train_loss=0.015319110825657845, time_cost=4.0414721965789795
+
Steps: 1%|▏ | 14027/1000000 [2:37:38<2544:52:07, 9.29s/it, lr=1e-5, step_loss=0.0151]
Steps: 1%|▏ | 14028/1000000 [2:37:50<2754:37:25, 10.06s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [14028], local_loss=0.027839994058012962, train_loss=0.025797121226787567, time_cost=3.2062525749206543
+
Steps: 1%|▏ | 14028/1000000 [2:37:50<2754:37:25, 10.06s/it, lr=1e-5, step_loss=0.0278]
Steps: 1%|▏ | 14029/1000000 [2:37:55<2337:54:15, 8.54s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [14029], local_loss=0.022660836577415466, train_loss=0.04211156442761421, time_cost=2.1045644283294678
+
Steps: 1%|▏ | 14029/1000000 [2:37:55<2337:54:15, 8.54s/it, lr=1e-5, step_loss=0.0227]
Steps: 1%|▏ | 14030/1000000 [2:38:06<2583:14:05, 9.43s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [14030], local_loss=0.012722771614789963, train_loss=0.16212311387062073, time_cost=1.229731559753418
+
Steps: 1%|▏ | 14030/1000000 [2:38:06<2583:14:05, 9.43s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%|▏ | 14031/1000000 [2:38:12<2306:54:49, 8.42s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [14031], local_loss=0.0162070132791996, train_loss=19.1119384765625, time_cost=1.966433048248291
+
Steps: 1%|▏ | 14031/1000000 [2:38:12<2306:54:49, 8.42s/it, lr=1e-5, step_loss=0.0162]
Steps: 1%|▏ | 14032/1000000 [2:38:21<2334:47:51, 8.52s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [14032], local_loss=0.02677997387945652, train_loss=0.05192364379763603, time_cost=2.949047565460205
+
Steps: 1%|▏ | 14032/1000000 [2:38:21<2334:47:51, 8.52s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%|▏ | 14033/1000000 [2:38:31<2427:25:45, 8.86s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [14033], local_loss=0.05085736885666847, train_loss=0.040403373539447784, time_cost=1.3593332767486572
+
Steps: 1%|▏ | 14033/1000000 [2:38:31<2427:25:45, 8.86s/it, lr=1e-5, step_loss=0.0509]
Steps: 1%|▏ | 14034/1000000 [2:38:43<2688:46:50, 9.82s/it, lr=1e-5, step_loss=0.0509][RANK-0]: Step: [14034], local_loss=0.006639894563704729, train_loss=0.1438673734664917, time_cost=4.35160493850708
+
Steps: 1%|▏ | 14034/1000000 [2:38:43<2688:46:50, 9.82s/it, lr=1e-5, step_loss=0.00664]
Steps: 1%|▏ | 14035/1000000 [2:38:51<2545:03:37, 9.29s/it, lr=1e-5, step_loss=0.00664][RANK-0]: Step: [14035], local_loss=0.021933663636446, train_loss=0.0817335769534111, time_cost=3.6977057456970215
+
Steps: 1%|▏ | 14035/1000000 [2:38:51<2545:03:37, 9.29s/it, lr=1e-5, step_loss=0.0219]
Steps: 1%|▏ | 14036/1000000 [2:39:04<2890:43:24, 10.55s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [14036], local_loss=0.04699994623661041, train_loss=0.02715536393225193, time_cost=3.8864123821258545
+
Steps: 1%|▏ | 14036/1000000 [2:39:04<2890:43:24, 10.55s/it, lr=1e-5, step_loss=0.047]
Steps: 1%|▏ | 14037/1000000 [2:39:11<2526:40:19, 9.23s/it, lr=1e-5, step_loss=0.047][RANK-0]: Step: [14037], local_loss=0.021937694400548935, train_loss=0.060842398554086685, time_cost=1.6954700946807861
+
Steps: 1%|▏ | 14037/1000000 [2:39:11<2526:40:19, 9.23s/it, lr=1e-5, step_loss=0.0219]
Steps: 1%|▏ | 14038/1000000 [2:39:22<2732:58:58, 9.98s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [14038], local_loss=0.0561930350959301, train_loss=0.06336580216884613, time_cost=8.82506012916565
+
Steps: 1%|▏ | 14038/1000000 [2:39:22<2732:58:58, 9.98s/it, lr=1e-5, step_loss=0.0562]
Steps: 1%|▏ | 14039/1000000 [2:39:34<2844:43:16, 10.39s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [14039], local_loss=0.035090796649456024, train_loss=0.18764320015907288, time_cost=1.2475621700286865
+
Steps: 1%|▏ | 14039/1000000 [2:39:34<2844:43:16, 10.39s/it, lr=1e-5, step_loss=0.0351]
Steps: 1%|▏ | 14040/1000000 [2:39:47<3116:35:21, 11.38s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [14040], local_loss=0.01517495233565569, train_loss=0.04814764857292175, time_cost=4.356549263000488
+
Steps: 1%|▏ | 14040/1000000 [2:39:47<3116:35:21, 11.38s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%|▏ | 14041/1000000 [2:39:58<3064:27:57, 11.19s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [14041], local_loss=0.013575262390077114, train_loss=0.01800113543868065, time_cost=3.137204647064209
+
Steps: 1%|▏ | 14041/1000000 [2:39:58<3064:27:57, 11.19s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%|▏ | 14042/1000000 [2:40:07<2895:51:06, 10.57s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [14042], local_loss=0.1411902755498886, train_loss=0.10819433629512787, time_cost=1.4095261096954346
+
Steps: 1%|▏ | 14042/1000000 [2:40:07<2895:51:06, 10.57s/it, lr=1e-5, step_loss=0.141]
Steps: 1%|▏ | 14043/1000000 [2:40:18<2943:01:49, 10.75s/it, lr=1e-5, step_loss=0.141][RANK-0]: Step: [14043], local_loss=0.03349537029862404, train_loss=0.019998548552393913, time_cost=3.601322889328003
+
Steps: 1%|▏ | 14043/1000000 [2:40:18<2943:01:49, 10.75s/it, lr=1e-5, step_loss=0.0335]
Steps: 1%|▏ | 14044/1000000 [2:40:24<2544:11:15, 9.29s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [14044], local_loss=0.011988474056124687, train_loss=0.07022624462842941, time_cost=1.664180040359497
+
Steps: 1%|▏ | 14044/1000000 [2:40:24<2544:11:15, 9.29s/it, lr=1e-5, step_loss=0.012]
Steps: 1%|▏ | 14045/1000000 [2:40:33<2510:17:54, 9.17s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [14045], local_loss=0.048561908304691315, train_loss=0.043857090175151825, time_cost=2.9202966690063477
+
Steps: 1%|▏ | 14045/1000000 [2:40:33<2510:17:54, 9.17s/it, lr=1e-5, step_loss=0.0486]
Steps: 1%|▏ | 14046/1000000 [2:40:40<2322:00:29, 8.48s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [14046], local_loss=0.052870914340019226, train_loss=0.04225607216358185, time_cost=2.2570745944976807
+
Steps: 1%|▏ | 14046/1000000 [2:40:40<2322:00:29, 8.48s/it, lr=1e-5, step_loss=0.0529]
Steps: 1%|▏ | 14047/1000000 [2:40:45<2055:11:41, 7.50s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [14047], local_loss=0.007093056105077267, train_loss=0.014929141849279404, time_cost=1.9012904167175293
+
Steps: 1%|▏ | 14047/1000000 [2:40:45<2055:11:41, 7.50s/it, lr=1e-5, step_loss=0.00709]
Steps: 1%|▏ | 14048/1000000 [2:40:52<2014:48:03, 7.36s/it, lr=1e-5, step_loss=0.00709][RANK-0]: Step: [14048], local_loss=0.008712833747267723, train_loss=0.03215838223695755, time_cost=2.2253451347351074
+
Steps: 1%|▏ | 14048/1000000 [2:40:52<2014:48:03, 7.36s/it, lr=1e-5, step_loss=0.00871]
Steps: 1%|▏ | 14049/1000000 [2:41:04<2335:01:37, 8.53s/it, lr=1e-5, step_loss=0.00871][RANK-0]: Step: [14049], local_loss=0.02205101028084755, train_loss=44.584808349609375, time_cost=4.485236406326294
+
Steps: 1%|▏ | 14049/1000000 [2:41:04<2335:01:37, 8.53s/it, lr=1e-5, step_loss=0.0221]
Steps: 1%|▏ | 14050/1000000 [2:41:09<2043:37:29, 7.46s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [14050], local_loss=0.016664160415530205, train_loss=0.06924927234649658, time_cost=1.2380449771881104
+
Steps: 1%|▏ | 14050/1000000 [2:41:09<2043:37:29, 7.46s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%|▏ | 14051/1000000 [2:41:20<2344:37:24, 8.56s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [14051], local_loss=0.018069151788949966, train_loss=0.03253762796521187, time_cost=8.791326522827148
+
Steps: 1%|▏ | 14051/1000000 [2:41:20<2344:37:24, 8.56s/it, lr=1e-5, step_loss=0.0181]
Steps: 1%|▏ | 14052/1000000 [2:41:25<2055:47:06, 7.51s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [14052], local_loss=0.06811610609292984, train_loss=0.1687486618757248, time_cost=1.9619901180267334
+
Steps: 1%|▏ | 14052/1000000 [2:41:25<2055:47:06, 7.51s/it, lr=1e-5, step_loss=0.0681]
Steps: 1%|▏ | 14053/1000000 [2:41:30<1861:05:32, 6.80s/it, lr=1e-5, step_loss=0.0681][RANK-0]: Step: [14053], local_loss=0.007969269528985023, train_loss=0.025489002466201782, time_cost=3.966522216796875
+
Steps: 1%|▏ | 14053/1000000 [2:41:30<1861:05:32, 6.80s/it, lr=1e-5, step_loss=0.00797]
Steps: 1%|▏ | 14054/1000000 [2:41:35<1719:59:25, 6.28s/it, lr=1e-5, step_loss=0.00797][RANK-0]: Step: [14054], local_loss=0.061330024152994156, train_loss=0.04125996679067612, time_cost=2.1941580772399902
+
Steps: 1%|▏ | 14054/1000000 [2:41:35<1719:59:25, 6.28s/it, lr=1e-5, step_loss=0.0613]
Steps: 1%|▏ | 14055/1000000 [2:41:40<1617:33:51, 5.91s/it, lr=1e-5, step_loss=0.0613][RANK-0]: Step: [14055], local_loss=0.044623710215091705, train_loss=0.03701028972864151, time_cost=3.7663559913635254
+
Steps: 1%|▏ | 14055/1000000 [2:41:40<1617:33:51, 5.91s/it, lr=1e-5, step_loss=0.0446]
Steps: 1%|▏ | 14056/1000000 [2:41:47<1747:03:23, 6.38s/it, lr=1e-5, step_loss=0.0446][RANK-0]: Step: [14056], local_loss=0.05088010057806969, train_loss=0.1518382579088211, time_cost=2.13071608543396
+
Steps: 1%|▏ | 14056/1000000 [2:41:47<1747:03:23, 6.38s/it, lr=1e-5, step_loss=0.0509]
Steps: 1%|▏ | 14057/1000000 [2:41:58<2116:39:57, 7.73s/it, lr=1e-5, step_loss=0.0509][RANK-0]: Step: [14057], local_loss=0.014991370029747486, train_loss=0.07828874886035919, time_cost=1.2128651142120361
+
Steps: 1%|▏ | 14057/1000000 [2:41:58<2116:39:57, 7.73s/it, lr=1e-5, step_loss=0.015]
Steps: 1%|▏ | 14058/1000000 [2:42:08<2311:07:52, 8.44s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [14058], local_loss=0.01600675843656063, train_loss=0.07435775548219681, time_cost=4.6172308921813965
+
Steps: 1%|▏ | 14058/1000000 [2:42:08<2311:07:52, 8.44s/it, lr=1e-5, step_loss=0.016]
Steps: 1%|▏ | 14059/1000000 [2:42:19<2480:47:34, 9.06s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [14059], local_loss=0.497003436088562, train_loss=0.09003317356109619, time_cost=1.2264955043792725
+
Steps: 1%|▏ | 14059/1000000 [2:42:19<2480:47:34, 9.06s/it, lr=1e-5, step_loss=0.497]
Steps: 1%|▏ | 14060/1000000 [2:42:24<2146:18:39, 7.84s/it, lr=1e-5, step_loss=0.497][RANK-0]: Step: [14060], local_loss=0.012610353529453278, train_loss=0.04217107594013214, time_cost=1.9750170707702637
+
Steps: 1%|▏ | 14060/1000000 [2:42:24<2146:18:39, 7.84s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%|▏ | 14061/1000000 [2:42:34<2301:13:00, 8.40s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [14061], local_loss=0.11634030938148499, train_loss=0.06867703795433044, time_cost=2.532048463821411
+
Steps: 1%|▏ | 14061/1000000 [2:42:34<2301:13:00, 8.40s/it, lr=1e-5, step_loss=0.116]
Steps: 1%|▏ | 14062/1000000 [2:42:47<2696:05:33, 9.84s/it, lr=1e-5, step_loss=0.116][RANK-0]: Step: [14062], local_loss=0.4194430708885193, train_loss=0.13237765431404114, time_cost=4.5847718715667725
+
Steps: 1%|▏ | 14062/1000000 [2:42:47<2696:05:33, 9.84s/it, lr=1e-5, step_loss=0.419]
Steps: 1%|▏ | 14063/1000000 [2:42:58<2779:24:34, 10.15s/it, lr=1e-5, step_loss=0.419][RANK-0]: Step: [14063], local_loss=0.017889738082885742, train_loss=0.14973416924476624, time_cost=1.7956888675689697
+
Steps: 1%|▏ | 14063/1000000 [2:42:58<2779:24:34, 10.15s/it, lr=1e-5, step_loss=0.0179]
Steps: 1%|▏ | 14064/1000000 [2:43:02<2326:40:30, 8.50s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [14064], local_loss=0.010826480574905872, train_loss=0.04647243022918701, time_cost=2.2282633781433105
+
Steps: 1%|▏ | 14064/1000000 [2:43:02<2326:40:30, 8.50s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%|▏ | 14065/1000000 [2:43:08<2070:59:55, 7.56s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [14065], local_loss=0.03188912570476532, train_loss=0.07944238185882568, time_cost=3.765624761581421
+
Steps: 1%|▏ | 14065/1000000 [2:43:08<2070:59:55, 7.56s/it, lr=1e-5, step_loss=0.0319]
Steps: 1%|▏ | 14066/1000000 [2:43:17<2194:06:12, 8.01s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [14066], local_loss=1.0165154933929443, train_loss=0.2809164524078369, time_cost=2.6925241947174072
+
Steps: 1%|▏ | 14066/1000000 [2:43:17<2194:06:12, 8.01s/it, lr=1e-5, step_loss=1.02]
Steps: 1%|▏ | 14067/1000000 [2:43:26<2291:21:49, 8.37s/it, lr=1e-5, step_loss=1.02][RANK-0]: Step: [14067], local_loss=0.34283262491226196, train_loss=0.0714634358882904, time_cost=6.2779381275177
+
Steps: 1%|▏ | 14067/1000000 [2:43:26<2291:21:49, 8.37s/it, lr=1e-5, step_loss=0.343]
Steps: 1%|▏ | 14068/1000000 [2:43:31<2051:58:09, 7.49s/it, lr=1e-5, step_loss=0.343][RANK-0]: Step: [14068], local_loss=0.11953256279230118, train_loss=0.07155926525592804, time_cost=2.3058102130889893
+
Steps: 1%|▏ | 14068/1000000 [2:43:31<2051:58:09, 7.49s/it, lr=1e-5, step_loss=0.12]
Steps: 1%|▏ | 14069/1000000 [2:43:36<1844:41:15, 6.74s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [14069], local_loss=0.02674674242734909, train_loss=0.07187708467245102, time_cost=1.2884814739227295
+
Steps: 1%|▏ | 14069/1000000 [2:43:36<1844:41:15, 6.74s/it, lr=1e-5, step_loss=0.0267]
Steps: 1%|▏ | 14070/1000000 [2:43:47<2170:00:41, 7.92s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [14070], local_loss=0.005926141981035471, train_loss=0.1620481014251709, time_cost=3.65474796295166
+
Steps: 1%|▏ | 14070/1000000 [2:43:47<2170:00:41, 7.92s/it, lr=1e-5, step_loss=0.00593]
Steps: 1%|▏ | 14071/1000000 [2:43:59<2475:13:37, 9.04s/it, lr=1e-5, step_loss=0.00593][RANK-0]: Step: [14071], local_loss=0.012703249230980873, train_loss=0.031695615500211716, time_cost=3.0180695056915283
+
Steps: 1%|▏ | 14071/1000000 [2:43:59<2475:13:37, 9.04s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%|▏ | 14072/1000000 [2:44:04<2149:54:38, 7.85s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [14072], local_loss=0.01060360949486494, train_loss=0.02522560954093933, time_cost=1.9932732582092285
+
Steps: 1%|▏ | 14072/1000000 [2:44:04<2149:54:38, 7.85s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%|▏ | 14073/1000000 [2:44:08<1865:53:03, 6.81s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [14073], local_loss=0.034270212054252625, train_loss=0.058572784066200256, time_cost=1.6184871196746826
+
Steps: 1%|▏ | 14073/1000000 [2:44:08<1865:53:03, 6.81s/it, lr=1e-5, step_loss=0.0343]
Steps: 1%|▏ | 14074/1000000 [2:44:21<2387:07:44, 8.72s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [14074], local_loss=0.023813404142856598, train_loss=0.023403242230415344, time_cost=3.9672248363494873
+
Steps: 1%|▏ | 14074/1000000 [2:44:21<2387:07:44, 8.72s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%|▏ | 14075/1000000 [2:44:27<2102:53:26, 7.68s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [14075], local_loss=0.00878902431577444, train_loss=0.02255713939666748, time_cost=2.40600848197937
+
Steps: 1%|▏ | 14075/1000000 [2:44:27<2102:53:26, 7.68s/it, lr=1e-5, step_loss=0.00879]
Steps: 1%|▏ | 14076/1000000 [2:44:32<1905:49:50, 6.96s/it, lr=1e-5, step_loss=0.00879][RANK-0]: Step: [14076], local_loss=0.00932309776544571, train_loss=19.068279266357422, time_cost=1.212599754333496
+
Steps: 1%|▏ | 14076/1000000 [2:44:32<1905:49:50, 6.96s/it, lr=1e-5, step_loss=0.00932]
Steps: 1%|▏ | 14077/1000000 [2:44:48<2637:46:44, 9.63s/it, lr=1e-5, step_loss=0.00932][RANK-0]: Step: [14077], local_loss=0.2994841933250427, train_loss=0.11145615577697754, time_cost=7.036192178726196
+
Steps: 1%|▏ | 14077/1000000 [2:44:48<2637:46:44, 9.63s/it, lr=1e-5, step_loss=0.299]
Steps: 1%|▏ | 14078/1000000 [2:44:58<2729:13:08, 9.97s/it, lr=1e-5, step_loss=0.299][RANK-0]: Step: [14078], local_loss=0.007493922486901283, train_loss=0.03158736974000931, time_cost=2.0070855617523193
+
Steps: 1%|▏ | 14078/1000000 [2:44:58<2729:13:08, 9.97s/it, lr=1e-5, step_loss=0.00749]
Steps: 1%|▏ | 14079/1000000 [2:45:12<3041:00:08, 11.10s/it, lr=1e-5, step_loss=0.00749][RANK-0]: Step: [14079], local_loss=0.01614656299352646, train_loss=0.04452397674322128, time_cost=3.7721259593963623
+
Steps: 1%|▏ | 14079/1000000 [2:45:12<3041:00:08, 11.10s/it, lr=1e-5, step_loss=0.0161]
Steps: 1%|▏ | 14080/1000000 [2:45:20<2736:51:16, 9.99s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [14080], local_loss=0.03295561298727989, train_loss=0.2069566249847412, time_cost=1.7104649543762207
+
Steps: 1%|▏ | 14080/1000000 [2:45:20<2736:51:16, 9.99s/it, lr=1e-5, step_loss=0.033]
Steps: 1%|▏ | 14081/1000000 [2:45:31<2856:54:25, 10.43s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [14081], local_loss=0.03130868822336197, train_loss=0.15989530086517334, time_cost=3.6730268001556396
+
Steps: 1%|▏ | 14081/1000000 [2:45:31<2856:54:25, 10.43s/it, lr=1e-5, step_loss=0.0313]
Steps: 1%|▏ | 14082/1000000 [2:45:42<2894:55:39, 10.57s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [14082], local_loss=0.006017506588250399, train_loss=0.04424891248345375, time_cost=1.2377548217773438
+
Steps: 1%|▏ | 14082/1000000 [2:45:42<2894:55:39, 10.57s/it, lr=1e-5, step_loss=0.00602]
Steps: 1%|▏ | 14083/1000000 [2:45:56<3187:58:34, 11.64s/it, lr=1e-5, step_loss=0.00602][RANK-0]: Step: [14083], local_loss=0.0070571633987128735, train_loss=0.06814493238925934, time_cost=2.2694637775421143
+
Steps: 1%|▏ | 14083/1000000 [2:45:56<3187:58:34, 11.64s/it, lr=1e-5, step_loss=0.00706]
Steps: 1%|▏ | 14084/1000000 [2:46:09<3325:18:28, 12.14s/it, lr=1e-5, step_loss=0.00706][RANK-0]: Step: [14084], local_loss=0.153286874294281, train_loss=0.03344444930553436, time_cost=3.7602198123931885
+
Steps: 1%|▏ | 14084/1000000 [2:46:09<3325:18:28, 12.14s/it, lr=1e-5, step_loss=0.153]
Steps: 1%|▏ | 14085/1000000 [2:46:22<3338:39:38, 12.19s/it, lr=1e-5, step_loss=0.153][RANK-0]: Step: [14085], local_loss=0.02808741293847561, train_loss=0.04233179986476898, time_cost=6.66150689125061
+
Steps: 1%|▏ | 14085/1000000 [2:46:22<3338:39:38, 12.19s/it, lr=1e-5, step_loss=0.0281]
Steps: 1%|▏ | 14086/1000000 [2:46:31<3106:26:15, 11.34s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [14086], local_loss=0.05462180823087692, train_loss=0.023917851969599724, time_cost=2.403998613357544
+
Steps: 1%|▏ | 14086/1000000 [2:46:31<3106:26:15, 11.34s/it, lr=1e-5, step_loss=0.0546]
Steps: 1%|▏ | 14087/1000000 [2:46:40<2925:58:15, 10.68s/it, lr=1e-5, step_loss=0.0546][RANK-0]: Step: [14087], local_loss=0.015675000846385956, train_loss=0.022721223533153534, time_cost=3.9981422424316406
+
Steps: 1%|▏ | 14087/1000000 [2:46:40<2925:58:15, 10.68s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%|▏ | 14088/1000000 [2:46:45<2452:12:47, 8.95s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [14088], local_loss=0.04528316855430603, train_loss=0.03069119155406952, time_cost=2.3042454719543457
+
Steps: 1%|▏ | 14088/1000000 [2:46:45<2452:12:47, 8.95s/it, lr=1e-5, step_loss=0.0453]
Steps: 1%|▏ | 14089/1000000 [2:46:56<2625:36:01, 9.59s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [14089], local_loss=0.1466013789176941, train_loss=0.04286142438650131, time_cost=1.2199280261993408
+
Steps: 1%|▏ | 14089/1000000 [2:46:56<2625:36:01, 9.59s/it, lr=1e-5, step_loss=0.147]
Steps: 1%|▏ | 14090/1000000 [2:47:02<2316:01:29, 8.46s/it, lr=1e-5, step_loss=0.147][RANK-0]: Step: [14090], local_loss=0.022866852581501007, train_loss=0.032109398394823074, time_cost=2.7482070922851562
+
Steps: 1%|▏ | 14090/1000000 [2:47:02<2316:01:29, 8.46s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%|▏ | 14091/1000000 [2:47:08<2104:43:00, 7.69s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [14091], local_loss=0.05148199200630188, train_loss=0.07173651456832886, time_cost=4.715597152709961
+
Steps: 1%|▏ | 14091/1000000 [2:47:08<2104:43:00, 7.69s/it, lr=1e-5, step_loss=0.0515]
Steps: 1%|▏ | 14092/1000000 [2:47:20<2466:56:16, 9.01s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [14092], local_loss=0.04115563631057739, train_loss=0.01882275938987732, time_cost=1.9932498931884766
+
Steps: 1%|▏ | 14092/1000000 [2:47:20<2466:56:16, 9.01s/it, lr=1e-5, step_loss=0.0412]
Steps: 1%|▏ | 14093/1000000 [2:47:34<2875:32:03, 10.50s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [14093], local_loss=0.03586550056934357, train_loss=0.03027632087469101, time_cost=1.3715717792510986
+
Steps: 1%|▏ | 14093/1000000 [2:47:34<2875:32:03, 10.50s/it, lr=1e-5, step_loss=0.0359]
Steps: 1%|▏ | 14094/1000000 [2:47:47<3096:02:53, 11.31s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [14094], local_loss=0.005887121427804232, train_loss=0.02442111447453499, time_cost=3.9685888290405273
+
Steps: 1%|▏ | 14094/1000000 [2:47:47<3096:02:53, 11.31s/it, lr=1e-5, step_loss=0.00589]
Steps: 1%|▏ | 14095/1000000 [2:48:01<3291:09:50, 12.02s/it, lr=1e-5, step_loss=0.00589][RANK-0]: Step: [14095], local_loss=0.005491478368639946, train_loss=0.06354999542236328, time_cost=4.519543170928955
+
Steps: 1%|▏ | 14095/1000000 [2:48:01<3291:09:50, 12.02s/it, lr=1e-5, step_loss=0.00549]
Steps: 1%|▏ | 14096/1000000 [2:48:06<2719:18:15, 9.93s/it, lr=1e-5, step_loss=0.00549][RANK-0]: Step: [14096], local_loss=0.02463737316429615, train_loss=0.056953247636556625, time_cost=2.229001760482788
+
Steps: 1%|▏ | 14096/1000000 [2:48:06<2719:18:15, 9.93s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%|▏ | 14097/1000000 [2:48:18<2913:43:09, 10.64s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [14097], local_loss=0.004565075039863586, train_loss=0.07424129545688629, time_cost=5.7546374797821045
+
Steps: 1%|▏ | 14097/1000000 [2:48:18<2913:43:09, 10.64s/it, lr=1e-5, step_loss=0.00457]
Steps: 1%|▏ | 14098/1000000 [2:48:23<2395:01:50, 8.75s/it, lr=1e-5, step_loss=0.00457][RANK-0]: Step: [14098], local_loss=0.022795362398028374, train_loss=0.04025927186012268, time_cost=1.4083242416381836
+
Steps: 1%|▏ | 14098/1000000 [2:48:23<2395:01:50, 8.75s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%|▏ | 14099/1000000 [2:48:27<2050:24:23, 7.49s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [14099], local_loss=0.09445963054895401, train_loss=0.07256016880273819, time_cost=1.5489919185638428
+
Steps: 1%|▏ | 14099/1000000 [2:48:27<2050:24:23, 7.49s/it, lr=1e-5, step_loss=0.0945]
Steps: 1%|▏ | 14100/1000000 [2:48:32<1798:24:53, 6.57s/it, lr=1e-5, step_loss=0.0945][RANK-0]: Step: [14100], local_loss=0.04909656196832657, train_loss=0.035115428268909454, time_cost=1.481372594833374
+
Steps: 1%|▏ | 14100/1000000 [2:48:32<1798:24:53, 6.57s/it, lr=1e-5, step_loss=0.0491]
Steps: 1%|▏ | 14101/1000000 [2:48:42<2115:35:14, 7.73s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [14101], local_loss=0.03315350413322449, train_loss=0.237238809466362, time_cost=7.728400945663452
+
Steps: 1%|▏ | 14101/1000000 [2:48:42<2115:35:14, 7.73s/it, lr=1e-5, step_loss=0.0332]
Steps: 1%|▏ | 14102/1000000 [2:48:51<2252:59:03, 8.23s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [14102], local_loss=0.036620818078517914, train_loss=0.038581427186727524, time_cost=3.3023159503936768
+
Steps: 1%|▏ | 14102/1000000 [2:48:51<2252:59:03, 8.23s/it, lr=1e-5, step_loss=0.0366]
Steps: 1%|▏ | 14103/1000000 [2:49:00<2301:52:30, 8.41s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [14103], local_loss=0.008706015534698963, train_loss=0.0211858619004488, time_cost=3.073223114013672
+
Steps: 1%|▏ | 14103/1000000 [2:49:00<2301:52:30, 8.41s/it, lr=1e-5, step_loss=0.00871]
Steps: 1%|▏ | 14104/1000000 [2:49:07<2185:16:38, 7.98s/it, lr=1e-5, step_loss=0.00871][RANK-0]: Step: [14104], local_loss=0.0208804439753294, train_loss=0.03318680450320244, time_cost=5.806704044342041
+
Steps: 1%|▏ | 14104/1000000 [2:49:07<2185:16:38, 7.98s/it, lr=1e-5, step_loss=0.0209]
Steps: 1%|▏ | 14105/1000000 [2:49:21<2686:26:47, 9.81s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [14105], local_loss=0.01698031648993492, train_loss=0.01925818622112274, time_cost=4.668039083480835
+
Steps: 1%|▏ | 14105/1000000 [2:49:21<2686:26:47, 9.81s/it, lr=1e-5, step_loss=0.017]
Steps: 1%|▏ | 14106/1000000 [2:49:26<2279:13:33, 8.32s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [14106], local_loss=0.008246101438999176, train_loss=0.01639663055539131, time_cost=2.0016584396362305
+
Steps: 1%|▏ | 14106/1000000 [2:49:26<2279:13:33, 8.32s/it, lr=1e-5, step_loss=0.00825]
Steps: 1%|▏ | 14107/1000000 [2:49:32<2070:24:22, 7.56s/it, lr=1e-5, step_loss=0.00825][RANK-0]: Step: [14107], local_loss=0.01095232367515564, train_loss=0.016023218631744385, time_cost=4.765251398086548
+
Steps: 1%|▏ | 14107/1000000 [2:49:32<2070:24:22, 7.56s/it, lr=1e-5, step_loss=0.011]
Steps: 1%|▏ | 14108/1000000 [2:49:46<2581:47:54, 9.43s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [14108], local_loss=0.05244730785489082, train_loss=0.036357581615448, time_cost=4.72025990486145
+
Steps: 1%|▏ | 14108/1000000 [2:49:46<2581:47:54, 9.43s/it, lr=1e-5, step_loss=0.0524]
/home/image_data/hxy/Open-Sora-Plan/opensora/utils/utils.py:369: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
  caption = BeautifulSoup(caption, features='html.parser').text
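[Note: MarkupResemblesLocatorWarning is raised by the bs4 package whenever the string handed to BeautifulSoup() looks like a filename or URL rather than markup, which happens here when a caption is just a file name. The parse result is still the original text, so for caption cleaning the warning is noise. A minimal sketch of how it can be filtered, assuming a recent bs4 that exports MarkupResemblesLocatorWarning; clean_caption below is a hypothetical stand-in for the call at opensora/utils/utils.py:369:]

import warnings

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

# Filename-like captions trigger MarkupResemblesLocatorWarning; the parsed
# text is still correct for plain-text captions, so suppress the noise.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

def clean_caption(caption: str) -> str:
    # Hypothetical stand-in for opensora/utils/utils.py:369 -- strip any
    # HTML tags from a caption before tokenization.
    return BeautifulSoup(caption, features='html.parser').text

print(clean_caption("a <b>red</b> car"))   # -> "a red car"
print(clean_caption("video_0001.mp4"))     # -> "video_0001.mp4", no warning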
[Training-log excerpt: steps 14109–14222 of 1000000 (1%), lr=1e-5, elapsed 2:49:57–3:06:36, ~6–12 s/it. local_loss mostly in 0.005–0.15, with spikes 0.379 (step 14117), 0.282 (14133), 1.02 (14134), 0.505 (14191), 0.324 (14196), 0.254 (14204); train_loss mostly in 0.01–0.2, with outliers 13.82 (14118) and 40.31 (14133); per-step time_cost 1.2–12.2 s. Final line:]
Steps: 1%|▏ | 14222/1000000 [3:06:36<1742:24:50, 6.36s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [14222], local_loss=0.02407645620405674, train_loss=0.03173578158020973, time_cost=1.2728843688964844
+
Steps: 1%|▏ | 14222/1000000 [3:06:36<1742:24:50, 6.36s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%|▏ | 14223/1000000 [3:06:45<1990:55:29, 7.27s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [14223], local_loss=0.02719392441213131, train_loss=0.028092525899410248, time_cost=3.311084747314453
+
Steps: 1%|▏ | 14223/1000000 [3:06:45<1990:55:29, 7.27s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%|▏ | 14224/1000000 [3:06:55<2194:01:34, 8.01s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [14224], local_loss=0.008815249428153038, train_loss=0.36310917139053345, time_cost=2.1796679496765137
+
Steps: 1%|▏ | 14224/1000000 [3:06:55<2194:01:34, 8.01s/it, lr=1e-5, step_loss=0.00882]
Steps: 1%|▏ | 14225/1000000 [3:07:08<2594:48:00, 9.48s/it, lr=1e-5, step_loss=0.00882][RANK-0]: Step: [14225], local_loss=0.07336925715208054, train_loss=0.045467786490917206, time_cost=4.714899063110352
+
Steps: 1%|▏ | 14225/1000000 [3:07:08<2594:48:00, 9.48s/it, lr=1e-5, step_loss=0.0734]
Steps: 1%|▏ | 14226/1000000 [3:07:17<2597:13:19, 9.48s/it, lr=1e-5, step_loss=0.0734][RANK-0]: Step: [14226], local_loss=0.012552927248179913, train_loss=0.03780968114733696, time_cost=1.2449347972869873
+
Steps: 1%|▏ | 14226/1000000 [3:07:17<2597:13:19, 9.48s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%|▏ | 14227/1000000 [3:07:22<2245:26:24, 8.20s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [14227], local_loss=0.005729861091822386, train_loss=0.05532527342438698, time_cost=1.3285026550292969
+
Steps: 1%|▏ | 14227/1000000 [3:07:22<2245:26:24, 8.20s/it, lr=1e-5, step_loss=0.00573]
Steps: 1%|▏ | 14228/1000000 [3:07:34<2542:06:15, 9.28s/it, lr=1e-5, step_loss=0.00573][RANK-0]: Step: [14228], local_loss=0.11245831847190857, train_loss=0.1221463680267334, time_cost=1.3104608058929443
+
Steps: 1%|▏ | 14228/1000000 [3:07:34<2542:06:15, 9.28s/it, lr=1e-5, step_loss=0.112]
Steps: 1%|▏ | 14229/1000000 [3:07:49<2993:25:48, 10.93s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [14229], local_loss=0.02633909322321415, train_loss=0.028753414750099182, time_cost=6.860444784164429
+
Steps: 1%|▏ | 14229/1000000 [3:07:49<2993:25:48, 10.93s/it, lr=1e-5, step_loss=0.0263]
Steps: 1%|▏ | 14230/1000000 [3:08:03<3290:48:07, 12.02s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [14230], local_loss=0.007172807585448027, train_loss=0.03015379048883915, time_cost=8.695467472076416
+
Steps: 1%|▏ | 14230/1000000 [3:08:03<3290:48:07, 12.02s/it, lr=1e-5, step_loss=0.00717]
Steps: 1%|▏ | 14231/1000000 [3:08:15<3233:07:55, 11.81s/it, lr=1e-5, step_loss=0.00717][RANK-0]: Step: [14231], local_loss=0.019940154626965523, train_loss=0.0166607778519392, time_cost=3.0919203758239746
+
Steps: 1%|▏ | 14231/1000000 [3:08:15<3233:07:55, 11.81s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%|▏ | 14232/1000000 [3:08:29<3469:55:39, 12.67s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [14232], local_loss=0.05003245547413826, train_loss=0.19250354170799255, time_cost=10.911273717880249
+
Steps: 1%|▏ | 14232/1000000 [3:08:29<3469:55:39, 12.67s/it, lr=1e-5, step_loss=0.05]
Steps: 1%|▏ | 14233/1000000 [3:08:34<2834:57:54, 10.35s/it, lr=1e-5, step_loss=0.05][RANK-0]: Step: [14233], local_loss=0.011711026541888714, train_loss=0.04028692841529846, time_cost=3.6785099506378174
+
Steps: 1%|▏ | 14233/1000000 [3:08:34<2834:57:54, 10.35s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%|▏ | 14234/1000000 [3:08:44<2759:35:46, 10.08s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [14234], local_loss=0.011041651479899883, train_loss=0.020403781905770302, time_cost=4.4473490715026855
+
Steps: 1%|▏ | 14234/1000000 [3:08:44<2759:35:46, 10.08s/it, lr=1e-5, step_loss=0.011]
Steps: 1%|▏ | 14235/1000000 [3:08:51<2494:11:05, 9.11s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [14235], local_loss=0.061000287532806396, train_loss=0.1106918528676033, time_cost=3.018048048019409
+
Steps: 1%|▏ | 14235/1000000 [3:08:51<2494:11:05, 9.11s/it, lr=1e-5, step_loss=0.061]
Steps: 1%|▏ | 14236/1000000 [3:09:06<2988:23:24, 10.91s/it, lr=1e-5, step_loss=0.061][RANK-0]: Step: [14236], local_loss=0.025343868881464005, train_loss=0.04230586811900139, time_cost=6.549295663833618
+
Steps: 1%|▏ | 14236/1000000 [3:09:06<2988:23:24, 10.91s/it, lr=1e-5, step_loss=0.0253]
Steps: 1%|▏ | 14237/1000000 [3:09:17<2988:38:57, 10.91s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [14237], local_loss=0.013758780434727669, train_loss=0.014310291036963463, time_cost=3.5718038082122803
+
Steps: 1%|▏ | 14237/1000000 [3:09:17<2988:38:57, 10.91s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%|▏ | 14238/1000000 [3:09:33<3426:39:48, 12.51s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [14238], local_loss=0.014241650700569153, train_loss=0.6546061635017395, time_cost=13.320338010787964
+
Steps: 1%|▏ | 14238/1000000 [3:09:33<3426:39:48, 12.51s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%|▏ | 14239/1000000 [3:09:37<2737:35:54, 10.00s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [14239], local_loss=0.014421593397855759, train_loss=0.01880832575261593, time_cost=1.6538276672363281
+
Steps: 1%|▏ | 14239/1000000 [3:09:37<2737:35:54, 10.00s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%|▏ | 14240/1000000 [3:09:47<2710:36:42, 9.90s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [14240], local_loss=0.09079799056053162, train_loss=0.08085522800683975, time_cost=2.327352523803711
+
Steps: 1%|▏ | 14240/1000000 [3:09:47<2710:36:42, 9.90s/it, lr=1e-5, step_loss=0.0908]
Steps: 1%|▏ | 14241/1000000 [3:09:56<2638:33:44, 9.64s/it, lr=1e-5, step_loss=0.0908][RANK-0]: Step: [14241], local_loss=1.008179783821106, train_loss=0.17539694905281067, time_cost=3.6741068363189697
+
Steps: 1%|▏ | 14241/1000000 [3:09:56<2638:33:44, 9.64s/it, lr=1e-5, step_loss=1.01]
Steps: 1%|▏ | 14242/1000000 [3:10:04<2505:00:04, 9.15s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [14242], local_loss=0.017880354076623917, train_loss=0.11235956102609634, time_cost=4.802075386047363
+
Steps: 1%|▏ | 14242/1000000 [3:10:04<2505:00:04, 9.15s/it, lr=1e-5, step_loss=0.0179]
Steps: 1%|▏ | 14243/1000000 [3:10:14<2562:24:15, 9.36s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [14243], local_loss=0.016484582796692848, train_loss=0.06273200362920761, time_cost=2.8819780349731445
+
Steps: 1%|▏ | 14243/1000000 [3:10:14<2562:24:15, 9.36s/it, lr=1e-5, step_loss=0.0165]
Steps: 1%|▏ | 14244/1000000 [3:10:20<2293:15:11, 8.38s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [14244], local_loss=0.02358976937830448, train_loss=0.04574631154537201, time_cost=3.4673168659210205
+
Steps: 1%|▏ | 14244/1000000 [3:10:20<2293:15:11, 8.38s/it, lr=1e-5, step_loss=0.0236]
Steps: 1%|▏ | 14245/1000000 [3:10:26<2107:02:17, 7.69s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [14245], local_loss=0.009068929590284824, train_loss=0.03050316497683525, time_cost=4.597217082977295
+
Steps: 1%|▏ | 14245/1000000 [3:10:26<2107:02:17, 7.69s/it, lr=1e-5, step_loss=0.00907]
Steps: 1%|▏ | 14246/1000000 [3:10:37<2365:57:17, 8.64s/it, lr=1e-5, step_loss=0.00907][RANK-0]: Step: [14246], local_loss=0.065425343811512, train_loss=0.07899083197116852, time_cost=1.2950668334960938
+
Steps: 1%|▏ | 14246/1000000 [3:10:37<2365:57:17, 8.64s/it, lr=1e-5, step_loss=0.0654]
Steps: 1%|▏ | 14247/1000000 [3:10:51<2791:31:02, 10.19s/it, lr=1e-5, step_loss=0.0654][RANK-0]: Step: [14247], local_loss=0.01650976575911045, train_loss=0.032604411244392395, time_cost=4.883599042892456
+
Steps: 1%|▏ | 14247/1000000 [3:10:51<2791:31:02, 10.19s/it, lr=1e-5, step_loss=0.0165]
Steps: 1%|▏ | 14248/1000000 [3:10:56<2378:04:12, 8.68s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [14248], local_loss=0.03653625026345253, train_loss=0.03562779724597931, time_cost=1.279280185699463
+
Steps: 1%|▏ | 14248/1000000 [3:10:56<2378:04:12, 8.68s/it, lr=1e-5, step_loss=0.0365]
Steps: 1%|▏ | 14249/1000000 [3:11:00<2032:09:10, 7.42s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [14249], local_loss=0.00822453759610653, train_loss=0.17034833133220673, time_cost=1.6513292789459229
+
Steps: 1%|▏ | 14249/1000000 [3:11:00<2032:09:10, 7.42s/it, lr=1e-5, step_loss=0.00822]
Steps: 1%|▏ | 14250/1000000 [3:11:08<2079:47:51, 7.60s/it, lr=1e-5, step_loss=0.00822][RANK-0]: Step: [14250], local_loss=0.04577897489070892, train_loss=0.08050478994846344, time_cost=2.923978328704834
+
Steps: 1%|▏ | 14250/1000000 [3:11:08<2079:47:51, 7.60s/it, lr=1e-5, step_loss=0.0458]
Steps: 1%|▏ | 14251/1000000 [3:11:14<1927:03:04, 7.04s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [14251], local_loss=0.02242211624979973, train_loss=0.09034682810306549, time_cost=1.483139991760254
+
Steps: 1%|▏ | 14251/1000000 [3:11:14<1927:03:04, 7.04s/it, lr=1e-5, step_loss=0.0224]
Steps: 1%|▏ | 14252/1000000 [3:11:25<2250:24:16, 8.22s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [14252], local_loss=0.03799359127879143, train_loss=54.276039123535156, time_cost=1.3746063709259033
+
Steps: 1%|▏ | 14252/1000000 [3:11:25<2250:24:16, 8.22s/it, lr=1e-5, step_loss=0.038]
Steps: 1%|▏ | 14253/1000000 [3:11:36<2504:53:00, 9.15s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [14253], local_loss=0.053929828107357025, train_loss=0.023699695244431496, time_cost=2.523909568786621
+
Steps: 1%|▏ | 14253/1000000 [3:11:36<2504:53:00, 9.15s/it, lr=1e-5, step_loss=0.0539]
Steps: 1%|▏ | 14254/1000000 [3:11:50<2896:49:39, 10.58s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [14254], local_loss=0.04684378579258919, train_loss=0.01957099512219429, time_cost=11.86349105834961
+
Steps: 1%|▏ | 14254/1000000 [3:11:50<2896:49:39, 10.58s/it, lr=1e-5, step_loss=0.0468]
Steps: 1%|▏ | 14255/1000000 [3:11:57<2610:58:18, 9.54s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [14255], local_loss=0.013814290054142475, train_loss=0.012206792831420898, time_cost=2.1746015548706055
+
Steps: 1%|▏ | 14255/1000000 [3:11:57<2610:58:18, 9.54s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%|▏ | 14256/1000000 [3:12:04<2364:59:12, 8.64s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [14256], local_loss=0.02957945317029953, train_loss=0.03347868472337723, time_cost=1.2112791538238525
+
Steps: 1%|▏ | 14256/1000000 [3:12:04<2364:59:12, 8.64s/it, lr=1e-5, step_loss=0.0296]
Steps: 1%|▏ | 14257/1000000 [3:12:12<2325:37:32, 8.49s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [14257], local_loss=0.01716180145740509, train_loss=0.06458087265491486, time_cost=4.092110633850098
+
Steps: 1%|▏ | 14257/1000000 [3:12:12<2325:37:32, 8.49s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%|▏ | 14258/1000000 [3:12:24<2583:44:25, 9.44s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [14258], local_loss=0.06533925235271454, train_loss=0.03301844000816345, time_cost=3.3851261138916016
+
Steps: 1%|▏ | 14258/1000000 [3:12:24<2583:44:25, 9.44s/it, lr=1e-5, step_loss=0.0653]
Steps: 1%|▏ | 14259/1000000 [3:12:37<2912:55:59, 10.64s/it, lr=1e-5, step_loss=0.0653][RANK-0]: Step: [14259], local_loss=0.23943911492824554, train_loss=0.055802956223487854, time_cost=11.265611410140991
+
Steps: 1%|▏ | 14259/1000000 [3:12:37<2912:55:59, 10.64s/it, lr=1e-5, step_loss=0.239]
Steps: 1%|▏ | 14260/1000000 [3:12:44<2622:21:00, 9.58s/it, lr=1e-5, step_loss=0.239][RANK-0]: Step: [14260], local_loss=0.06483659148216248, train_loss=0.027867164462804794, time_cost=1.2150604724884033
+
Steps: 1%|▏ | 14260/1000000 [3:12:44<2622:21:00, 9.58s/it, lr=1e-5, step_loss=0.0648]
Steps: 1%|▏ | 14261/1000000 [3:12:53<2529:40:30, 9.24s/it, lr=1e-5, step_loss=0.0648][RANK-0]: Step: [14261], local_loss=0.02131643332540989, train_loss=0.0797962173819542, time_cost=6.120904207229614
+
Steps: 1%|▏ | 14261/1000000 [3:12:53<2529:40:30, 9.24s/it, lr=1e-5, step_loss=0.0213]
Steps: 1%|▏ | 14262/1000000 [3:13:01<2460:47:40, 8.99s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [14262], local_loss=0.03551629185676575, train_loss=0.07206873595714569, time_cost=3.0642542839050293
+
Steps: 1%|▏ | 14262/1000000 [3:13:01<2460:47:40, 8.99s/it, lr=1e-5, step_loss=0.0355]
Steps: 1%|▏ | 14263/1000000 [3:13:08<2299:30:54, 8.40s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [14263], local_loss=0.01736973226070404, train_loss=0.03500639647245407, time_cost=2.1670875549316406
+
Steps: 1%|▏ | 14263/1000000 [3:13:08<2299:30:54, 8.40s/it, lr=1e-5, step_loss=0.0174]
Steps: 1%|▏ | 14264/1000000 [3:13:19<2509:39:33, 9.17s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [14264], local_loss=0.08140674978494644, train_loss=0.05040644109249115, time_cost=1.2751274108886719
+
Steps: 1%|▏ | 14264/1000000 [3:13:19<2509:39:33, 9.17s/it, lr=1e-5, step_loss=0.0814]
Steps: 1%|▏ | 14265/1000000 [3:13:29<2613:08:21, 9.54s/it, lr=1e-5, step_loss=0.0814][RANK-0]: Step: [14265], local_loss=0.3581782877445221, train_loss=0.06020989269018173, time_cost=2.9135799407958984
+
Steps: 1%|▏ | 14265/1000000 [3:13:29<2613:08:21, 9.54s/it, lr=1e-5, step_loss=0.358]
Steps: 1%|▏ | 14266/1000000 [3:13:42<2844:48:20, 10.39s/it, lr=1e-5, step_loss=0.358][RANK-0]: Step: [14266], local_loss=0.028340652585029602, train_loss=0.03167849779129028, time_cost=1.2467126846313477
+
Steps: 1%|▏ | 14266/1000000 [3:13:42<2844:48:20, 10.39s/it, lr=1e-5, step_loss=0.0283]
Steps: 1%|▏ | 14267/1000000 [3:13:49<2579:19:37, 9.42s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [14267], local_loss=0.021725790575146675, train_loss=0.03148111701011658, time_cost=2.5889029502868652
+
Steps: 1%|▏ | 14267/1000000 [3:13:49<2579:19:37, 9.42s/it, lr=1e-5, step_loss=0.0217]
Steps: 1%|▏ | 14268/1000000 [3:14:01<2803:58:19, 10.24s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [14268], local_loss=0.026620153337717056, train_loss=0.04285259172320366, time_cost=2.419443368911743
+
Steps: 1%|▏ | 14268/1000000 [3:14:01<2803:58:19, 10.24s/it, lr=1e-5, step_loss=0.0266]
Steps: 1%|▏ | 14269/1000000 [3:14:14<3008:22:55, 10.99s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [14269], local_loss=0.011772267520427704, train_loss=0.04189257696270943, time_cost=1.2159273624420166
+
Steps: 1%|▏ | 14269/1000000 [3:14:14<3008:22:55, 10.99s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 14270/1000000 [3:14:19<2532:46:22, 9.25s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14270], local_loss=0.015810158103704453, train_loss=0.02890663780272007, time_cost=1.8921008110046387
+
Steps: 1%|▏ | 14270/1000000 [3:14:19<2532:46:22, 9.25s/it, lr=1e-5, step_loss=0.0158]
Steps: 1%|▏ | 14271/1000000 [3:14:28<2520:21:03, 9.20s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [14271], local_loss=0.05985049903392792, train_loss=0.03393708914518356, time_cost=2.9652814865112305
+
Steps: 1%|▏ | 14271/1000000 [3:14:28<2520:21:03, 9.20s/it, lr=1e-5, step_loss=0.0599]
Steps: 1%|▏ | 14272/1000000 [3:14:35<2343:47:54, 8.56s/it, lr=1e-5, step_loss=0.0599][RANK-0]: Step: [14272], local_loss=0.008734343573451042, train_loss=0.026861805468797684, time_cost=6.112222909927368
+
Steps: 1%|▏ | 14272/1000000 [3:14:35<2343:47:54, 8.56s/it, lr=1e-5, step_loss=0.00873]
Steps: 1%|▏ | 14273/1000000 [3:14:43<2267:51:08, 8.28s/it, lr=1e-5, step_loss=0.00873][RANK-0]: Step: [14273], local_loss=0.03347863256931305, train_loss=0.05579090863466263, time_cost=1.459542989730835
+
Steps: 1%|▏ | 14273/1000000 [3:14:43<2267:51:08, 8.28s/it, lr=1e-5, step_loss=0.0335]
Steps: 1%|▏ | 14274/1000000 [3:14:54<2548:11:20, 9.31s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [14274], local_loss=0.01695733703672886, train_loss=0.07649138569831848, time_cost=1.2263109683990479
+
Steps: 1%|▏ | 14274/1000000 [3:14:54<2548:11:20, 9.31s/it, lr=1e-5, step_loss=0.017]
Steps: 1%|▏ | 14275/1000000 [3:15:05<2640:31:14, 9.64s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [14275], local_loss=0.03406922519207001, train_loss=0.02170989289879799, time_cost=3.080545425415039
+
Steps: 1%|▏ | 14275/1000000 [3:15:05<2640:31:14, 9.64s/it, lr=1e-5, step_loss=0.0341]
Steps: 1%|▏ | 14276/1000000 [3:15:11<2339:56:44, 8.55s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [14276], local_loss=0.11041990667581558, train_loss=0.05789492651820183, time_cost=1.7299325466156006
+
Steps: 1%|▏ | 14276/1000000 [3:15:11<2339:56:44, 8.55s/it, lr=1e-5, step_loss=0.11]
Steps: 1%|▏ | 14277/1000000 [3:15:26<2892:19:25, 10.56s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [14277], local_loss=0.015044893138110638, train_loss=0.028046974912285805, time_cost=13.138832807540894
+
Steps: 1%|▏ | 14277/1000000 [3:15:26<2892:19:25, 10.56s/it, lr=1e-5, step_loss=0.015]
Steps: 1%|▏ | 14278/1000000 [3:15:31<2442:26:04, 8.92s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [14278], local_loss=0.02556581422686577, train_loss=0.03527949005365372, time_cost=1.9605309963226318
+
Steps: 1%|▏ | 14278/1000000 [3:15:31<2442:26:04, 8.92s/it, lr=1e-5, step_loss=0.0256]
Steps: 1%|▏ | 14279/1000000 [3:15:39<2333:57:39, 8.52s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [14279], local_loss=0.047801751643419266, train_loss=0.07747282087802887, time_cost=1.2264618873596191
+
Steps: 1%|▏ | 14279/1000000 [3:15:39<2333:57:39, 8.52s/it, lr=1e-5, step_loss=0.0478]
Steps: 1%|▏ | 14280/1000000 [3:15:50<2562:16:56, 9.36s/it, lr=1e-5, step_loss=0.0478][RANK-0]: Step: [14280], local_loss=0.171421617269516, train_loss=0.048345714807510376, time_cost=3.892334222793579
+
Steps: 1%|▏ | 14280/1000000 [3:15:50<2562:16:56, 9.36s/it, lr=1e-5, step_loss=0.171]
Steps: 1%|▏ | 14281/1000000 [3:15:58<2420:47:54, 8.84s/it, lr=1e-5, step_loss=0.171][RANK-0]: Step: [14281], local_loss=0.012148861773312092, train_loss=0.03229408711194992, time_cost=2.290172815322876
+
Steps: 1%|▏ | 14281/1000000 [3:15:58<2420:47:54, 8.84s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%|▏ | 14282/1000000 [3:16:03<2135:08:14, 7.80s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [14282], local_loss=0.05745472386479378, train_loss=0.019674990326166153, time_cost=2.024303913116455
+
Steps: 1%|▏ | 14282/1000000 [3:16:03<2135:08:14, 7.80s/it, lr=1e-5, step_loss=0.0575]
Steps: 1%|▏ | 14283/1000000 [3:16:09<1978:17:57, 7.23s/it, lr=1e-5, step_loss=0.0575][RANK-0]: Step: [14283], local_loss=0.04113392531871796, train_loss=0.05485644191503525, time_cost=2.981437921524048
+
Steps: 1%|▏ | 14283/1000000 [3:16:09<1978:17:57, 7.23s/it, lr=1e-5, step_loss=0.0411]
Steps: 1%|▏ | 14284/1000000 [3:16:18<2157:33:51, 7.88s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [14284], local_loss=0.018958406522870064, train_loss=0.03478927165269852, time_cost=2.495028018951416
+
Steps: 1%|▏ | 14284/1000000 [3:16:18<2157:33:51, 7.88s/it, lr=1e-5, step_loss=0.019]
Steps: 1%|▏ | 14285/1000000 [3:16:25<2079:53:49, 7.60s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [14285], local_loss=0.14542272686958313, train_loss=0.14927838742733002, time_cost=1.2607271671295166
+
Steps: 1%|▏ | 14285/1000000 [3:16:25<2079:53:49, 7.60s/it, lr=1e-5, step_loss=0.145]
Steps: 1%|▏ | 14286/1000000 [3:16:37<2406:47:36, 8.79s/it, lr=1e-5, step_loss=0.145][RANK-0]: Step: [14286], local_loss=0.02777276746928692, train_loss=0.06040891259908676, time_cost=2.8695197105407715
+
Steps: 1%|▏ | 14286/1000000 [3:16:37<2406:47:36, 8.79s/it, lr=1e-5, step_loss=0.0278]
Steps: 1%|▏ | 14287/1000000 [3:16:50<2765:27:43, 10.10s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [14287], local_loss=0.045159634202718735, train_loss=0.14794407784938812, time_cost=4.459010362625122
+
Steps: 1%|▏ | 14287/1000000 [3:16:50<2765:27:43, 10.10s/it, lr=1e-5, step_loss=0.0452]
Steps: 1%|▏ | 14288/1000000 [3:16:56<2407:51:32, 8.79s/it, lr=1e-5, step_loss=0.0452][RANK-0]: Step: [14288], local_loss=0.007996127009391785, train_loss=0.02992672100663185, time_cost=2.748748540878296
+
Steps: 1%|▏ | 14288/1000000 [3:16:56<2407:51:32, 8.79s/it, lr=1e-5, step_loss=0.008]
Steps: 1%|▏ | 14289/1000000 [3:17:03<2305:51:25, 8.42s/it, lr=1e-5, step_loss=0.008][RANK-0]: Step: [14289], local_loss=0.4817882478237152, train_loss=0.08074834942817688, time_cost=1.8665962219238281
+
Steps: 1%|▏ | 14289/1000000 [3:17:03<2305:51:25, 8.42s/it, lr=1e-5, step_loss=0.482]
Steps: 1%|▏ | 14290/1000000 [3:17:19<2875:01:04, 10.50s/it, lr=1e-5, step_loss=0.482][RANK-0]: Step: [14290], local_loss=0.018902208656072617, train_loss=0.019764918833971024, time_cost=5.324578523635864
+
Steps: 1%|▏ | 14290/1000000 [3:17:19<2875:01:04, 10.50s/it, lr=1e-5, step_loss=0.0189]
Steps: 1%|▏ | 14291/1000000 [3:17:31<3042:39:23, 11.11s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [14291], local_loss=0.03937944397330284, train_loss=0.021972741931676865, time_cost=1.2411465644836426
+
Steps: 1%|▏ | 14291/1000000 [3:17:31<3042:39:23, 11.11s/it, lr=1e-5, step_loss=0.0394]
Steps: 1%|▏ | 14292/1000000 [3:17:37<2583:33:39, 9.44s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [14292], local_loss=0.03813054785132408, train_loss=0.020816141739487648, time_cost=2.7440569400787354
+
Steps: 1%|▏ | 14292/1000000 [3:17:37<2583:33:39, 9.44s/it, lr=1e-5, step_loss=0.0381]
Steps: 1%|▏ | 14293/1000000 [3:17:42<2237:06:36, 8.17s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [14293], local_loss=0.038464564830064774, train_loss=0.03641171753406525, time_cost=2.2230374813079834
+
Steps: 1%|▏ | 14293/1000000 [3:17:42<2237:06:36, 8.17s/it, lr=1e-5, step_loss=0.0385]
Steps: 1%|▏ | 14294/1000000 [3:17:56<2719:30:55, 9.93s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [14294], local_loss=0.028778212144970894, train_loss=0.0421404093503952, time_cost=2.309908151626587
+
Steps: 1%|▏ | 14294/1000000 [3:17:56<2719:30:55, 9.93s/it, lr=1e-5, step_loss=0.0288]
Steps: 1%|▏ | 14295/1000000 [3:18:02<2355:11:32, 8.60s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [14295], local_loss=0.1571832150220871, train_loss=0.166886568069458, time_cost=1.653005838394165
+
Steps: 1%|▏ | 14295/1000000 [3:18:02<2355:11:32, 8.60s/it, lr=1e-5, step_loss=0.157]
Steps: 1%|▏ | 14296/1000000 [3:18:16<2848:36:46, 10.40s/it, lr=1e-5, step_loss=0.157][RANK-0]: Step: [14296], local_loss=0.013931029476225376, train_loss=0.06251650303602219, time_cost=10.800114870071411
+
Steps: 1%|▏ | 14296/1000000 [3:18:16<2848:36:46, 10.40s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%|▏ | 14297/1000000 [3:18:28<2947:23:01, 10.76s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [14297], local_loss=0.03988822177052498, train_loss=0.013466423377394676, time_cost=1.2272896766662598
+
Steps: 1%|▏ | 14297/1000000 [3:18:28<2947:23:01, 10.76s/it, lr=1e-5, step_loss=0.0399]
Steps: 1%|▏ | 14298/1000000 [3:18:32<2413:30:04, 8.81s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [14298], local_loss=0.0393458716571331, train_loss=0.05568639561533928, time_cost=1.5290305614471436
+
Steps: 1%|▏ | 14298/1000000 [3:18:32<2413:30:04, 8.81s/it, lr=1e-5, step_loss=0.0393]
Steps: 1%|▏ | 14299/1000000 [3:18:39<2277:25:37, 8.32s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [14299], local_loss=0.00991123728454113, train_loss=0.049722958356142044, time_cost=2.62005877494812
+
Steps: 1%|▏ | 14299/1000000 [3:18:39<2277:25:37, 8.32s/it, lr=1e-5, step_loss=0.00991]
Steps: 1%|▏ | 14300/1000000 [3:18:46<2177:24:46, 7.95s/it, lr=1e-5, step_loss=0.00991][RANK-0]: Step: [14300], local_loss=0.005859286990016699, train_loss=18.441606521606445, time_cost=4.078344821929932
+
Steps: 1%|▏ | 14300/1000000 [3:18:46<2177:24:46, 7.95s/it, lr=1e-5, step_loss=0.00586]
Steps: 1%|▏ | 14301/1000000 [3:18:55<2228:11:38, 8.14s/it, lr=1e-5, step_loss=0.00586][RANK-0]: Step: [14301], local_loss=0.02151058241724968, train_loss=0.03786458820104599, time_cost=1.2209842205047607
+
Steps: 1%|▏ | 14301/1000000 [3:18:55<2228:11:38, 8.14s/it, lr=1e-5, step_loss=0.0215]
Steps: 1%|▏ | 14302/1000000 [3:19:01<2029:15:36, 7.41s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [14302], local_loss=0.016379591077566147, train_loss=0.022290412336587906, time_cost=3.3087198734283447
+
Steps: 1%|▏ | 14302/1000000 [3:19:01<2029:15:36, 7.41s/it, lr=1e-5, step_loss=0.0164]
Steps: 1%|▏ | 14303/1000000 [3:19:11<2259:02:25, 8.25s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [14303], local_loss=0.027287263423204422, train_loss=0.026034193113446236, time_cost=1.7598190307617188
+
Steps: 1%|▏ | 14303/1000000 [3:19:11<2259:02:25, 8.25s/it, lr=1e-5, step_loss=0.0273]
Steps: 1%|▏ | 14304/1000000 [3:19:18<2152:24:14, 7.86s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [14304], local_loss=0.028788337484002113, train_loss=0.045235056430101395, time_cost=2.4268691539764404
+
Steps: 1%|▏ | 14304/1000000 [3:19:18<2152:24:14, 7.86s/it, lr=1e-5, step_loss=0.0288]
Steps: 1%|▏ | 14305/1000000 [3:19:22<1866:33:28, 6.82s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [14305], local_loss=0.01361401379108429, train_loss=0.07362444698810577, time_cost=1.520453929901123
+
Steps: 1%|▏ | 14305/1000000 [3:19:22<1866:33:28, 6.82s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%|▏ | 14306/1000000 [3:19:38<2573:31:09, 9.40s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [14306], local_loss=0.01145892683416605, train_loss=0.15578316152095795, time_cost=1.579416275024414
+
Steps: 1%|▏ | 14306/1000000 [3:19:38<2573:31:09, 9.40s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%|▏ | 14307/1000000 [3:19:42<2198:14:44, 8.03s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [14307], local_loss=0.04909933730959892, train_loss=0.025152940303087234, time_cost=2.4335556030273438
+
Steps: 1%|▏ | 14307/1000000 [3:19:42<2198:14:44, 8.03s/it, lr=1e-5, step_loss=0.0491]
Steps: 1%|▏ | 14308/1000000 [3:19:47<1958:34:52, 7.15s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [14308], local_loss=0.09083318710327148, train_loss=0.06602328270673752, time_cost=2.4192709922790527
+
Steps: 1%|▏ | 14308/1000000 [3:19:47<1958:34:52, 7.15s/it, lr=1e-5, step_loss=0.0908]
Steps: 1%|▏ | 14309/1000000 [3:19:56<2067:04:31, 7.55s/it, lr=1e-5, step_loss=0.0908][RANK-0]: Step: [14309], local_loss=0.008074806071817875, train_loss=0.025669937953352928, time_cost=1.686422348022461
+
Steps: 1%|▏ | 14309/1000000 [3:19:56<2067:04:31, 7.55s/it, lr=1e-5, step_loss=0.00807]
Steps: 1%|▏ | 14310/1000000 [3:20:03<2030:00:22, 7.41s/it, lr=1e-5, step_loss=0.00807][RANK-0]: Step: [14310], local_loss=0.008744632825255394, train_loss=0.07886673510074615, time_cost=1.7611174583435059
+
Steps: 1%|▏ | 14310/1000000 [3:20:03<2030:00:22, 7.41s/it, lr=1e-5, step_loss=0.00874]
Steps: 1%|▏ | 14311/1000000 [3:20:16<2523:06:30, 9.22s/it, lr=1e-5, step_loss=0.00874][RANK-0]: Step: [14311], local_loss=0.019902130588889122, train_loss=0.018504036590456963, time_cost=4.070356607437134
+
Steps: 1%|▏ | 14311/1000000 [3:20:16<2523:06:30, 9.22s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%|▏ | 14312/1000000 [3:20:27<2630:05:32, 9.61s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [14312], local_loss=0.008336019702255726, train_loss=0.0151072908192873, time_cost=1.2094800472259521
+
Steps: 1%|▏ | 14312/1000000 [3:20:27<2630:05:32, 9.61s/it, lr=1e-5, step_loss=0.00834]
Steps: 1%|▏ | 14313/1000000 [3:20:34<2440:51:33, 8.91s/it, lr=1e-5, step_loss=0.00834][RANK-0]: Step: [14313], local_loss=0.004275171086192131, train_loss=0.05253658816218376, time_cost=1.5796418190002441
+
Steps: 1%|▏ | 14313/1000000 [3:20:34<2440:51:33, 8.91s/it, lr=1e-5, step_loss=0.00428]
Steps: 1%|▏ | 14314/1000000 [3:20:41<2298:22:45, 8.39s/it, lr=1e-5, step_loss=0.00428][RANK-0]: Step: [14314], local_loss=0.005450072232633829, train_loss=0.02826218493282795, time_cost=1.6633381843566895
+
Steps: 1%|▏ | 14314/1000000 [3:20:41<2298:22:45, 8.39s/it, lr=1e-5, step_loss=0.00545]
Steps: 1%|▏ | 14315/1000000 [3:20:51<2352:47:15, 8.59s/it, lr=1e-5, step_loss=0.00545][RANK-0]: Step: [14315], local_loss=0.007649563252925873, train_loss=0.023819968104362488, time_cost=1.2296051979064941
+
Steps: 1%|▏ | 14315/1000000 [3:20:51<2352:47:15, 8.59s/it, lr=1e-5, step_loss=0.00765]
Steps: 1%|▏ | 14316/1000000 [3:21:00<2417:23:53, 8.83s/it, lr=1e-5, step_loss=0.00765][RANK-0]: Step: [14316], local_loss=0.03791068494319916, train_loss=0.14950227737426758, time_cost=7.050447702407837
+
Steps: 1%|▏ | 14316/1000000 [3:21:00<2417:23:53, 8.83s/it, lr=1e-5, step_loss=0.0379]
Steps: 1%|▏ | 14317/1000000 [3:21:12<2682:33:38, 9.80s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [14317], local_loss=0.006898154504597187, train_loss=0.1360742747783661, time_cost=3.6964385509490967
+
Steps: 1%|▏ | 14317/1000000 [3:21:12<2682:33:38, 9.80s/it, lr=1e-5, step_loss=0.0069]
Steps: 1%|▏ | 14318/1000000 [3:21:20<2523:17:08, 9.22s/it, lr=1e-5, step_loss=0.0069][RANK-0]: Step: [14318], local_loss=0.004736484028398991, train_loss=0.01958061009645462, time_cost=3.765845537185669
+
Steps: 1%|▏ | 14318/1000000 [3:21:20<2523:17:08, 9.22s/it, lr=1e-5, step_loss=0.00474]
Steps: 1%|▏ | 14319/1000000 [3:21:29<2543:50:07, 9.29s/it, lr=1e-5, step_loss=0.00474][RANK-0]: Step: [14319], local_loss=0.025537818670272827, train_loss=0.03216013312339783, time_cost=1.8985018730163574
+
Steps: 1%|▏ | 14319/1000000 [3:21:29<2543:50:07, 9.29s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%|▏ | 14320/1000000 [3:21:36<2342:19:52, 8.55s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [14320], local_loss=0.00720202224329114, train_loss=0.07914475351572037, time_cost=2.5005569458007812
+
Steps: 1%|▏ | 14320/1000000 [3:21:36<2342:19:52, 8.55s/it, lr=1e-5, step_loss=0.0072]
Steps: 1%|▏ | 14321/1000000 [3:21:41<2039:52:34, 7.45s/it, lr=1e-5, step_loss=0.0072][RANK-0]: Step: [14321], local_loss=0.01352495327591896, train_loss=0.02279367670416832, time_cost=2.213189125061035
+
Steps: 1%|▏ | 14321/1000000 [3:21:41<2039:52:34, 7.45s/it, lr=1e-5, step_loss=0.0135]
Steps: 1%|▏ | 14322/1000000 [3:21:45<1783:55:21, 6.52s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [14322], local_loss=0.009713314473628998, train_loss=0.08297790586948395, time_cost=1.4264283180236816
+
Steps: 1%|▏ | 14322/1000000 [3:21:45<1783:55:21, 6.52s/it, lr=1e-5, step_loss=0.00971]
Steps: 1%|▏ | 14323/1000000 [3:21:50<1604:57:15, 5.86s/it, lr=1e-5, step_loss=0.00971][RANK-0]: Step: [14323], local_loss=0.008354231715202332, train_loss=0.020509198307991028, time_cost=1.7985835075378418
+
Steps: 1%|▏ | 14323/1000000 [3:21:50<1604:57:15, 5.86s/it, lr=1e-5, step_loss=0.00835]
Steps: 1%|▏ | 14324/1000000 [3:22:00<1973:05:16, 7.21s/it, lr=1e-5, step_loss=0.00835][RANK-0]: Step: [14324], local_loss=0.11807968467473984, train_loss=0.03827411308884621, time_cost=1.2321336269378662
+
Steps: 1%|▏ | 14324/1000000 [3:22:00<1973:05:16, 7.21s/it, lr=1e-5, step_loss=0.118]
Steps: 1%|▏ | 14325/1000000 [3:22:13<2460:21:06, 8.99s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [14325], local_loss=0.01691843569278717, train_loss=0.020218193531036377, time_cost=3.6383025646209717
+
Steps: 1%|▏ | 14325/1000000 [3:22:13<2460:21:06, 8.99s/it, lr=1e-5, step_loss=0.0169]
Steps: 1%|▏ | 14326/1000000 [3:22:18<2100:45:19, 7.67s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [14326], local_loss=0.014083553105592728, train_loss=0.02566804736852646, time_cost=1.813035249710083
+
Steps: 1%|▏ | 14326/1000000 [3:22:18<2100:45:19, 7.67s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%|▏ | 14327/1000000 [3:22:28<2293:55:54, 8.38s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [14327], local_loss=0.02331496775150299, train_loss=37.360172271728516, time_cost=1.3205785751342773
+
Steps: 1%|▏ | 14327/1000000 [3:22:28<2293:55:54, 8.38s/it, lr=1e-5, step_loss=0.0233]
Steps: 1%|▏ | 14328/1000000 [3:22:37<2373:18:23, 8.67s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [14328], local_loss=0.02035505883395672, train_loss=0.04029957577586174, time_cost=3.0515754222869873
+
Steps: 1%|▏ | 14328/1000000 [3:22:37<2373:18:23, 8.67s/it, lr=1e-5, step_loss=0.0204]
Steps: 1%|▏ | 14329/1000000 [3:22:45<2308:13:19, 8.43s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [14329], local_loss=0.006816389970481396, train_loss=0.01917438954114914, time_cost=2.0510644912719727
+
Steps: 1%|▏ | 14329/1000000 [3:22:45<2308:13:19, 8.43s/it, lr=1e-5, step_loss=0.00682]
Steps: 1%|▏ | 14330/1000000 [3:22:54<2395:41:07, 8.75s/it, lr=1e-5, step_loss=0.00682][RANK-0]: Step: [14330], local_loss=0.020794274285435677, train_loss=0.015188025310635567, time_cost=1.9843485355377197
+
Steps: 1%|▏ | 14330/1000000 [3:22:54<2395:41:07, 8.75s/it, lr=1e-5, step_loss=0.0208]
Steps: 1%|▏ | 14331/1000000 [3:23:05<2516:05:37, 9.19s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [14331], local_loss=0.03038283996284008, train_loss=0.07247801870107651, time_cost=4.501598596572876
+
Steps: 1%|▏ | 14331/1000000 [3:23:05<2516:05:37, 9.19s/it, lr=1e-5, step_loss=0.0304]
Steps: 1%|▏ | 14332/1000000 [3:23:11<2276:31:33, 8.31s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [14332], local_loss=0.10848940908908844, train_loss=0.028999043628573418, time_cost=4.061704158782959
+
Steps: 1%|▏ | 14332/1000000 [3:23:11<2276:31:33, 8.31s/it, lr=1e-5, step_loss=0.108]
Steps: 1%|▏ | 14333/1000000 [3:23:15<1946:35:47, 7.11s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [14333], local_loss=0.1263912469148636, train_loss=0.03619540110230446, time_cost=1.4632506370544434
+
Steps: 1%|▏ | 14333/1000000 [3:23:15<1946:35:47, 7.11s/it, lr=1e-5, step_loss=0.126]
Steps: 1%|▏ | 14334/1000000 [3:23:23<2008:51:18, 7.34s/it, lr=1e-5, step_loss=0.126][RANK-0]: Step: [14334], local_loss=0.04290202260017395, train_loss=0.053460489958524704, time_cost=3.566885471343994
+
Steps: 1%|▏ | 14334/1000000 [3:23:23<2008:51:18, 7.34s/it, lr=1e-5, step_loss=0.0429]
Steps: 1%|▏ | 14335/1000000 [3:23:32<2164:59:30, 7.91s/it, lr=1e-5, step_loss=0.0429][RANK-0]: Step: [14335], local_loss=0.04291331768035889, train_loss=0.04267922043800354, time_cost=3.959965467453003
+
Steps: 1%|▏ | 14335/1000000 [3:23:32<2164:59:30, 7.91s/it, lr=1e-5, step_loss=0.0429]
Steps: 1%|▏ | 14336/1000000 [3:23:40<2145:21:19, 7.84s/it, lr=1e-5, step_loss=0.0429][RANK-0]: Step: [14336], local_loss=0.0875944048166275, train_loss=11.843962669372559, time_cost=4.976382255554199
+
Steps: 1%|▏ | 14336/1000000 [3:23:40<2145:21:19, 7.84s/it, lr=1e-5, step_loss=0.0876]
Steps: 1%|▏ | 14337/1000000 [3:23:51<2390:38:04, 8.73s/it, lr=1e-5, step_loss=0.0876][RANK-0]: Step: [14337], local_loss=0.07180777192115784, train_loss=0.04222340136766434, time_cost=2.8254547119140625
+
Steps: 1%|▏ | 14337/1000000 [3:23:51<2390:38:04, 8.73s/it, lr=1e-5, step_loss=0.0718]
Steps: 1%|▏ | 14338/1000000 [3:23:56<2122:35:26, 7.75s/it, lr=1e-5, step_loss=0.0718][RANK-0]: Step: [14338], local_loss=0.008235113695263863, train_loss=0.018507257103919983, time_cost=2.7281370162963867
+
Steps: 1%|▏ | 14338/1000000 [3:23:56<2122:35:26, 7.75s/it, lr=1e-5, step_loss=0.00824]
Steps: 1%|▏ | 14339/1000000 [3:24:08<2406:16:33, 8.79s/it, lr=1e-5, step_loss=0.00824][RANK-0]: Step: [14339], local_loss=0.020419398322701454, train_loss=0.021611008793115616, time_cost=1.5136592388153076
+
Steps: 1%|▏ | 14339/1000000 [3:24:08<2406:16:33, 8.79s/it, lr=1e-5, step_loss=0.0204]
Steps: 1%|▏ | 14340/1000000 [3:24:17<2432:09:19, 8.88s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [14340], local_loss=0.008692378178238869, train_loss=0.011100146919488907, time_cost=7.536011457443237
+
Steps: 1%|▏ | 14340/1000000 [3:24:17<2432:09:19, 8.88s/it, lr=1e-5, step_loss=0.00869]
Steps: 1%|▏ | 14341/1000000 [3:24:32<2940:18:36, 10.74s/it, lr=1e-5, step_loss=0.00869][RANK-0]: Step: [14341], local_loss=0.011930703185498714, train_loss=0.046055130660533905, time_cost=3.649395227432251
+
Steps: 1%|▏ | 14341/1000000 [3:24:32<2940:18:36, 10.74s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%|▏ | 14342/1000000 [3:24:37<2464:12:44, 9.00s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [14342], local_loss=0.01672055758535862, train_loss=0.04346342012286186, time_cost=2.076040267944336
+
Steps: 1%|▏ | 14342/1000000 [3:24:37<2464:12:44, 9.00s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%|▏ | 14343/1000000 [3:24:42<2158:49:30, 7.88s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [14343], local_loss=0.03612305968999863, train_loss=0.03040395677089691, time_cost=2.3340930938720703
+
Steps: 1%|▏ | 14343/1000000 [3:24:42<2158:49:30, 7.88s/it, lr=1e-5, step_loss=0.0361]
Steps: 1%|▏ | 14344/1000000 [3:24:51<2237:00:59, 8.17s/it, lr=1e-5, step_loss=0.0361][RANK-0]: Step: [14344], local_loss=0.021739711984992027, train_loss=0.02446732670068741, time_cost=5.862530946731567
+
Steps: 1%|▏ | 14344/1000000 [3:24:51<2237:00:59, 8.17s/it, lr=1e-5, step_loss=0.0217]
Steps: 1%|▏ | 14345/1000000 [3:24:55<1913:34:02, 6.99s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [14345], local_loss=0.04483717307448387, train_loss=0.05737285315990448, time_cost=1.7075743675231934
+
Steps: 1%|▏ | 14345/1000000 [3:24:55<1913:34:02, 6.99s/it, lr=1e-5, step_loss=0.0448]
Steps: 1%|▏ | 14346/1000000 [3:24:59<1683:33:28, 6.15s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [14346], local_loss=0.9883979558944702, train_loss=0.24511423707008362, time_cost=1.5021443367004395
+
Steps: 1%|▏ | 14346/1000000 [3:24:59<1683:33:28, 6.15s/it, lr=1e-5, step_loss=0.988]
Steps: 1%|▏ | 14347/1000000 [3:25:15<2448:31:52, 8.94s/it, lr=1e-5, step_loss=0.988][RANK-0]: Step: [14347], local_loss=0.006653369404375553, train_loss=0.03870119899511337, time_cost=5.246936559677124
+
Steps: 1%|▏ | 14347/1000000 [3:25:15<2448:31:52, 8.94s/it, lr=1e-5, step_loss=0.00665]
Steps: 1%|▏ | 14348/1000000 [3:25:20<2187:41:01, 7.99s/it, lr=1e-5, step_loss=0.00665][RANK-0]: Step: [14348], local_loss=0.035288646817207336, train_loss=0.14649274945259094, time_cost=3.256542205810547
+
Steps: 1%|▏ | 14348/1000000 [3:25:20<2187:41:01, 7.99s/it, lr=1e-5, step_loss=0.0353]
Steps: 1%|▏ | 14349/1000000 [3:25:35<2689:30:05, 9.82s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [14349], local_loss=0.4393543004989624, train_loss=0.14515012502670288, time_cost=4.0300538539886475
+
Steps: 1%|▏ | 14349/1000000 [3:25:35<2689:30:05, 9.82s/it, lr=1e-5, step_loss=0.439]
Steps: 1%|▏ | 14350/1000000 [3:25:40<2325:38:12, 8.49s/it, lr=1e-5, step_loss=0.439][RANK-0]: Step: [14350], local_loss=0.16316422820091248, train_loss=0.04247797280550003, time_cost=2.8130762577056885
+
Steps: 1%|▏ | 14350/1000000 [3:25:40<2325:38:12, 8.49s/it, lr=1e-5, step_loss=0.163]
Steps: 1%|▏ | 14351/1000000 [3:25:47<2183:40:50, 7.98s/it, lr=1e-5, step_loss=0.163][RANK-0]: Step: [14351], local_loss=0.02561384066939354, train_loss=21.049179077148438, time_cost=1.9544785022735596
+
Steps: 1%|▏ | 14351/1000000 [3:25:47<2183:40:50, 7.98s/it, lr=1e-5, step_loss=0.0256]
Steps: 1%|▏ | 14352/1000000 [3:26:00<2650:30:38, 9.68s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [14352], local_loss=0.25780755281448364, train_loss=0.0556553415954113, time_cost=5.757248401641846
+
Steps: 1%|▏ | 14352/1000000 [3:26:00<2650:30:38, 9.68s/it, lr=1e-5, step_loss=0.258]
Steps: 1%|▏ | 14353/1000000 [3:26:06<2347:14:37, 8.57s/it, lr=1e-5, step_loss=0.258][RANK-0]: Step: [14353], local_loss=0.019259952008724213, train_loss=0.0179764237254858, time_cost=1.9264633655548096
+
Steps: 1%|▏ | 14353/1000000 [3:26:06<2347:14:37, 8.57s/it, lr=1e-5, step_loss=0.0193]
Steps: 1%|▏ | 14354/1000000 [3:26:11<1989:32:48, 7.27s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [14354], local_loss=0.03430010750889778, train_loss=0.10076844692230225, time_cost=1.4987030029296875
+
Steps: 1%|▏ | 14354/1000000 [3:26:11<1989:32:48, 7.27s/it, lr=1e-5, step_loss=0.0343]
Steps: 1%|▏ | 14355/1000000 [3:26:23<2452:56:21, 8.96s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [14355], local_loss=0.03460897132754326, train_loss=0.07775217294692993, time_cost=4.233229398727417
+
Steps: 1%|▏ | 14355/1000000 [3:26:23<2452:56:21, 8.96s/it, lr=1e-5, step_loss=0.0346]
Steps: 1%|▏ | 14356/1000000 [3:26:32<2398:30:48, 8.76s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [14356], local_loss=0.015960143879055977, train_loss=0.019421255216002464, time_cost=6.549242258071899
+
Steps: 1%|▏ | 14356/1000000 [3:26:32<2398:30:48, 8.76s/it, lr=1e-5, step_loss=0.016]
Steps: 1%|▏ | 14357/1000000 [3:26:46<2836:22:14, 10.36s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [14357], local_loss=0.02479364164173603, train_loss=0.030105454847216606, time_cost=4.192545175552368
+
Steps: 1%|▏ | 14357/1000000 [3:26:46<2836:22:14, 10.36s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%|▏ | 14358/1000000 [3:26:50<2347:04:28, 8.57s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [14358], local_loss=0.016732018440961838, train_loss=0.02188676968216896, time_cost=1.5234320163726807
+
Steps: 1%|▏ | 14358/1000000 [3:26:50<2347:04:28, 8.57s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%|▏ | 14359/1000000 [3:27:03<2710:35:33, 9.90s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [14359], local_loss=0.004264086484909058, train_loss=0.06816677749156952, time_cost=4.452088832855225
+
Steps: 1%|▏ | 14359/1000000 [3:27:03<2710:35:33, 9.90s/it, lr=1e-5, step_loss=0.00426]
Steps: 1%|▏ | 14360/1000000 [3:27:11<2506:54:21, 9.16s/it, lr=1e-5, step_loss=0.00426][RANK-0]: Step: [14360], local_loss=0.007853042334318161, train_loss=20.687179565429688, time_cost=2.734675884246826
+
Steps: 1%|▏ | 14360/1000000 [3:27:11<2506:54:21, 9.16s/it, lr=1e-5, step_loss=0.00785]
Steps: 1%|▏ | 14361/1000000 [3:27:22<2672:19:09, 9.76s/it, lr=1e-5, step_loss=0.00785][RANK-0]: Step: [14361], local_loss=0.1977750062942505, train_loss=0.0836566910147667, time_cost=1.2098827362060547
+
Steps: 1%|▏ | 14361/1000000 [3:27:22<2672:19:09, 9.76s/it, lr=1e-5, step_loss=0.198]
Steps: 1%|▏ | 14362/1000000 [3:27:28<2365:22:12, 8.64s/it, lr=1e-5, step_loss=0.198][RANK-0]: Step: [14362], local_loss=0.024600956588983536, train_loss=0.021575521677732468, time_cost=3.3234682083129883
+
Steps: 1%|▏ | 14362/1000000 [3:27:28<2365:22:12, 8.64s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%|▏ | 14363/1000000 [3:27:42<2810:43:41, 10.27s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [14363], local_loss=0.0356779545545578, train_loss=0.03963273763656616, time_cost=4.839030742645264
+
Steps: 1%|▏ | 14363/1000000 [3:27:42<2810:43:41, 10.27s/it, lr=1e-5, step_loss=0.0357]
Steps: 1%|▏ | 14364/1000000 [3:27:50<2614:45:35, 9.55s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [14364], local_loss=0.006500591523945332, train_loss=0.061030901968479156, time_cost=6.441577434539795
+
Steps: 1%|▏ | 14364/1000000 [3:27:50<2614:45:35, 9.55s/it, lr=1e-5, step_loss=0.0065]
Steps: 1%|▏ | 14365/1000000 [3:27:54<2172:01:11, 7.93s/it, lr=1e-5, step_loss=0.0065][RANK-0]: Step: [14365], local_loss=0.04596655070781708, train_loss=0.2991465628147125, time_cost=1.258223056793213
+
Steps: 1%|▏ | 14365/1000000 [3:27:54<2172:01:11, 7.93s/it, lr=1e-5, step_loss=0.046]
Steps: 1%|▏ | 14366/1000000 [3:27:59<1932:23:10, 7.06s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [14366], local_loss=0.05702436715364456, train_loss=0.055179134011268616, time_cost=1.3673732280731201
+
Steps: 1%|▏ | 14366/1000000 [3:27:59<1932:23:10, 7.06s/it, lr=1e-5, step_loss=0.057]
Steps: 1%|▏ | 14367/1000000 [3:28:13<2474:09:26, 9.04s/it, lr=1e-5, step_loss=0.057][RANK-0]: Step: [14367], local_loss=0.008656129240989685, train_loss=0.0349697470664978, time_cost=4.635830879211426
+
Steps: 1%|▏ | 14367/1000000 [3:28:13<2474:09:26, 9.04s/it, lr=1e-5, step_loss=0.00866]
Steps: 1%|▏ | 14368/1000000 [3:28:18<2130:41:08, 7.78s/it, lr=1e-5, step_loss=0.00866][RANK-0]: Step: [14368], local_loss=0.01646897941827774, train_loss=0.021459437906742096, time_cost=2.030017614364624
+
Steps: 1%|▏ | 14368/1000000 [3:28:18<2130:41:08, 7.78s/it, lr=1e-5, step_loss=0.0165]
Steps: 1%|▏ | 14369/1000000 [3:28:23<1974:17:29, 7.21s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [14369], local_loss=0.006748713552951813, train_loss=0.024097410961985588, time_cost=1.783585548400879
+
Steps: 1%|▏ | 14369/1000000 [3:28:23<1974:17:29, 7.21s/it, lr=1e-5, step_loss=0.00675]
Steps: 1%|▏ | 14370/1000000 [3:28:34<2237:52:33, 8.17s/it, lr=1e-5, step_loss=0.00675][RANK-0]: Step: [14370], local_loss=0.011777265928685665, train_loss=0.15695522725582123, time_cost=2.470282554626465
+
Steps: 1%|▏ | 14370/1000000 [3:28:34<2237:52:33, 8.17s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 14371/1000000 [3:28:42<2218:08:59, 8.10s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14371], local_loss=0.006779232062399387, train_loss=0.021289166063070297, time_cost=3.8253748416900635
+
Steps: 1%|▏ | 14371/1000000 [3:28:42<2218:08:59, 8.10s/it, lr=1e-5, step_loss=0.00678]
Steps: 1%|▏ | 14372/1000000 [3:28:49<2137:20:59, 7.81s/it, lr=1e-5, step_loss=0.00678][RANK-0]: Step: [14372], local_loss=0.09971161931753159, train_loss=0.1884794384241104, time_cost=2.861685037612915
+
Steps: 1%|▏ | 14372/1000000 [3:28:49<2137:20:59, 7.81s/it, lr=1e-5, step_loss=0.0997]
Steps: 1%|▏ | 14373/1000000 [3:28:55<1985:44:06, 7.25s/it, lr=1e-5, step_loss=0.0997][RANK-0]: Step: [14373], local_loss=0.02509070746600628, train_loss=0.024265389889478683, time_cost=1.2197842597961426
+
Steps: 1%|▏ | 14373/1000000 [3:28:55<1985:44:06, 7.25s/it, lr=1e-5, step_loss=0.0251]
Steps: 1%|▏ | 14374/1000000 [3:29:05<2221:32:11, 8.11s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [14374], local_loss=0.006015895400196314, train_loss=0.06709441542625427, time_cost=2.1057844161987305
+
Steps: 1%|▏ | 14374/1000000 [3:29:05<2221:32:11, 8.11s/it, lr=1e-5, step_loss=0.00602]
Steps: 1%|▏ | 14375/1000000 [3:29:09<1924:50:48, 7.03s/it, lr=1e-5, step_loss=0.00602][RANK-0]: Step: [14375], local_loss=0.42100876569747925, train_loss=0.08389833569526672, time_cost=1.6222412586212158
+
Steps: 1%|▏ | 14375/1000000 [3:29:09<1924:50:48, 7.03s/it, lr=1e-5, step_loss=0.421]
Steps: 1%|▏ | 14376/1000000 [3:29:19<2119:57:17, 7.74s/it, lr=1e-5, step_loss=0.421][RANK-0]: Step: [14376], local_loss=0.08851013332605362, train_loss=0.12781095504760742, time_cost=1.2305750846862793
+
Steps: 1%|▏ | 14376/1000000 [3:29:19<2119:57:17, 7.74s/it, lr=1e-5, step_loss=0.0885]
Steps: 1%|▏ | 14377/1000000 [3:29:23<1839:05:18, 6.72s/it, lr=1e-5, step_loss=0.0885][RANK-0]: Step: [14377], local_loss=0.07358845323324203, train_loss=0.04480855166912079, time_cost=1.5339057445526123
+
Steps: 1%|▏ | 14377/1000000 [3:29:23<1839:05:18, 6.72s/it, lr=1e-5, step_loss=0.0736]
Steps: 1%|▏ | 14378/1000000 [3:29:33<2083:25:04, 7.61s/it, lr=1e-5, step_loss=0.0736][RANK-0]: Step: [14378], local_loss=0.01321801170706749, train_loss=0.02872757613658905, time_cost=2.232032537460327
+
Steps: 1%|▏ | 14378/1000000 [3:29:33<2083:25:04, 7.61s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%|▏ | 14379/1000000 [3:29:41<2164:00:38, 7.90s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [14379], local_loss=0.05228010192513466, train_loss=0.054965388029813766, time_cost=3.2460408210754395
+
Steps: 1%|▏ | 14379/1000000 [3:29:41<2164:00:38, 7.90s/it, lr=1e-5, step_loss=0.0523]
Steps: 1%|▏ | 14380/1000000 [3:29:49<2139:35:30, 7.81s/it, lr=1e-5, step_loss=0.0523][RANK-0]: Step: [14380], local_loss=0.005420318339020014, train_loss=0.016847850754857063, time_cost=2.8507184982299805
+
Steps: 1%|▏ | 14380/1000000 [3:29:49<2139:35:30, 7.81s/it, lr=1e-5, step_loss=0.00542]
Steps: 1%|▏ | 14381/1000000 [3:30:02<2524:22:10, 9.22s/it, lr=1e-5, step_loss=0.00542][RANK-0]: Step: [14381], local_loss=0.006001242436468601, train_loss=0.09470468014478683, time_cost=5.459389686584473
+
Steps: 1%|▏ | 14381/1000000 [3:30:02<2524:22:10, 9.22s/it, lr=1e-5, step_loss=0.006]
Steps: 1%|▏ | 14382/1000000 [3:30:11<2565:46:17, 9.37s/it, lr=1e-5, step_loss=0.006][RANK-0]: Step: [14382], local_loss=0.010567022487521172, train_loss=0.11799097806215286, time_cost=3.439460039138794
+
Steps: 1%|▏ | 14382/1000000 [3:30:11<2565:46:17, 9.37s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%|▏ | 14383/1000000 [3:30:25<2889:25:35, 10.55s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [14383], local_loss=0.0660909041762352, train_loss=37.501686096191406, time_cost=2.9995076656341553
+
Steps: 1%|▏ | 14383/1000000 [3:30:25<2889:25:35, 10.55s/it, lr=1e-5, step_loss=0.0661]
Steps: 1%|▏ | 14384/1000000 [3:30:36<2935:44:04, 10.72s/it, lr=1e-5, step_loss=0.0661][RANK-0]: Step: [14384], local_loss=0.03344650939106941, train_loss=0.023628417402505875, time_cost=8.09201955795288
+
Steps: 1%|▏ | 14384/1000000 [3:30:36<2935:44:04, 10.72s/it, lr=1e-5, step_loss=0.0334]
Steps: 1%|▏ | 14385/1000000 [3:30:42<2545:25:50, 9.30s/it, lr=1e-5, step_loss=0.0334][RANK-0]: Step: [14385], local_loss=0.06342516839504242, train_loss=0.03581395372748375, time_cost=1.7684423923492432
+
Steps: 1%|▏ | 14385/1000000 [3:30:42<2545:25:50, 9.30s/it, lr=1e-5, step_loss=0.0634]
Steps: 1%|▏ | 14386/1000000 [3:30:46<2132:13:51, 7.79s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [14386], local_loss=0.014420334249734879, train_loss=0.06825320422649384, time_cost=1.3615999221801758
+
Steps: 1%|▏ | 14386/1000000 [3:30:46<2132:13:51, 7.79s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%|▏ | 14387/1000000 [3:30:52<2000:40:04, 7.31s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [14387], local_loss=0.056319206953048706, train_loss=0.04410405457019806, time_cost=1.2300686836242676
+
Steps: 1%|▏ | 14387/1000000 [3:30:52<2000:40:04, 7.31s/it, lr=1e-5, step_loss=0.0563]
Steps: 1%|▏ | 14388/1000000 [3:31:03<2281:48:29, 8.33s/it, lr=1e-5, step_loss=0.0563][RANK-0]: Step: [14388], local_loss=0.14073075354099274, train_loss=0.0566241517663002, time_cost=3.666351079940796
+
Steps: 1%|▏ | 14388/1000000 [3:31:03<2281:48:29, 8.33s/it, lr=1e-5, step_loss=0.141]
Steps: 1%|▏ | 14389/1000000 [3:31:16<2711:14:01, 9.90s/it, lr=1e-5, step_loss=0.141][RANK-0]: Step: [14389], local_loss=0.01241995394229889, train_loss=0.020035801455378532, time_cost=11.368299007415771
+
Steps: 1%|▏ | 14389/1000000 [3:31:16<2711:14:01, 9.90s/it, lr=1e-5, step_loss=0.0124]
Steps: 1%|▏ | 14390/1000000 [3:31:24<2493:58:29, 9.11s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [14390], local_loss=0.030919570475816727, train_loss=0.022342609241604805, time_cost=2.6666364669799805
+
Steps: 1%|▏ | 14390/1000000 [3:31:24<2493:58:29, 9.11s/it, lr=1e-5, step_loss=0.0309]
Steps: 1%|▏ | 14391/1000000 [3:31:29<2209:53:36, 8.07s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [14391], local_loss=0.016654370352625847, train_loss=0.04208843410015106, time_cost=1.2600266933441162
+
Steps: 1%|▏ | 14391/1000000 [3:31:29<2209:53:36, 8.07s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%|▏ | 14392/1000000 [3:31:43<2654:26:49, 9.70s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [14392], local_loss=0.06275907903909683, train_loss=0.026073800399899483, time_cost=3.3627209663391113
+
Steps: 1%|▏ | 14392/1000000 [3:31:43<2654:26:49, 9.70s/it, lr=1e-5, step_loss=0.0628]
[training log excerpt: steps 14393–14613 of 1,000,000 (~1%), lr=1e-5, elapsed 3:31:49 → 4:05:13 at roughly 6–12 s/it. Each step logs "[RANK-0]: Step: [N], local_loss=..., train_loss=..., time_cost=...". local_loss mostly stays within 0.003–0.45; isolated spikes occur at step 14415 (local_loss=86.87) and step 14498 (local_loss=115.14), and train_loss briefly jumps to the 15–36 range at steps 14396, 14418, 14441, and 14532 before returning to baseline.]
+
Steps: 1%|▏ | 14613/1000000 [4:05:13<3108:25:00, 11.36s/it, lr=1e-5, step_loss=0.00714]
Steps: 1%|▏ | 14614/1000000 [4:05:19<2672:46:07, 9.76s/it, lr=1e-5, step_loss=0.00714][RANK-0]: Step: [14614], local_loss=0.018242528662085533, train_loss=0.06687669456005096, time_cost=1.5077264308929443
+
Steps: 1%|▏ | 14614/1000000 [4:05:19<2672:46:07, 9.76s/it, lr=1e-5, step_loss=0.0182]
Steps: 1%|▏ | 14615/1000000 [4:05:25<2364:48:23, 8.64s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [14615], local_loss=0.04698414355516434, train_loss=0.06471319496631622, time_cost=1.3699862957000732
+
Steps: 1%|▏ | 14615/1000000 [4:05:25<2364:48:23, 8.64s/it, lr=1e-5, step_loss=0.047]
Steps: 1%|▏ | 14616/1000000 [4:05:40<2918:42:26, 10.66s/it, lr=1e-5, step_loss=0.047][RANK-0]: Step: [14616], local_loss=0.011065797880291939, train_loss=0.022258898243308067, time_cost=1.2128396034240723
+
Steps: 1%|▏ | 14616/1000000 [4:05:40<2918:42:26, 10.66s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%|▏ | 14617/1000000 [4:05:54<3150:36:24, 11.51s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [14617], local_loss=0.045923955738544464, train_loss=10.797355651855469, time_cost=4.654610872268677
+
Steps: 1%|▏ | 14617/1000000 [4:05:54<3150:36:24, 11.51s/it, lr=1e-5, step_loss=0.0459]
Steps: 1%|▏ | 14618/1000000 [4:06:00<2719:26:10, 9.94s/it, lr=1e-5, step_loss=0.0459][RANK-0]: Step: [14618], local_loss=0.026851266622543335, train_loss=0.02233867719769478, time_cost=2.683739185333252
+
Steps: 1%|▏ | 14618/1000000 [4:06:00<2719:26:10, 9.94s/it, lr=1e-5, step_loss=0.0269]
Steps: 1%|▏ | 14619/1000000 [4:06:13<2964:05:39, 10.83s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [14619], local_loss=0.01221193466335535, train_loss=0.0568045899271965, time_cost=9.286746501922607
+
Steps: 1%|▏ | 14619/1000000 [4:06:13<2964:05:39, 10.83s/it, lr=1e-5, step_loss=0.0122]
Steps: 1%|▏ | 14620/1000000 [4:06:18<2460:48:40, 8.99s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [14620], local_loss=0.03159601613879204, train_loss=0.15955762565135956, time_cost=2.340930223464966
+
Steps: 1%|▏ | 14620/1000000 [4:06:18<2460:48:40, 8.99s/it, lr=1e-5, step_loss=0.0316]
Steps: 1%|▏ | 14621/1000000 [4:06:30<2730:51:35, 9.98s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [14621], local_loss=1.000211238861084, train_loss=0.18523985147476196, time_cost=3.5054943561553955
+
Steps: 1%|▏ | 14621/1000000 [4:06:30<2730:51:35, 9.98s/it, lr=1e-5, step_loss=1]
Steps: 1%|▏ | 14622/1000000 [4:06:40<2705:58:57, 9.89s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [14622], local_loss=0.05526687949895859, train_loss=0.058067213743925095, time_cost=7.943739891052246
+
Steps: 1%|▏ | 14622/1000000 [4:06:40<2705:58:57, 9.89s/it, lr=1e-5, step_loss=0.0553]
Steps: 1%|▏ | 14623/1000000 [4:06:48<2612:31:04, 9.54s/it, lr=1e-5, step_loss=0.0553][RANK-0]: Step: [14623], local_loss=0.012151745148003101, train_loss=0.030072450637817383, time_cost=1.9783411026000977
+
Steps: 1%|▏ | 14623/1000000 [4:06:48<2612:31:04, 9.54s/it, lr=1e-5, step_loss=0.0122]
Steps: 1%|▏ | 14624/1000000 [4:06:55<2389:19:22, 8.73s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [14624], local_loss=0.05533691868185997, train_loss=0.12070900201797485, time_cost=1.2060489654541016
+
Steps: 1%|▏ | 14624/1000000 [4:06:55<2389:19:22, 8.73s/it, lr=1e-5, step_loss=0.0553]
Steps: 1%|▏ | 14625/1000000 [4:07:00<2087:28:25, 7.63s/it, lr=1e-5, step_loss=0.0553][RANK-0]: Step: [14625], local_loss=0.13900598883628845, train_loss=0.037307851016521454, time_cost=1.2582499980926514
+
Steps: 1%|▏ | 14625/1000000 [4:07:00<2087:28:25, 7.63s/it, lr=1e-5, step_loss=0.139]
Steps: 1%|▏ | 14626/1000000 [4:07:12<2401:05:57, 8.77s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [14626], local_loss=0.010476316325366497, train_loss=0.04554560035467148, time_cost=2.2537052631378174
+
Steps: 1%|▏ | 14626/1000000 [4:07:12<2401:05:57, 8.77s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%|▏ | 14627/1000000 [4:07:25<2776:02:18, 10.14s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [14627], local_loss=0.00781419686973095, train_loss=0.0205075703561306, time_cost=1.2178022861480713
+
Steps: 1%|▏ | 14627/1000000 [4:07:25<2776:02:18, 10.14s/it, lr=1e-5, step_loss=0.00781]
Steps: 1%|▏ | 14628/1000000 [4:07:34<2670:42:25, 9.76s/it, lr=1e-5, step_loss=0.00781][RANK-0]: Step: [14628], local_loss=0.036636389791965485, train_loss=0.039618995040655136, time_cost=1.3373146057128906
+
Steps: 1%|▏ | 14628/1000000 [4:07:34<2670:42:25, 9.76s/it, lr=1e-5, step_loss=0.0366]
Steps: 1%|▏ | 14629/1000000 [4:07:47<2919:06:43, 10.66s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [14629], local_loss=0.005587401799857616, train_loss=29.99433135986328, time_cost=5.383854389190674
+
Steps: 1%|▏ | 14629/1000000 [4:07:47<2919:06:43, 10.66s/it, lr=1e-5, step_loss=0.00559]
Steps: 1%|▏ | 14630/1000000 [4:07:57<2889:07:30, 10.56s/it, lr=1e-5, step_loss=0.00559][RANK-0]: Step: [14630], local_loss=0.015453105792403221, train_loss=0.03490758687257767, time_cost=1.9113142490386963
+
Steps: 1%|▏ | 14630/1000000 [4:07:57<2889:07:30, 10.56s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%|▏ | 14631/1000000 [4:08:08<2882:55:19, 10.53s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [14631], local_loss=0.01981058157980442, train_loss=0.05597744882106781, time_cost=2.7339155673980713
+
Steps: 1%|▏ | 14631/1000000 [4:08:08<2882:55:19, 10.53s/it, lr=1e-5, step_loss=0.0198]
Steps: 1%|▏ | 14632/1000000 [4:08:17<2804:53:02, 10.25s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [14632], local_loss=0.011867637746036053, train_loss=0.13442592322826385, time_cost=3.662262439727783
+
Steps: 1%|▏ | 14632/1000000 [4:08:17<2804:53:02, 10.25s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%|▏ | 14633/1000000 [4:08:25<2603:25:29, 9.51s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [14633], local_loss=0.07809003442525864, train_loss=0.059613004326820374, time_cost=1.2105305194854736
+
Steps: 1%|▏ | 14633/1000000 [4:08:25<2603:25:29, 9.51s/it, lr=1e-5, step_loss=0.0781]
Steps: 1%|▏ | 14634/1000000 [4:08:34<2546:39:18, 9.30s/it, lr=1e-5, step_loss=0.0781][RANK-0]: Step: [14634], local_loss=0.012205451726913452, train_loss=0.031301189213991165, time_cost=3.8786702156066895
+
Steps: 1%|▏ | 14634/1000000 [4:08:34<2546:39:18, 9.30s/it, lr=1e-5, step_loss=0.0122]
Steps: 1%|▏ | 14635/1000000 [4:08:46<2800:15:43, 10.23s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [14635], local_loss=0.006689855828881264, train_loss=0.030309412628412247, time_cost=6.756704568862915
+
Steps: 1%|▏ | 14635/1000000 [4:08:46<2800:15:43, 10.23s/it, lr=1e-5, step_loss=0.00669]
Steps: 1%|▏ | 14636/1000000 [4:08:51<2376:15:44, 8.68s/it, lr=1e-5, step_loss=0.00669][RANK-0]: Step: [14636], local_loss=0.10386546701192856, train_loss=0.3205249607563019, time_cost=2.5002975463867188
+
Steps: 1%|▏ | 14636/1000000 [4:08:51<2376:15:44, 8.68s/it, lr=1e-5, step_loss=0.104]
Steps: 1%|▏ | 14637/1000000 [4:09:03<2637:49:47, 9.64s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [14637], local_loss=0.0072660124860703945, train_loss=0.07524718344211578, time_cost=2.8816721439361572
+
Steps: 1%|▏ | 14637/1000000 [4:09:03<2637:49:47, 9.64s/it, lr=1e-5, step_loss=0.00727]
Steps: 1%|▏ | 14638/1000000 [4:09:07<2207:14:32, 8.06s/it, lr=1e-5, step_loss=0.00727][RANK-0]: Step: [14638], local_loss=0.01060991920530796, train_loss=0.07222678512334824, time_cost=1.5962271690368652
+
Steps: 1%|▏ | 14638/1000000 [4:09:07<2207:14:32, 8.06s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%|▏ | 14639/1000000 [4:09:15<2158:37:09, 7.89s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [14639], local_loss=0.09760930389165878, train_loss=0.08279021829366684, time_cost=1.9501044750213623
+
Steps: 1%|▏ | 14639/1000000 [4:09:15<2158:37:09, 7.89s/it, lr=1e-5, step_loss=0.0976]
Steps: 1%|▏ | 14640/1000000 [4:09:27<2468:32:03, 9.02s/it, lr=1e-5, step_loss=0.0976][RANK-0]: Step: [14640], local_loss=0.9884605407714844, train_loss=0.15037192404270172, time_cost=3.1915085315704346
+
Steps: 1%|▏ | 14640/1000000 [4:09:27<2468:32:03, 9.02s/it, lr=1e-5, step_loss=0.988]
Steps: 1%|▏ | 14641/1000000 [4:09:31<2080:12:51, 7.60s/it, lr=1e-5, step_loss=0.988][RANK-0]: Step: [14641], local_loss=0.034823525696992874, train_loss=0.01942039653658867, time_cost=1.6103692054748535
+
Steps: 1%|▏ | 14641/1000000 [4:09:31<2080:12:51, 7.60s/it, lr=1e-5, step_loss=0.0348]
Steps: 1%|▏ | 14642/1000000 [4:09:43<2420:47:42, 8.84s/it, lr=1e-5, step_loss=0.0348][RANK-0]: Step: [14642], local_loss=0.004597719758749008, train_loss=0.05209633708000183, time_cost=4.094258546829224
+
Steps: 1%|▏ | 14642/1000000 [4:09:43<2420:47:42, 8.84s/it, lr=1e-5, step_loss=0.0046]
Steps: 1%|▏ | 14643/1000000 [4:09:50<2273:37:30, 8.31s/it, lr=1e-5, step_loss=0.0046][RANK-0]: Step: [14643], local_loss=0.013863603584468365, train_loss=0.019230453297495842, time_cost=2.240412473678589
+
Steps: 1%|▏ | 14643/1000000 [4:09:50<2273:37:30, 8.31s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%|▏ | 14644/1000000 [4:09:55<2069:08:49, 7.56s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [14644], local_loss=0.08735578507184982, train_loss=0.021805236116051674, time_cost=1.967777967453003
+
Steps: 1%|▏ | 14644/1000000 [4:09:55<2069:08:49, 7.56s/it, lr=1e-5, step_loss=0.0874]
Steps: 1%|▏ | 14645/1000000 [4:10:01<1926:35:40, 7.04s/it, lr=1e-5, step_loss=0.0874][RANK-0]: Step: [14645], local_loss=0.04499240592122078, train_loss=0.019352776929736137, time_cost=1.528977870941162
+
Steps: 1%|▏ | 14645/1000000 [4:10:01<1926:35:40, 7.04s/it, lr=1e-5, step_loss=0.045]
Steps: 1%|▏ | 14646/1000000 [4:10:08<1887:35:00, 6.90s/it, lr=1e-5, step_loss=0.045][RANK-0]: Step: [14646], local_loss=0.05741938576102257, train_loss=0.14595386385917664, time_cost=2.7831757068634033
+
Steps: 1%|▏ | 14646/1000000 [4:10:08<1887:35:00, 6.90s/it, lr=1e-5, step_loss=0.0574]
Steps: 1%|▏ | 14647/1000000 [4:10:19<2232:44:59, 8.16s/it, lr=1e-5, step_loss=0.0574][RANK-0]: Step: [14647], local_loss=0.22486276924610138, train_loss=0.07370555400848389, time_cost=4.236080169677734
+
Steps: 1%|▏ | 14647/1000000 [4:10:19<2232:44:59, 8.16s/it, lr=1e-5, step_loss=0.225]
Steps: 1%|▏ | 14648/1000000 [4:10:30<2469:46:58, 9.02s/it, lr=1e-5, step_loss=0.225][RANK-0]: Step: [14648], local_loss=0.03757099807262421, train_loss=0.06651546061038971, time_cost=3.8736684322357178
+
Steps: 1%|▏ | 14648/1000000 [4:10:30<2469:46:58, 9.02s/it, lr=1e-5, step_loss=0.0376]
Steps: 1%|▏ | 14649/1000000 [4:10:35<2138:38:15, 7.81s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [14649], local_loss=0.006745805032551289, train_loss=0.013475120067596436, time_cost=3.7694268226623535
+
Steps: 1%|▏ | 14649/1000000 [4:10:35<2138:38:15, 7.81s/it, lr=1e-5, step_loss=0.00675]
Steps: 1%|▏ | 14650/1000000 [4:10:43<2125:47:20, 7.77s/it, lr=1e-5, step_loss=0.00675][RANK-0]: Step: [14650], local_loss=0.011467870324850082, train_loss=0.03492097929120064, time_cost=5.223607540130615
+
Steps: 1%|▏ | 14650/1000000 [4:10:43<2125:47:20, 7.77s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%|▏ | 14651/1000000 [4:10:53<2335:31:05, 8.53s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [14651], local_loss=0.034635864198207855, train_loss=0.04407453536987305, time_cost=4.693748950958252
+
Steps: 1%|▏ | 14651/1000000 [4:10:53<2335:31:05, 8.53s/it, lr=1e-5, step_loss=0.0346]
Steps: 1%|▏ | 14652/1000000 [4:11:05<2587:55:34, 9.46s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [14652], local_loss=0.0487428717315197, train_loss=0.13077029585838318, time_cost=4.620225667953491
+
Steps: 1%|▏ | 14652/1000000 [4:11:05<2587:55:34, 9.46s/it, lr=1e-5, step_loss=0.0487]
Steps: 1%|▏ | 14653/1000000 [4:11:20<3082:56:21, 11.26s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [14653], local_loss=0.024212198331952095, train_loss=0.15147916972637177, time_cost=4.26161003112793
+
Steps: 1%|▏ | 14653/1000000 [4:11:20<3082:56:21, 11.26s/it, lr=1e-5, step_loss=0.0242]
Steps: 1%|▏ | 14654/1000000 [4:11:32<3162:44:13, 11.56s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [14654], local_loss=0.01690785586833954, train_loss=0.02770863100886345, time_cost=1.320815086364746
+
Steps: 1%|▏ | 14654/1000000 [4:11:32<3162:44:13, 11.56s/it, lr=1e-5, step_loss=0.0169]
Steps: 1%|▏ | 14655/1000000 [4:11:43<3075:01:49, 11.23s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [14655], local_loss=0.010459840297698975, train_loss=0.10140855610370636, time_cost=1.316533088684082
+
Steps: 1%|▏ | 14655/1000000 [4:11:43<3075:01:49, 11.23s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%|▏ | 14656/1000000 [4:11:54<3104:38:40, 11.34s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [14656], local_loss=0.030347373336553574, train_loss=0.046999234706163406, time_cost=6.479245901107788
+
Steps: 1%|▏ | 14656/1000000 [4:11:54<3104:38:40, 11.34s/it, lr=1e-5, step_loss=0.0303]
Steps: 1%|▏ | 14657/1000000 [4:12:03<2883:14:44, 10.53s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [14657], local_loss=0.029923900961875916, train_loss=0.1659267246723175, time_cost=2.258388042449951
+
Steps: 1%|▏ | 14657/1000000 [4:12:03<2883:14:44, 10.53s/it, lr=1e-5, step_loss=0.0299]
Steps: 1%|▏ | 14658/1000000 [4:12:15<2980:07:51, 10.89s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [14658], local_loss=0.016727862879633904, train_loss=0.041753362864255905, time_cost=2.456427574157715
+
Steps: 1%|▏ | 14658/1000000 [4:12:15<2980:07:51, 10.89s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%|▏ | 14659/1000000 [4:12:23<2771:05:49, 10.12s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [14659], local_loss=0.029675781726837158, train_loss=0.04193774238228798, time_cost=4.277524948120117
+
Steps: 1%|▏ | 14659/1000000 [4:12:23<2771:05:49, 10.12s/it, lr=1e-5, step_loss=0.0297]
Steps: 1%|▏ | 14660/1000000 [4:12:32<2692:28:25, 9.84s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [14660], local_loss=0.03367901220917702, train_loss=0.027537669986486435, time_cost=1.2994263172149658
+
Steps: 1%|▏ | 14660/1000000 [4:12:32<2692:28:25, 9.84s/it, lr=1e-5, step_loss=0.0337]
Steps: 1%|▏ | 14661/1000000 [4:12:42<2674:02:44, 9.77s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [14661], local_loss=0.012608525343239307, train_loss=0.030773350968956947, time_cost=2.0860791206359863
+
Steps: 1%|▏ | 14661/1000000 [4:12:42<2674:02:44, 9.77s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%|▏ | 14662/1000000 [4:12:49<2475:18:19, 9.04s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [14662], local_loss=0.013774355873465538, train_loss=0.02171763777732849, time_cost=2.479266405105591
+
Steps: 1%|▏ | 14662/1000000 [4:12:49<2475:18:19, 9.04s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%|▏ | 14663/1000000 [4:13:02<2770:35:46, 10.12s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [14663], local_loss=0.041670504957437515, train_loss=0.04560825973749161, time_cost=8.927667140960693
+
Steps: 1%|▏ | 14663/1000000 [4:13:02<2770:35:46, 10.12s/it, lr=1e-5, step_loss=0.0417]
Steps: 1%|▏ | 14664/1000000 [4:13:06<2283:51:56, 8.34s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [14664], local_loss=0.06967793405056, train_loss=0.14024938642978668, time_cost=1.5504729747772217
+
Steps: 1%|▏ | 14664/1000000 [4:13:06<2283:51:56, 8.34s/it, lr=1e-5, step_loss=0.0697]
Steps: 1%|▏ | 14665/1000000 [4:13:10<1957:02:33, 7.15s/it, lr=1e-5, step_loss=0.0697][RANK-0]: Step: [14665], local_loss=0.02603054977953434, train_loss=0.15593864023685455, time_cost=1.8883411884307861
+
Steps: 1%|▏ | 14665/1000000 [4:13:10<1957:02:33, 7.15s/it, lr=1e-5, step_loss=0.026]
Steps: 1%|▏ | 14666/1000000 [4:13:23<2431:57:30, 8.89s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [14666], local_loss=0.03315962105989456, train_loss=16.642147064208984, time_cost=4.747227907180786
+
Steps: 1%|▏ | 14666/1000000 [4:13:23<2431:57:30, 8.89s/it, lr=1e-5, step_loss=0.0332]
Steps: 1%|▏ | 14667/1000000 [4:13:36<2766:17:56, 10.11s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [14667], local_loss=0.05100315064191818, train_loss=0.1560850441455841, time_cost=3.229203462600708
+
Steps: 1%|▏ | 14667/1000000 [4:13:36<2766:17:56, 10.11s/it, lr=1e-5, step_loss=0.051]
Steps: 1%|▏ | 14668/1000000 [4:13:44<2530:20:21, 9.24s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [14668], local_loss=0.00665643997490406, train_loss=0.015772230923175812, time_cost=2.8561909198760986
+
Steps: 1%|▏ | 14668/1000000 [4:13:44<2530:20:21, 9.24s/it, lr=1e-5, step_loss=0.00666]
Steps: 1%|▏ | 14669/1000000 [4:13:48<2122:57:21, 7.76s/it, lr=1e-5, step_loss=0.00666][RANK-0]: Step: [14669], local_loss=0.00904964841902256, train_loss=0.022358674556016922, time_cost=1.4446923732757568
+
Steps: 1%|▏ | 14669/1000000 [4:13:48<2122:57:21, 7.76s/it, lr=1e-5, step_loss=0.00905]
Steps: 1%|▏ | 14670/1000000 [4:13:54<1971:27:00, 7.20s/it, lr=1e-5, step_loss=0.00905][RANK-0]: Step: [14670], local_loss=0.023012438789010048, train_loss=0.09013821184635162, time_cost=1.4564476013183594
+
Steps: 1%|▏ | 14670/1000000 [4:13:54<1971:27:00, 7.20s/it, lr=1e-5, step_loss=0.023]
Steps: 1%|▏ | 14671/1000000 [4:14:02<2064:47:33, 7.54s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [14671], local_loss=0.03994528204202652, train_loss=0.03136814758181572, time_cost=4.696420192718506
+
Steps: 1%|▏ | 14671/1000000 [4:14:02<2064:47:33, 7.54s/it, lr=1e-5, step_loss=0.0399]
Steps: 1%|▏ | 14672/1000000 [4:14:07<1869:47:26, 6.83s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [14672], local_loss=0.05449312925338745, train_loss=0.14480780065059662, time_cost=1.2247977256774902
+
Steps: 1%|▏ | 14672/1000000 [4:14:07<1869:47:26, 6.83s/it, lr=1e-5, step_loss=0.0545]
Steps: 1%|▏ | 14673/1000000 [4:14:18<2209:41:12, 8.07s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [14673], local_loss=0.0067075761035084724, train_loss=0.03779635205864906, time_cost=5.008602857589722
+
Steps: 1%|▏ | 14673/1000000 [4:14:18<2209:41:12, 8.07s/it, lr=1e-5, step_loss=0.00671]
Steps: 1%|▏ | 14674/1000000 [4:14:29<2456:45:39, 8.98s/it, lr=1e-5, step_loss=0.00671][RANK-0]: Step: [14674], local_loss=0.007118198089301586, train_loss=0.02661346271634102, time_cost=3.2537682056427
+
Steps: 1%|▏ | 14674/1000000 [4:14:29<2456:45:39, 8.98s/it, lr=1e-5, step_loss=0.00712]
Steps: 1%|▏ | 14675/1000000 [4:14:45<3012:54:02, 11.01s/it, lr=1e-5, step_loss=0.00712][RANK-0]: Step: [14675], local_loss=0.019374193623661995, train_loss=0.013129780068993568, time_cost=13.008808374404907
+
Steps: 1%|▏ | 14675/1000000 [4:14:45<3012:54:02, 11.01s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%|▏ | 14676/1000000 [4:14:51<2568:07:38, 9.38s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [14676], local_loss=0.02061731368303299, train_loss=0.02225489914417267, time_cost=1.5205073356628418
+
Steps: 1%|▏ | 14676/1000000 [4:14:51<2568:07:38, 9.38s/it, lr=1e-5, step_loss=0.0206]
Steps: 1%|▏ | 14677/1000000 [4:15:04<2902:00:34, 10.60s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [14677], local_loss=0.008456965908408165, train_loss=0.028937583789229393, time_cost=1.8495733737945557
+
Steps: 1%|▏ | 14677/1000000 [4:15:04<2902:00:34, 10.60s/it, lr=1e-5, step_loss=0.00846]
Steps: 1%|▏ | 14678/1000000 [4:15:18<3161:43:42, 11.55s/it, lr=1e-5, step_loss=0.00846][RANK-0]: Step: [14678], local_loss=0.018700847402215004, train_loss=0.03970469534397125, time_cost=5.397881746292114
+
Steps: 1%|▏ | 14678/1000000 [4:15:18<3161:43:42, 11.55s/it, lr=1e-5, step_loss=0.0187]
Steps: 1%|▏ | 14679/1000000 [4:15:24<2749:22:42, 10.05s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [14679], local_loss=0.2933834493160248, train_loss=0.09658661484718323, time_cost=2.2415273189544678
+
Steps: 1%|▏ | 14679/1000000 [4:15:24<2749:22:42, 10.05s/it, lr=1e-5, step_loss=0.293]
Steps: 1%|▏ | 14680/1000000 [4:15:32<2509:18:57, 9.17s/it, lr=1e-5, step_loss=0.293][RANK-0]: Step: [14680], local_loss=0.01043972373008728, train_loss=0.04361501336097717, time_cost=1.2326467037200928
+
Steps: 1%|▏ | 14680/1000000 [4:15:32<2509:18:57, 9.17s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%|▏ | 14681/1000000 [4:15:37<2165:56:07, 7.91s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [14681], local_loss=0.015951506793498993, train_loss=0.07343346625566483, time_cost=4.10580587387085
+
Steps: 1%|▏ | 14681/1000000 [4:15:37<2165:56:07, 7.91s/it, lr=1e-5, step_loss=0.016]
Steps: 1%|▏ | 14682/1000000 [4:15:50<2651:59:58, 9.69s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [14682], local_loss=0.009479801170527935, train_loss=0.04519259184598923, time_cost=1.216456413269043
+
Steps: 1%|▏ | 14682/1000000 [4:15:50<2651:59:58, 9.69s/it, lr=1e-5, step_loss=0.00948]
Steps: 1%|▏ | 14683/1000000 [4:15:55<2229:40:04, 8.15s/it, lr=1e-5, step_loss=0.00948][RANK-0]: Step: [14683], local_loss=0.013201999478042126, train_loss=0.020105700939893723, time_cost=1.7810404300689697
+
Steps: 1%|▏ | 14683/1000000 [4:15:55<2229:40:04, 8.15s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%|▏ | 14684/1000000 [4:16:05<2380:25:15, 8.70s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [14684], local_loss=0.014688118360936642, train_loss=0.04033602029085159, time_cost=1.6896491050720215
+
Steps: 1%|▏ | 14684/1000000 [4:16:05<2380:25:15, 8.70s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%|▏ | 14685/1000000 [4:16:13<2327:43:38, 8.50s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [14685], local_loss=0.004252323880791664, train_loss=0.04579472541809082, time_cost=7.147128582000732
+
Steps: 1%|▏ | 14685/1000000 [4:16:13<2327:43:38, 8.50s/it, lr=1e-5, step_loss=0.00425]
Steps: 1%|▏ | 14686/1000000 [4:16:26<2691:18:13, 9.83s/it, lr=1e-5, step_loss=0.00425][RANK-0]: Step: [14686], local_loss=0.036682650446891785, train_loss=0.05462517589330673, time_cost=1.2116577625274658
+
Steps: 1%|▏ | 14686/1000000 [4:16:26<2691:18:13, 9.83s/it, lr=1e-5, step_loss=0.0367]
Steps: 1%|▏ | 14687/1000000 [4:16:35<2602:59:14, 9.51s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [14687], local_loss=0.04279659688472748, train_loss=0.01321333460509777, time_cost=2.410400152206421
+
Steps: 1%|▏ | 14687/1000000 [4:16:35<2602:59:14, 9.51s/it, lr=1e-5, step_loss=0.0428]
Steps: 1%|▏ | 14688/1000000 [4:16:42<2401:20:58, 8.77s/it, lr=1e-5, step_loss=0.0428][RANK-0]: Step: [14688], local_loss=0.01973021775484085, train_loss=0.022813551127910614, time_cost=2.575984477996826
+
Steps: 1%|▏ | 14688/1000000 [4:16:42<2401:20:58, 8.77s/it, lr=1e-5, step_loss=0.0197]
Steps: 1%|▏ | 14689/1000000 [4:16:50<2355:45:26, 8.61s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [14689], local_loss=0.011218957602977753, train_loss=0.05575517192482948, time_cost=1.2106106281280518
+
Steps: 1%|▏ | 14689/1000000 [4:16:50<2355:45:26, 8.61s/it, lr=1e-5, step_loss=0.0112]
Steps: 1%|▏ | 14690/1000000 [4:17:04<2810:11:09, 10.27s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [14690], local_loss=0.08993136137723923, train_loss=0.19618883728981018, time_cost=5.50937819480896
+
Steps: 1%|▏ | 14690/1000000 [4:17:04<2810:11:09, 10.27s/it, lr=1e-5, step_loss=0.0899]
Steps: 1%|▏ | 14691/1000000 [4:17:09<2353:24:29, 8.60s/it, lr=1e-5, step_loss=0.0899][RANK-0]: Step: [14691], local_loss=0.012896453961730003, train_loss=0.01084536500275135, time_cost=1.904770851135254
+
Steps: 1%|▏ | 14691/1000000 [4:17:09<2353:24:29, 8.60s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%|▏ | 14692/1000000 [4:17:16<2211:38:13, 8.08s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [14692], local_loss=0.014731056988239288, train_loss=0.035565510392189026, time_cost=2.245335340499878
+
Steps: 1%|▏ | 14692/1000000 [4:17:16<2211:38:13, 8.08s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%|▏ | 14693/1000000 [4:17:31<2809:50:44, 10.27s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [14693], local_loss=0.12480141967535019, train_loss=0.06034925580024719, time_cost=11.657625436782837
+
Steps: 1%|▏ | 14693/1000000 [4:17:31<2809:50:44, 10.27s/it, lr=1e-5, step_loss=0.125]
Steps: 1%|▏ | 14694/1000000 [4:17:42<2862:39:36, 10.46s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [14694], local_loss=0.07062393426895142, train_loss=0.03577963262796402, time_cost=2.530029058456421
+
Steps: 1%|▏ | 14694/1000000 [4:17:42<2862:39:36, 10.46s/it, lr=1e-5, step_loss=0.0706]
Steps: 1%|▏ | 14695/1000000 [4:17:54<2980:14:56, 10.89s/it, lr=1e-5, step_loss=0.0706][RANK-0]: Step: [14695], local_loss=0.013449020683765411, train_loss=0.0174548476934433, time_cost=2.9132802486419678
+
Steps: 1%|▏ | 14695/1000000 [4:17:54<2980:14:56, 10.89s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%|▏ | 14696/1000000 [4:18:00<2627:45:48, 9.60s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [14696], local_loss=0.012743824161589146, train_loss=0.030202604830265045, time_cost=2.538081169128418
+
Steps: 1%|▏ | 14696/1000000 [4:18:00<2627:45:48, 9.60s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%|▏ | 14697/1000000 [4:18:06<2339:20:49, 8.55s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [14697], local_loss=0.1346888393163681, train_loss=0.037418290972709656, time_cost=2.090273857116699
+
Steps: 1%|▏ | 14697/1000000 [4:18:06<2339:20:49, 8.55s/it, lr=1e-5, step_loss=0.135]
Steps: 1%|▏ | 14698/1000000 [4:18:18<2555:40:15, 9.34s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [14698], local_loss=0.033116042613983154, train_loss=0.06770898401737213, time_cost=1.8584625720977783
+
Steps: 1%|▏ | 14698/1000000 [4:18:18<2555:40:15, 9.34s/it, lr=1e-5, step_loss=0.0331]
Steps: 1%|▏ | 14699/1000000 [4:18:22<2137:46:21, 7.81s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [14699], local_loss=1.0140783786773682, train_loss=0.1526990532875061, time_cost=1.3580305576324463
+
Steps: 1%|▏ | 14699/1000000 [4:18:22<2137:46:21, 7.81s/it, lr=1e-5, step_loss=1.01]
Steps: 1%|▏ | 14700/1000000 [4:18:26<1865:38:20, 6.82s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [14700], local_loss=0.060536306351423264, train_loss=0.05989828705787659, time_cost=1.7722680568695068
+
Steps: 1%|▏ | 14700/1000000 [4:18:26<1865:38:20, 6.82s/it, lr=1e-5, step_loss=0.0605]
Steps: 1%|▏ | 14701/1000000 [4:18:38<2220:50:37, 8.11s/it, lr=1e-5, step_loss=0.0605][RANK-0]: Step: [14701], local_loss=0.006368828006088734, train_loss=0.11250264942646027, time_cost=3.1074962615966797
+
Steps: 1%|▏ | 14701/1000000 [4:18:38<2220:50:37, 8.11s/it, lr=1e-5, step_loss=0.00637]
Steps: 1%|▏ | 14702/1000000 [4:18:45<2188:28:28, 8.00s/it, lr=1e-5, step_loss=0.00637][RANK-0]: Step: [14702], local_loss=0.008616430684924126, train_loss=0.012555585242807865, time_cost=2.972393035888672
+
Steps: 1%|▏ | 14702/1000000 [4:18:45<2188:28:28, 8.00s/it, lr=1e-5, step_loss=0.00862]
Steps: 1%|▏ | 14703/1000000 [4:18:50<1959:08:52, 7.16s/it, lr=1e-5, step_loss=0.00862][RANK-0]: Step: [14703], local_loss=0.03501215949654579, train_loss=0.0896555483341217, time_cost=1.9108788967132568
+
Steps: 1%|▏ | 14703/1000000 [4:18:50<1959:08:52, 7.16s/it, lr=1e-5, step_loss=0.035]
Steps: 1%|▏ | 14704/1000000 [4:19:01<2220:39:54, 8.11s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [14704], local_loss=0.0046443757601082325, train_loss=0.023621059954166412, time_cost=1.4807963371276855
+
Steps: 1%|▏ | 14704/1000000 [4:19:01<2220:39:54, 8.11s/it, lr=1e-5, step_loss=0.00464]
Steps: 1%|▏ | 14705/1000000 [4:19:05<1907:07:36, 6.97s/it, lr=1e-5, step_loss=0.00464][RANK-0]: Step: [14705], local_loss=0.019473567605018616, train_loss=0.027768343687057495, time_cost=1.2315361499786377
+
Steps: 1%|▏ | 14705/1000000 [4:19:05<1907:07:36, 6.97s/it, lr=1e-5, step_loss=0.0195]
Steps: 1%|▏ | 14706/1000000 [4:19:20<2528:11:15, 9.24s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [14706], local_loss=0.022238856181502342, train_loss=0.0715840756893158, time_cost=5.931665420532227
+
Steps: 1%|▏ | 14706/1000000 [4:19:20<2528:11:15, 9.24s/it, lr=1e-5, step_loss=0.0222]
Steps: 1%|▏ | 14707/1000000 [4:19:30<2646:10:33, 9.67s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [14707], local_loss=0.008103846572339535, train_loss=0.020859399810433388, time_cost=2.1177561283111572
+
Steps: 1%|▏ | 14707/1000000 [4:19:30<2646:10:33, 9.67s/it, lr=1e-5, step_loss=0.0081]
Steps: 1%|▏ | 14708/1000000 [4:19:37<2427:47:09, 8.87s/it, lr=1e-5, step_loss=0.0081][RANK-0]: Step: [14708], local_loss=0.04074040427803993, train_loss=0.05517875775694847, time_cost=3.1593050956726074
+
Steps: 1%|▏ | 14708/1000000 [4:19:37<2427:47:09, 8.87s/it, lr=1e-5, step_loss=0.0407]
Steps: 1%|▏ | 14709/1000000 [4:19:43<2159:42:21, 7.89s/it, lr=1e-5, step_loss=0.0407][RANK-0]: Step: [14709], local_loss=0.31085333228111267, train_loss=0.060867805033922195, time_cost=2.291849374771118
+
Steps: 1%|▏ | 14709/1000000 [4:19:43<2159:42:21, 7.89s/it, lr=1e-5, step_loss=0.311]
Steps: 1%|▏ | 14710/1000000 [4:19:52<2241:12:12, 8.19s/it, lr=1e-5, step_loss=0.311][RANK-0]: Step: [14710], local_loss=0.016805298626422882, train_loss=0.042318589985370636, time_cost=2.6097726821899414
+
Steps: 1%|▏ | 14710/1000000 [4:19:52<2241:12:12, 8.19s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%|▏ | 14711/1000000 [4:20:04<2548:45:44, 9.31s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [14711], local_loss=0.012807956896722317, train_loss=0.13061562180519104, time_cost=4.615823984146118
+
Steps: 1%|▏ | 14711/1000000 [4:20:04<2548:45:44, 9.31s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%|▏ | 14712/1000000 [4:20:20<3086:16:33, 11.28s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [14712], local_loss=0.0067066471092402935, train_loss=0.018618889153003693, time_cost=1.196645975112915
+
Steps: 1%|▏ | 14712/1000000 [4:20:20<3086:16:33, 11.28s/it, lr=1e-5, step_loss=0.00671]
Steps: 1%|▏ | 14713/1000000 [4:20:35<3451:30:38, 12.61s/it, lr=1e-5, step_loss=0.00671][RANK-0]: Step: [14713], local_loss=0.008092278614640236, train_loss=0.019206978380680084, time_cost=6.926741361618042
+
Steps: 1%|▏ | 14713/1000000 [4:20:35<3451:30:38, 12.61s/it, lr=1e-5, step_loss=0.00809]
Steps: 1%|▏ | 14714/1000000 [4:20:40<2831:09:19, 10.34s/it, lr=1e-5, step_loss=0.00809][RANK-0]: Step: [14714], local_loss=0.007643518038094044, train_loss=0.05313144624233246, time_cost=2.1770784854888916
+
Steps: 1%|▏ | 14714/1000000 [4:20:40<2831:09:19, 10.34s/it, lr=1e-5, step_loss=0.00764]
Steps: 1%|▏ | 14715/1000000 [4:20:46<2483:58:48, 9.08s/it, lr=1e-5, step_loss=0.00764][RANK-0]: Step: [14715], local_loss=0.007377718575298786, train_loss=0.027723072096705437, time_cost=1.9326410293579102
+
Steps: 1%|▏ | 14715/1000000 [4:20:46<2483:58:48, 9.08s/it, lr=1e-5, step_loss=0.00738]
Steps: 1%|▏ | 14716/1000000 [4:21:01<2912:38:38, 10.64s/it, lr=1e-5, step_loss=0.00738][RANK-0]: Step: [14716], local_loss=0.007171782199293375, train_loss=0.07929309457540512, time_cost=2.4410157203674316
+
Steps: 1%|▏ | 14716/1000000 [4:21:01<2912:38:38, 10.64s/it, lr=1e-5, step_loss=0.00717]
Steps: 1%|▏ | 14717/1000000 [4:21:08<2613:47:23, 9.55s/it, lr=1e-5, step_loss=0.00717][RANK-0]: Step: [14717], local_loss=0.011380704119801521, train_loss=17.937610626220703, time_cost=1.5942082405090332
+
Steps: 1%|▏ | 14717/1000000 [4:21:08<2613:47:23, 9.55s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%|▏ | 14718/1000000 [4:21:14<2315:08:03, 8.46s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [14718], local_loss=0.01372694130986929, train_loss=0.037289105355739594, time_cost=1.3264992237091064
+
Steps: 1%|▏ | 14718/1000000 [4:21:14<2315:08:03, 8.46s/it, lr=1e-5, step_loss=0.0137]
Steps: 1%|▏ | 14719/1000000 [4:21:28<2803:04:16, 10.24s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [14719], local_loss=0.3315747380256653, train_loss=0.07087769359350204, time_cost=2.6644623279571533
+
Steps: 1%|▏ | 14719/1000000 [4:21:28<2803:04:16, 10.24s/it, lr=1e-5, step_loss=0.332]
Steps: 1%|▏ | 14720/1000000 [4:21:35<2519:19:14, 9.21s/it, lr=1e-5, step_loss=0.332][RANK-0]: Step: [14720], local_loss=0.011948846280574799, train_loss=0.012041466310620308, time_cost=1.2257258892059326
+
Steps: 1%|▏ | 14720/1000000 [4:21:35<2519:19:14, 9.21s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%|▏ | 14721/1000000 [4:21:49<2893:50:34, 10.57s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [14721], local_loss=0.006196950096637011, train_loss=0.01943599432706833, time_cost=4.599635362625122
+
Steps: 1%|▏ | 14721/1000000 [4:21:49<2893:50:34, 10.57s/it, lr=1e-5, step_loss=0.0062]
Steps: 1%|▏ | 14722/1000000 [4:21:53<2377:17:07, 8.69s/it, lr=1e-5, step_loss=0.0062][RANK-0]: Step: [14722], local_loss=0.019143303856253624, train_loss=0.028703469783067703, time_cost=1.4418199062347412
+
Steps: 1%|▏ | 14722/1000000 [4:21:53<2377:17:07, 8.69s/it, lr=1e-5, step_loss=0.0191]
Steps: 1%|▏ | 14723/1000000 [4:22:09<2954:41:47, 10.80s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [14723], local_loss=0.052869364619255066, train_loss=0.04422840476036072, time_cost=7.0427086353302
+
Steps: 1%|▏ | 14723/1000000 [4:22:09<2954:41:47, 10.80s/it, lr=1e-5, step_loss=0.0529]
Steps: 1%|▏ | 14724/1000000 [4:22:18<2804:05:24, 10.25s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [14724], local_loss=0.3453312814235687, train_loss=0.07253323495388031, time_cost=3.080416679382324
+
Steps: 1%|▏ | 14724/1000000 [4:22:18<2804:05:24, 10.25s/it, lr=1e-5, step_loss=0.345]
Steps: 1%|▏ | 14725/1000000 [4:22:22<2334:23:43, 8.53s/it, lr=1e-5, step_loss=0.345][RANK-0]: Step: [14725], local_loss=0.0645306408405304, train_loss=0.037890076637268066, time_cost=1.4964184761047363
+
Steps: 1%|▏ | 14725/1000000 [4:22:22<2334:23:43, 8.53s/it, lr=1e-5, step_loss=0.0645]
Steps: 1%|▏ | 14726/1000000 [4:22:32<2462:14:33, 9.00s/it, lr=1e-5, step_loss=0.0645][RANK-0]: Step: [14726], local_loss=0.06530415266752243, train_loss=0.22863152623176575, time_cost=8.215357780456543
+
Steps: 1%|▏ | 14726/1000000 [4:22:32<2462:14:33, 9.00s/it, lr=1e-5, step_loss=0.0653]
Steps: 1%|▏ | 14727/1000000 [4:22:45<2782:04:36, 10.17s/it, lr=1e-5, step_loss=0.0653][RANK-0]: Step: [14727], local_loss=0.05102715641260147, train_loss=0.07572601735591888, time_cost=4.617207288742065
+
Steps: 1%|▏ | 14727/1000000 [4:22:45<2782:04:36, 10.17s/it, lr=1e-5, step_loss=0.051]
Steps: 1%|▏ | 14728/1000000 [4:22:50<2360:36:08, 8.63s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [14728], local_loss=0.018326934427022934, train_loss=0.024676915258169174, time_cost=2.3060617446899414
+
Steps: 1%|▏ | 14728/1000000 [4:22:50<2360:36:08, 8.63s/it, lr=1e-5, step_loss=0.0183]
Steps: 1%|▏ | 14729/1000000 [4:23:01<2558:01:27, 9.35s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [14729], local_loss=0.006772896274924278, train_loss=0.09560677409172058, time_cost=3.519225835800171
+
Steps: 1%|▏ | 14729/1000000 [4:23:01<2558:01:27, 9.35s/it, lr=1e-5, step_loss=0.00677]
Steps: 1%|▏ | 14730/1000000 [4:23:09<2410:01:24, 8.81s/it, lr=1e-5, step_loss=0.00677][RANK-0]: Step: [14730], local_loss=0.02275172248482704, train_loss=0.011686909943819046, time_cost=2.393359899520874
+
Steps: 1%|▏ | 14730/1000000 [4:23:09<2410:01:24, 8.81s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%|▏ | 14731/1000000 [4:23:14<2101:37:07, 7.68s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [14731], local_loss=0.023799583315849304, train_loss=0.02675991877913475, time_cost=2.536607027053833
+
Steps: 1%|▏ | 14731/1000000 [4:23:14<2101:37:07, 7.68s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%|▏ | 14732/1000000 [4:23:21<2081:44:29, 7.61s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [14732], local_loss=0.040172889828681946, train_loss=0.04837510734796524, time_cost=2.9944920539855957
+
Steps: 1%|▏ | 14732/1000000 [4:23:21<2081:44:29, 7.61s/it, lr=1e-5, step_loss=0.0402]
Steps: 1%|▏ | 14733/1000000 [4:23:28<2005:50:47, 7.33s/it, lr=1e-5, step_loss=0.0402][RANK-0]: Step: [14733], local_loss=0.06477022171020508, train_loss=0.05368828773498535, time_cost=2.5676004886627197
+
Steps: 1%|▏ | 14733/1000000 [4:23:28<2005:50:47, 7.33s/it, lr=1e-5, step_loss=0.0648]
Steps: 1%|▏ | 14734/1000000 [4:23:33<1847:48:05, 6.75s/it, lr=1e-5, step_loss=0.0648][RANK-0]: Step: [14734], local_loss=0.011593511328101158, train_loss=0.08908215910196304, time_cost=2.337218999862671
+
Steps: 1%|▏ | 14734/1000000 [4:23:33<1847:48:05, 6.75s/it, lr=1e-5, step_loss=0.0116]
Steps: 1%|▏ | 14735/1000000 [4:23:38<1690:35:48, 6.18s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [14735], local_loss=0.05721653997898102, train_loss=0.0359342023730278, time_cost=1.8034489154815674
+
Steps: 1%|▏ | 14735/1000000 [4:23:38<1690:35:48, 6.18s/it, lr=1e-5, step_loss=0.0572]
Steps: 1%|▏ | 14736/1000000 [4:23:51<2273:35:08, 8.31s/it, lr=1e-5, step_loss=0.0572][RANK-0]: Step: [14736], local_loss=0.04444913938641548, train_loss=0.05178293213248253, time_cost=3.646395444869995
+
Steps: 1%|▏ | 14736/1000000 [4:23:51<2273:35:08, 8.31s/it, lr=1e-5, step_loss=0.0444]
Steps: 1%|▏ | 14737/1000000 [4:23:59<2238:32:45, 8.18s/it, lr=1e-5, step_loss=0.0444][RANK-0]: Step: [14737], local_loss=0.006248354911804199, train_loss=0.0403190553188324, time_cost=5.130163669586182
+
Steps: 1%|▏ | 14737/1000000 [4:23:59<2238:32:45, 8.18s/it, lr=1e-5, step_loss=0.00625]
Steps: 1%|▏ | 14738/1000000 [4:24:10<2432:27:13, 8.89s/it, lr=1e-5, step_loss=0.00625][RANK-0]: Step: [14738], local_loss=0.18905027210712433, train_loss=0.09211963415145874, time_cost=1.301830768585205
+
Steps: 1%|▏ | 14738/1000000 [4:24:10<2432:27:13, 8.89s/it, lr=1e-5, step_loss=0.189]
Steps: 1%|▏ | 14739/1000000 [4:24:16<2195:36:26, 8.02s/it, lr=1e-5, step_loss=0.189][RANK-0]: Step: [14739], local_loss=0.009104305878281593, train_loss=0.02324623614549637, time_cost=3.1777141094207764
+
Steps: 1%|▏ | 14739/1000000 [4:24:16<2195:36:26, 8.02s/it, lr=1e-5, step_loss=0.0091]
Steps: 1%|▏ | 14740/1000000 [4:24:29<2583:53:43, 9.44s/it, lr=1e-5, step_loss=0.0091][RANK-0]: Step: [14740], local_loss=0.017236433923244476, train_loss=0.018276847898960114, time_cost=2.9021553993225098
+
Steps: 1%|▏ | 14740/1000000 [4:24:29<2583:53:43, 9.44s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%|▏ | 14741/1000000 [4:24:37<2479:09:58, 9.06s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [14741], local_loss=0.024645961821079254, train_loss=0.026179920881986618, time_cost=1.2180664539337158
+
Steps: 1%|▏ | 14741/1000000 [4:24:37<2479:09:58, 9.06s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%|▏ | 14742/1000000 [4:24:49<2766:11:55, 10.11s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [14742], local_loss=0.5124877691268921, train_loss=0.09734906256198883, time_cost=5.534940958023071
+
Steps: 1%|▏ | 14742/1000000 [4:24:49<2766:11:55, 10.11s/it, lr=1e-5, step_loss=0.512]
Steps: 1%|▏ | 14743/1000000 [4:25:06<3308:03:33, 12.09s/it, lr=1e-5, step_loss=0.512][RANK-0]: Step: [14743], local_loss=0.031570885330438614, train_loss=0.0648689717054367, time_cost=9.71142292022705
+
Steps: 1%|▏ | 14743/1000000 [4:25:06<3308:03:33, 12.09s/it, lr=1e-5, step_loss=0.0316]
Steps: 1%|▏ | 14744/1000000 [4:25:19<3387:56:36, 12.38s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [14744], local_loss=0.02015490084886551, train_loss=0.020533312112092972, time_cost=1.2298436164855957
+
Steps: 1%|▏ | 14744/1000000 [4:25:19<3387:56:36, 12.38s/it, lr=1e-5, step_loss=0.0202]
Steps: 1%|▏ | 14745/1000000 [4:25:30<3253:08:11, 11.89s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [14745], local_loss=0.01922481134533882, train_loss=0.01325689535588026, time_cost=5.503614664077759
+
Steps: 1%|▏ | 14745/1000000 [4:25:30<3253:08:11, 11.89s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%|▏ | 14746/1000000 [4:25:43<3335:44:21, 12.19s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [14746], local_loss=0.017090486362576485, train_loss=0.19434700906276703, time_cost=4.369600772857666
+
Steps: 1%|▏ | 14746/1000000 [4:25:43<3335:44:21, 12.19s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%|▏ | 14747/1000000 [4:25:58<3587:13:54, 13.11s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [14747], local_loss=0.008012808859348297, train_loss=0.016194578260183334, time_cost=6.8438568115234375
+
Steps: 1%|▏ | 14747/1000000 [4:25:58<3587:13:54, 13.11s/it, lr=1e-5, step_loss=0.00801]
Steps: 1%|▏ | 14748/1000000 [4:26:07<3273:11:23, 11.96s/it, lr=1e-5, step_loss=0.00801][RANK-0]: Step: [14748], local_loss=0.011242726817727089, train_loss=0.08539408445358276, time_cost=1.260352373123169
+
Steps: 1%|▏ | 14748/1000000 [4:26:07<3273:11:23, 11.96s/it, lr=1e-5, step_loss=0.0112]
Steps: 1%|▏ | 14749/1000000 [4:26:17<3129:36:06, 11.44s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [14749], local_loss=0.41824159026145935, train_loss=0.15514610707759857, time_cost=3.1133220195770264
+
Steps: 1%|▏ | 14749/1000000 [4:26:17<3129:36:06, 11.44s/it, lr=1e-5, step_loss=0.418]
Steps: 1%|▏ | 14750/1000000 [4:26:25<2809:08:27, 10.26s/it, lr=1e-5, step_loss=0.418][RANK-0]: Step: [14750], local_loss=0.0099679846316576, train_loss=0.033337242901325226, time_cost=1.6044728755950928
+
Steps: 1%|▏ | 14750/1000000 [4:26:25<2809:08:27, 10.26s/it, lr=1e-5, step_loss=0.00997]
Steps: 1%|▏ | 14751/1000000 [4:26:30<2370:40:01, 8.66s/it, lr=1e-5, step_loss=0.00997][RANK-0]: Step: [14751], local_loss=0.007220172323286533, train_loss=0.05234305560588837, time_cost=1.2077817916870117
+
Steps: 1%|▏ | 14751/1000000 [4:26:30<2370:40:01, 8.66s/it, lr=1e-5, step_loss=0.00722]
Steps: 1%|▏ | 14752/1000000 [4:26:37<2259:58:05, 8.26s/it, lr=1e-5, step_loss=0.00722][RANK-0]: Step: [14752], local_loss=0.06638146936893463, train_loss=0.14995577931404114, time_cost=3.1686830520629883
+
Steps: 1%|▏ | 14752/1000000 [4:26:37<2259:58:05, 8.26s/it, lr=1e-5, step_loss=0.0664]
Steps: 1%|▏ | 14753/1000000 [4:26:47<2402:18:46, 8.78s/it, lr=1e-5, step_loss=0.0664][RANK-0]: Step: [14753], local_loss=0.010871435515582561, train_loss=0.04185755178332329, time_cost=1.9342014789581299
+
Steps: 1%|▏ | 14753/1000000 [4:26:47<2402:18:46, 8.78s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%|▏ | 14754/1000000 [4:26:58<2595:57:24, 9.49s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [14754], local_loss=0.038641925901174545, train_loss=0.07719317078590393, time_cost=3.724165439605713
+
Steps: 1%|▏ | 14754/1000000 [4:26:58<2595:57:24, 9.49s/it, lr=1e-5, step_loss=0.0386]
Steps: 1%|▏ | 14755/1000000 [4:27:09<2695:20:16, 9.85s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [14755], local_loss=0.2588414251804352, train_loss=0.061907801777124405, time_cost=1.6758460998535156
+
Steps: 1%|▏ | 14755/1000000 [4:27:09<2695:20:16, 9.85s/it, lr=1e-5, step_loss=0.259]
Steps: 1%|▏ | 14756/1000000 [4:27:15<2346:25:36, 8.57s/it, lr=1e-5, step_loss=0.259][RANK-0]: Step: [14756], local_loss=0.007079081144183874, train_loss=0.17632406949996948, time_cost=4.526883363723755
+
Steps: 1%|▏ | 14756/1000000 [4:27:15<2346:25:36, 8.57s/it, lr=1e-5, step_loss=0.00708]
Steps: 1%|▏ | 14757/1000000 [4:27:25<2529:05:57, 9.24s/it, lr=1e-5, step_loss=0.00708][RANK-0]: Step: [14757], local_loss=0.054147519171237946, train_loss=0.03315373882651329, time_cost=2.2659389972686768
+
Steps: 1%|▏ | 14757/1000000 [4:27:25<2529:05:57, 9.24s/it, lr=1e-5, step_loss=0.0541]
Steps: 1%|▏ | 14758/1000000 [4:27:35<2585:46:09, 9.45s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [14758], local_loss=0.027234073728322983, train_loss=0.059967316687107086, time_cost=4.172126054763794
+
Steps: 1%|▏ | 14758/1000000 [4:27:35<2585:46:09, 9.45s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%|▏ | 14759/1000000 [4:27:40<2175:49:39, 7.95s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [14759], local_loss=0.023570094257593155, train_loss=0.028712430968880653, time_cost=1.5836458206176758
+
Steps: 1%|▏ | 14759/1000000 [4:27:40<2175:49:39, 7.95s/it, lr=1e-5, step_loss=0.0236]
Steps: 1%|▏ | 14760/1000000 [4:27:44<1871:56:53, 6.84s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [14760], local_loss=0.03781456500291824, train_loss=0.02861412800848484, time_cost=1.2435178756713867
+
Steps: 1%|▏ | 14760/1000000 [4:27:44<1871:56:53, 6.84s/it, lr=1e-5, step_loss=0.0378]
Steps: 1%|▏ | 14761/1000000 [4:27:49<1719:48:49, 6.28s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [14761], local_loss=0.006679291371256113, train_loss=0.034931331872940063, time_cost=1.878692388534546
+
Steps: 1%|▏ | 14761/1000000 [4:27:49<1719:48:49, 6.28s/it, lr=1e-5, step_loss=0.00668]
Steps: 1%|▏ | 14762/1000000 [4:27:56<1786:15:30, 6.53s/it, lr=1e-5, step_loss=0.00668][RANK-0]: Step: [14762], local_loss=0.016587339341640472, train_loss=0.03809472918510437, time_cost=1.5256938934326172
+
Steps: 1%|▏ | 14762/1000000 [4:27:56<1786:15:30, 6.53s/it, lr=1e-5, step_loss=0.0166]
Steps: 1%|▏ | 14763/1000000 [4:28:04<1886:59:54, 6.89s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [14763], local_loss=0.03556475788354874, train_loss=0.04630544036626816, time_cost=3.685504913330078
+
Steps: 1%|▏ | 14763/1000000 [4:28:04<1886:59:54, 6.89s/it, lr=1e-5, step_loss=0.0356]
Steps: 1%|▏ | 14764/1000000 [4:28:17<2413:38:18, 8.82s/it, lr=1e-5, step_loss=0.0356][RANK-0]: Step: [14764], local_loss=0.02948298305273056, train_loss=0.15300095081329346, time_cost=3.920696973800659
+
Steps: 1%|▏ | 14764/1000000 [4:28:17<2413:38:18, 8.82s/it, lr=1e-5, step_loss=0.0295]
Steps: 1%|▏ | 14765/1000000 [4:28:30<2715:30:08, 9.92s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [14765], local_loss=0.012526502832770348, train_loss=0.01674320548772812, time_cost=2.968496561050415
+
Steps: 1%|▏ | 14765/1000000 [4:28:30<2715:30:08, 9.92s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%|▏ | 14766/1000000 [4:28:39<2679:11:04, 9.79s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [14766], local_loss=0.0484674833714962, train_loss=0.0439952090382576, time_cost=6.355089902877808
+
Steps: 1%|▏ | 14766/1000000 [4:28:39<2679:11:04, 9.79s/it, lr=1e-5, step_loss=0.0485]
Steps: 1%|▏ | 14767/1000000 [4:28:47<2556:09:48, 9.34s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [14767], local_loss=0.0194169282913208, train_loss=0.03641030937433243, time_cost=1.214000940322876
+
Steps: 1%|▏ | 14767/1000000 [4:28:47<2556:09:48, 9.34s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%|▏ | 14768/1000000 [4:28:52<2154:31:59, 7.87s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [14768], local_loss=0.008644084446132183, train_loss=7.058232307434082, time_cost=1.331955909729004
+
Steps: 1%|▏ | 14768/1000000 [4:28:52<2154:31:59, 7.87s/it, lr=1e-5, step_loss=0.00864]
Steps: 1%|▏ | 14769/1000000 [4:29:01<2267:48:22, 8.29s/it, lr=1e-5, step_loss=0.00864][RANK-0]: Step: [14769], local_loss=0.07727541029453278, train_loss=0.035246506333351135, time_cost=3.452263116836548
+
Steps: 1%|▏ | 14769/1000000 [4:29:01<2267:48:22, 8.29s/it, lr=1e-5, step_loss=0.0773]
Steps: 1%|▏ | 14770/1000000 [4:29:17<2861:35:03, 10.46s/it, lr=1e-5, step_loss=0.0773][RANK-0]: Step: [14770], local_loss=0.06920063495635986, train_loss=0.04885569587349892, time_cost=6.952779293060303
+
Steps: 1%|▏ | 14770/1000000 [4:29:17<2861:35:03, 10.46s/it, lr=1e-5, step_loss=0.0692]
Steps: 1%|▏ | 14771/1000000 [4:29:30<3117:10:31, 11.39s/it, lr=1e-5, step_loss=0.0692][RANK-0]: Step: [14771], local_loss=0.007162939291447401, train_loss=0.016770139336586, time_cost=3.999751329421997
+
Steps: 1%|▏ | 14771/1000000 [4:29:30<3117:10:31, 11.39s/it, lr=1e-5, step_loss=0.00716]
Steps: 1%|▏ | 14772/1000000 [4:29:37<2740:05:45, 10.01s/it, lr=1e-5, step_loss=0.00716][RANK-0]: Step: [14772], local_loss=0.008420668542385101, train_loss=8.098228454589844, time_cost=2.69419264793396
+
Steps: 1%|▏ | 14772/1000000 [4:29:37<2740:05:45, 10.01s/it, lr=1e-5, step_loss=0.00842]
Steps: 1%|▏ | 14773/1000000 [4:29:42<2330:32:18, 8.52s/it, lr=1e-5, step_loss=0.00842][RANK-0]: Step: [14773], local_loss=0.00513613922521472, train_loss=0.017110681161284447, time_cost=2.0475590229034424
+
Steps: 1%|▏ | 14773/1000000 [4:29:42<2330:32:18, 8.52s/it, lr=1e-5, step_loss=0.00514]
Steps: 1%|▏ | 14774/1000000 [4:29:53<2527:18:28, 9.23s/it, lr=1e-5, step_loss=0.00514][RANK-0]: Step: [14774], local_loss=0.037232983857393265, train_loss=0.14993226528167725, time_cost=1.2119042873382568
+
Steps: 1%|▏ | 14774/1000000 [4:29:53<2527:18:28, 9.23s/it, lr=1e-5, step_loss=0.0372]
Steps: 1%|▏ | 14775/1000000 [4:30:06<2858:58:02, 10.45s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [14775], local_loss=0.038351304829120636, train_loss=0.028034694492816925, time_cost=1.2315614223480225
+
Steps: 1%|▏ | 14775/1000000 [4:30:06<2858:58:02, 10.45s/it, lr=1e-5, step_loss=0.0384]
Steps: 1%|▏ | 14776/1000000 [4:30:12<2488:34:27, 9.09s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [14776], local_loss=0.010765165090560913, train_loss=0.01721728779375553, time_cost=1.9841575622558594
+
Steps: 1%|▏ | 14776/1000000 [4:30:12<2488:34:27, 9.09s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%|▏ | 14777/1000000 [4:30:28<3036:31:29, 11.10s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [14777], local_loss=0.033903785049915314, train_loss=0.029913734644651413, time_cost=8.550203323364258
+
Steps: 1%|▏ | 14777/1000000 [4:30:28<3036:31:29, 11.10s/it, lr=1e-5, step_loss=0.0339]
Steps: 1%|▏ | 14778/1000000 [4:30:44<3413:04:22, 12.47s/it, lr=1e-5, step_loss=0.0339][RANK-0]: Step: [14778], local_loss=0.010332545265555382, train_loss=0.12545551359653473, time_cost=4.693894147872925
+
Steps: 1%|▏ | 14778/1000000 [4:30:44<3413:04:22, 12.47s/it, lr=1e-5, step_loss=0.0103]
Steps: 1%|▏ | 14779/1000000 [4:30:49<2801:28:58, 10.24s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [14779], local_loss=0.022868186235427856, train_loss=0.018375590443611145, time_cost=1.8281707763671875
+
Steps: 1%|▏ | 14779/1000000 [4:30:49<2801:28:58, 10.24s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%|▏ | 14780/1000000 [4:30:55<2456:13:33, 8.98s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [14780], local_loss=0.008868050761520863, train_loss=0.0421130396425724, time_cost=1.9158589839935303
+
Steps: 1%|▏ | 14780/1000000 [4:30:55<2456:13:33, 8.98s/it, lr=1e-5, step_loss=0.00887]
Steps: 1%|▏ | 14781/1000000 [4:31:07<2721:24:15, 9.94s/it, lr=1e-5, step_loss=0.00887][RANK-0]: Step: [14781], local_loss=0.008531318977475166, train_loss=0.04000496864318848, time_cost=4.0945892333984375
+
Steps: 1%|▏ | 14781/1000000 [4:31:07<2721:24:15, 9.94s/it, lr=1e-5, step_loss=0.00853]
Steps: 1%|▏ | 14782/1000000 [4:31:15<2549:14:21, 9.31s/it, lr=1e-5, step_loss=0.00853][RANK-0]: Step: [14782], local_loss=0.03146262466907501, train_loss=0.03291485458612442, time_cost=1.5769782066345215
+
Steps: 1%|▏ | 14782/1000000 [4:31:15<2549:14:21, 9.31s/it, lr=1e-5, step_loss=0.0315]
Steps: 1%|▏ | 14783/1000000 [4:31:21<2324:13:41, 8.49s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [14783], local_loss=0.006524811964482069, train_loss=0.049108412116765976, time_cost=1.6601858139038086
+
Steps: 1%|▏ | 14783/1000000 [4:31:21<2324:13:41, 8.49s/it, lr=1e-5, step_loss=0.00652]
Steps: 1%|▏ | 14784/1000000 [4:31:36<2789:10:12, 10.19s/it, lr=1e-5, step_loss=0.00652][RANK-0]: Step: [14784], local_loss=0.06674962490797043, train_loss=0.029498636722564697, time_cost=1.9537417888641357
+
Steps: 1%|▏ | 14784/1000000 [4:31:36<2789:10:12, 10.19s/it, lr=1e-5, step_loss=0.0667]
Steps: 1%|▏ | 14785/1000000 [4:31:52<3301:07:20, 12.06s/it, lr=1e-5, step_loss=0.0667][RANK-0]: Step: [14785], local_loss=0.004551581107079983, train_loss=0.07398337870836258, time_cost=8.131178617477417
+
Steps: 1%|▏ | 14785/1000000 [4:31:52<3301:07:20, 12.06s/it, lr=1e-5, step_loss=0.00455]
Steps: 1%|▏ | 14786/1000000 [4:31:57<2742:55:45, 10.02s/it, lr=1e-5, step_loss=0.00455][RANK-0]: Step: [14786], local_loss=0.004014684818685055, train_loss=0.029241304844617844, time_cost=2.417013168334961
+
Steps: 1%|▏ | 14786/1000000 [4:31:57<2742:55:45, 10.02s/it, lr=1e-5, step_loss=0.00401]
Steps: 1%|▏ | 14787/1000000 [4:32:08<2799:22:30, 10.23s/it, lr=1e-5, step_loss=0.00401][RANK-0]: Step: [14787], local_loss=0.01321581844240427, train_loss=0.1387391835451126, time_cost=3.033693313598633
+
Steps: 1%|▏ | 14787/1000000 [4:32:08<2799:22:30, 10.23s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%|▏ | 14788/1000000 [4:32:19<2862:21:12, 10.46s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [14788], local_loss=0.022436074912548065, train_loss=0.06224597245454788, time_cost=1.5777034759521484
+
Steps: 1%|▏ | 14788/1000000 [4:32:19<2862:21:12, 10.46s/it, lr=1e-5, step_loss=0.0224]
Steps: 1%|▏ | 14789/1000000 [4:32:34<3256:01:24, 11.90s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [14789], local_loss=0.020938629284501076, train_loss=0.06824871152639389, time_cost=1.9491233825683594
+
Steps: 1%|▏ | 14789/1000000 [4:32:34<3256:01:24, 11.90s/it, lr=1e-5, step_loss=0.0209]
Steps: 1%|▏ | 14790/1000000 [4:32:45<3147:49:56, 11.50s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [14790], local_loss=0.04382288083434105, train_loss=0.03874736651778221, time_cost=1.2349133491516113
+
Steps: 1%|▏ | 14790/1000000 [4:32:45<3147:49:56, 11.50s/it, lr=1e-5, step_loss=0.0438]
Steps: 1%|▏ | 14791/1000000 [4:32:53<2859:10:09, 10.45s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [14791], local_loss=0.014927122741937637, train_loss=0.023195164278149605, time_cost=6.880376815795898
+
Steps: 1%|▏ | 14791/1000000 [4:32:53<2859:10:09, 10.45s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%|▏ | 14792/1000000 [4:32:59<2511:15:24, 9.18s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [14792], local_loss=0.05654202774167061, train_loss=0.028914660215377808, time_cost=1.8314435482025146
+
Steps: 1%|▏ | 14792/1000000 [4:32:59<2511:15:24, 9.18s/it, lr=1e-5, step_loss=0.0565]
Steps: 1%|▏ | 14793/1000000 [4:33:04<2192:11:41, 8.01s/it, lr=1e-5, step_loss=0.0565][RANK-0]: Step: [14793], local_loss=0.20125256478786469, train_loss=0.12844346463680267, time_cost=1.2321248054504395
+
Steps: 1%|▏ | 14793/1000000 [4:33:04<2192:11:41, 8.01s/it, lr=1e-5, step_loss=0.201]
Steps: 1%|▏ | 14794/1000000 [4:33:09<1953:29:50, 7.14s/it, lr=1e-5, step_loss=0.201][RANK-0]: Step: [14794], local_loss=0.012855418026447296, train_loss=0.02728576585650444, time_cost=2.3032679557800293
+
Steps: 1%|▏ | 14794/1000000 [4:33:09<1953:29:50, 7.14s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%|▏ | 14795/1000000 [4:33:14<1724:18:38, 6.30s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [14795], local_loss=0.014855224639177322, train_loss=0.19601213932037354, time_cost=1.7855050563812256
+
Steps: 1%|▏ | 14795/1000000 [4:33:14<1724:18:38, 6.30s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%|▏ | 14796/1000000 [4:33:22<1913:31:55, 6.99s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [14796], local_loss=0.01272561214864254, train_loss=0.0710512101650238, time_cost=1.443211317062378
+
Steps: 1%|▏ | 14796/1000000 [4:33:22<1913:31:55, 6.99s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%|▏ | 14797/1000000 [4:33:29<1930:37:37, 7.05s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [14797], local_loss=0.007273312192410231, train_loss=0.016218282282352448, time_cost=1.5101521015167236
+
Steps: 1%|▏ | 14797/1000000 [4:33:29<1930:37:37, 7.05s/it, lr=1e-5, step_loss=0.00727]
Steps: 1%|▏ | 14798/1000000 [4:33:35<1775:30:34, 6.49s/it, lr=1e-5, step_loss=0.00727][RANK-0]: Step: [14798], local_loss=0.011823873966932297, train_loss=0.04968133568763733, time_cost=1.8631386756896973
+
Steps: 1%|▏ | 14798/1000000 [4:33:35<1775:30:34, 6.49s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 14799/1000000 [4:33:47<2276:18:15, 8.32s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14799], local_loss=0.04705583676695824, train_loss=0.05561906099319458, time_cost=2.690302610397339
+
Steps: 1%|▏ | 14799/1000000 [4:33:47<2276:18:15, 8.32s/it, lr=1e-5, step_loss=0.0471]
Steps: 1%|▏ | 14800/1000000 [4:34:05<3039:49:34, 11.11s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [14800], local_loss=0.022454898804426193, train_loss=0.0459202378988266, time_cost=8.98517394065857
+
Steps: 1%|▏ | 14800/1000000 [4:34:05<3039:49:34, 11.11s/it, lr=1e-5, step_loss=0.0225]
Steps: 1%|▏ | 14801/1000000 [4:34:21<3417:51:05, 12.49s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [14801], local_loss=0.00830603577196598, train_loss=0.09358292073011398, time_cost=7.7917492389678955
+
Steps: 1%|▏ | 14801/1000000 [4:34:21<3417:51:05, 12.49s/it, lr=1e-5, step_loss=0.00831]
Steps: 1%|▏ | 14802/1000000 [4:34:28<2970:11:05, 10.85s/it, lr=1e-5, step_loss=0.00831][RANK-0]: Step: [14802], local_loss=0.0058510443195700645, train_loss=0.033347517251968384, time_cost=5.732261657714844
+
Steps: 1%|▏ | 14802/1000000 [4:34:28<2970:11:05, 10.85s/it, lr=1e-5, step_loss=0.00585]
Steps: 1%|▏ | 14803/1000000 [4:34:37<2834:09:28, 10.36s/it, lr=1e-5, step_loss=0.00585][RANK-0]: Step: [14803], local_loss=0.03989694267511368, train_loss=0.05242391303181648, time_cost=3.2324094772338867
+
Steps: 1%|▏ | 14803/1000000 [4:34:37<2834:09:28, 10.36s/it, lr=1e-5, step_loss=0.0399]
Steps: 1%|▏ | 14804/1000000 [4:34:43<2476:51:24, 9.05s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [14804], local_loss=0.006602143403142691, train_loss=0.08661015331745148, time_cost=2.260833740234375
+
Steps: 1%|▏ | 14804/1000000 [4:34:43<2476:51:24, 9.05s/it, lr=1e-5, step_loss=0.0066]
Steps: 1%|▏ | 14805/1000000 [4:34:48<2128:55:38, 7.78s/it, lr=1e-5, step_loss=0.0066][RANK-0]: Step: [14805], local_loss=0.01906452514231205, train_loss=0.06989328563213348, time_cost=1.6746571063995361
+
Steps: 1%|▏ | 14805/1000000 [4:34:48<2128:55:38, 7.78s/it, lr=1e-5, step_loss=0.0191]
Steps: 1%|▏ | 14806/1000000 [4:34:55<2121:18:38, 7.75s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [14806], local_loss=0.026610292494297028, train_loss=0.07962582260370255, time_cost=1.9909813404083252
+
Steps: 1%|▏ | 14806/1000000 [4:34:55<2121:18:38, 7.75s/it, lr=1e-5, step_loss=0.0266]
Steps: 1%|▏ | 14807/1000000 [4:35:11<2734:29:25, 9.99s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [14807], local_loss=0.03085925243794918, train_loss=0.09881483018398285, time_cost=1.2211668491363525
+
Steps: 1%|▏ | 14807/1000000 [4:35:11<2734:29:25, 9.99s/it, lr=1e-5, step_loss=0.0309]
Steps: 1%|▏ | 14808/1000000 [4:35:18<2544:55:28, 9.30s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [14808], local_loss=0.013425284996628761, train_loss=0.031245749443769455, time_cost=3.4435243606567383
+
Steps: 1%|▏ | 14808/1000000 [4:35:18<2544:55:28, 9.30s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%|▏ | 14809/1000000 [4:35:30<2710:46:55, 9.91s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [14809], local_loss=0.07911606132984161, train_loss=0.05628331005573273, time_cost=8.751146078109741
+
Steps: 1%|▏ | 14809/1000000 [4:35:30<2710:46:55, 9.91s/it, lr=1e-5, step_loss=0.0791]
Steps: 1%|▏ | 14810/1000000 [4:35:46<3213:24:12, 11.74s/it, lr=1e-5, step_loss=0.0791][RANK-0]: Step: [14810], local_loss=0.389151394367218, train_loss=0.11878819018602371, time_cost=4.353291034698486
+
Steps: 1%|▏ | 14810/1000000 [4:35:46<3213:24:12, 11.74s/it, lr=1e-5, step_loss=0.389]
Steps: 1%|▏ | 14811/1000000 [4:35:50<2603:41:30, 9.51s/it, lr=1e-5, step_loss=0.389][RANK-0]: Step: [14811], local_loss=0.004962211009114981, train_loss=0.023937920108437538, time_cost=1.2239768505096436
+
Steps: 1%|▏ | 14811/1000000 [4:35:50<2603:41:30, 9.51s/it, lr=1e-5, step_loss=0.00496]
Steps: 1%|▏ | 14812/1000000 [4:35:54<2191:42:25, 8.01s/it, lr=1e-5, step_loss=0.00496][RANK-0]: Step: [14812], local_loss=0.005521959159523249, train_loss=0.05101299285888672, time_cost=1.259460687637329
+
Steps: 1%|▏ | 14812/1000000 [4:35:54<2191:42:25, 8.01s/it, lr=1e-5, step_loss=0.00552]
Steps: 1%|▏ | 14813/1000000 [4:36:05<2405:45:08, 8.79s/it, lr=1e-5, step_loss=0.00552][RANK-0]: Step: [14813], local_loss=0.04479566216468811, train_loss=0.023201484233140945, time_cost=1.563767910003662
+
Steps: 1%|▏ | 14813/1000000 [4:36:05<2405:45:08, 8.79s/it, lr=1e-5, step_loss=0.0448]
Steps: 1%|▏ | 14814/1000000 [4:36:18<2790:14:09, 10.20s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [14814], local_loss=0.024646004661917686, train_loss=0.028846604749560356, time_cost=4.551395654678345
+
Steps: 1%|▏ | 14814/1000000 [4:36:18<2790:14:09, 10.20s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%|▏ | 14815/1000000 [4:36:33<3108:01:23, 11.36s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [14815], local_loss=0.016155889257788658, train_loss=0.03301559388637543, time_cost=4.212308406829834
+
Steps: 1%|▏ | 14815/1000000 [4:36:33<3108:01:23, 11.36s/it, lr=1e-5, step_loss=0.0162]
Steps: 1%|▏ | 14816/1000000 [4:36:42<2990:40:58, 10.93s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [14816], local_loss=0.011028851382434368, train_loss=0.009879467077553272, time_cost=1.232269048690796
+
Steps: 1%|▏ | 14816/1000000 [4:36:42<2990:40:58, 10.93s/it, lr=1e-5, step_loss=0.011]
Steps: 1%|▏ | 14817/1000000 [4:36:56<3181:10:25, 11.62s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [14817], local_loss=0.011789103038609028, train_loss=0.17564740777015686, time_cost=3.7086598873138428
+
Steps: 1%|▏ | 14817/1000000 [4:36:56<3181:10:25, 11.62s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 14818/1000000 [4:37:03<2804:21:02, 10.25s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14818], local_loss=0.005012230481952429, train_loss=7.963468074798584, time_cost=2.623952627182007
+
Steps: 1%|▏ | 14818/1000000 [4:37:03<2804:21:02, 10.25s/it, lr=1e-5, step_loss=0.00501]
Steps: 1%|▏ | 14819/1000000 [4:37:08<2394:58:15, 8.75s/it, lr=1e-5, step_loss=0.00501][RANK-0]: Step: [14819], local_loss=0.017702795565128326, train_loss=0.05346386879682541, time_cost=2.484811544418335
+
Steps: 1%|▏ | 14819/1000000 [4:37:08<2394:58:15, 8.75s/it, lr=1e-5, step_loss=0.0177]
Steps: 1%|▏ | 14820/1000000 [4:37:21<2749:10:07, 10.05s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [14820], local_loss=0.012014608830213547, train_loss=0.022114213556051254, time_cost=4.284900188446045
+
Steps: 1%|▏ | 14820/1000000 [4:37:21<2749:10:07, 10.05s/it, lr=1e-5, step_loss=0.012]
Steps: 1%|▏ | 14821/1000000 [4:37:35<3053:13:54, 11.16s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [14821], local_loss=0.023724708706140518, train_loss=0.02087450958788395, time_cost=1.22501540184021
+
Steps: 1%|▏ | 14821/1000000 [4:37:35<3053:13:54, 11.16s/it, lr=1e-5, step_loss=0.0237]
Steps: 1%|▏ | 14822/1000000 [4:37:45<2940:44:44, 10.75s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [14822], local_loss=0.007905077189207077, train_loss=0.023478591814637184, time_cost=3.8023836612701416
+
Steps: 1%|▏ | 14822/1000000 [4:37:45<2940:44:44, 10.75s/it, lr=1e-5, step_loss=0.00791]
Steps: 1%|▏ | 14823/1000000 [4:38:01<3393:20:51, 12.40s/it, lr=1e-5, step_loss=0.00791][RANK-0]: Step: [14823], local_loss=0.033159732818603516, train_loss=0.08190475404262543, time_cost=6.152115345001221
+
Steps: 1%|▏ | 14823/1000000 [4:38:01<3393:20:51, 12.40s/it, lr=1e-5, step_loss=0.0332]
Steps: 1%|▏ | 14824/1000000 [4:38:05<2737:03:25, 10.00s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [14824], local_loss=0.005802393425256014, train_loss=0.022196782752871513, time_cost=3.2853963375091553
+
Steps: 1%|▏ | 14824/1000000 [4:38:05<2737:03:25, 10.00s/it, lr=1e-5, step_loss=0.0058]
Steps: 1%|▏ | 14825/1000000 [4:38:15<2753:54:05, 10.06s/it, lr=1e-5, step_loss=0.0058][RANK-0]: Step: [14825], local_loss=0.06451235711574554, train_loss=0.04599297419190407, time_cost=4.047606706619263
+
Steps: 1%|▏ | 14825/1000000 [4:38:15<2753:54:05, 10.06s/it, lr=1e-5, step_loss=0.0645]
Steps: 1%|▏ | 14826/1000000 [4:38:27<2904:38:05, 10.61s/it, lr=1e-5, step_loss=0.0645][RANK-0]: Step: [14826], local_loss=0.00855304580181837, train_loss=0.0285965483635664, time_cost=1.2130327224731445
+
Steps: 1%|▏ | 14826/1000000 [4:38:27<2904:38:05, 10.61s/it, lr=1e-5, step_loss=0.00855]
Steps: 1%|▏ | 14827/1000000 [4:38:33<2508:51:37, 9.17s/it, lr=1e-5, step_loss=0.00855][RANK-0]: Step: [14827], local_loss=0.005043285898864269, train_loss=0.015779532492160797, time_cost=1.4831554889678955
+
Steps: 1%|▏ | 14827/1000000 [4:38:33<2508:51:37, 9.17s/it, lr=1e-5, step_loss=0.00504]
Steps: 1%|▏ | 14828/1000000 [4:38:45<2689:34:33, 9.83s/it, lr=1e-5, step_loss=0.00504][RANK-0]: Step: [14828], local_loss=0.00866618100553751, train_loss=0.06484987586736679, time_cost=2.7751071453094482
+
Steps: 1%|▏ | 14828/1000000 [4:38:45<2689:34:33, 9.83s/it, lr=1e-5, step_loss=0.00867]
Steps: 1%|▏ | 14829/1000000 [4:38:50<2297:48:11, 8.40s/it, lr=1e-5, step_loss=0.00867][RANK-0]: Step: [14829], local_loss=0.006524417549371719, train_loss=0.025813501328229904, time_cost=2.2193331718444824
+
Steps: 1%|▏ | 14829/1000000 [4:38:50<2297:48:11, 8.40s/it, lr=1e-5, step_loss=0.00652]
Steps: 1%|▏ | 14830/1000000 [4:39:03<2724:54:00, 9.96s/it, lr=1e-5, step_loss=0.00652][RANK-0]: Step: [14830], local_loss=0.05991845205426216, train_loss=0.02402748540043831, time_cost=2.2206180095672607
+
Steps: 1%|▏ | 14830/1000000 [4:39:03<2724:54:00, 9.96s/it, lr=1e-5, step_loss=0.0599]
Steps: 1%|▏ | 14831/1000000 [4:39:10<2482:55:19, 9.07s/it, lr=1e-5, step_loss=0.0599][RANK-0]: Step: [14831], local_loss=0.013099508360028267, train_loss=5.575165271759033, time_cost=2.61555814743042
+
Steps: 1%|▏ | 14831/1000000 [4:39:10<2482:55:19, 9.07s/it, lr=1e-5, step_loss=0.0131]
Steps: 1%|▏ | 14832/1000000 [4:39:17<2301:57:18, 8.41s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [14832], local_loss=0.006344515364617109, train_loss=0.029359815642237663, time_cost=3.0772736072540283
+
Steps: 1%|▏ | 14832/1000000 [4:39:17<2301:57:18, 8.41s/it, lr=1e-5, step_loss=0.00634]
Steps: 1%|▏ | 14833/1000000 [4:39:31<2797:04:00, 10.22s/it, lr=1e-5, step_loss=0.00634][RANK-0]: Step: [14833], local_loss=0.008183229714632034, train_loss=0.032412074506282806, time_cost=5.297165870666504
+
Steps: 1%|▏ | 14833/1000000 [4:39:32<2797:04:00, 10.22s/it, lr=1e-5, step_loss=0.00818]
Steps: 1%|▏ | 14834/1000000 [4:39:37<2444:30:35, 8.93s/it, lr=1e-5, step_loss=0.00818][RANK-0]: Step: [14834], local_loss=0.013165713287889957, train_loss=0.014291333965957165, time_cost=1.666050910949707
+
Steps: 1%|▏ | 14834/1000000 [4:39:37<2444:30:35, 8.93s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%|▏ | 14835/1000000 [4:39:51<2852:04:29, 10.42s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [14835], local_loss=0.05431947857141495, train_loss=0.029154865071177483, time_cost=4.6055755615234375
+
Steps: 1%|▏ | 14835/1000000 [4:39:51<2852:04:29, 10.42s/it, lr=1e-5, step_loss=0.0543]
Steps: 1%|▏ | 14836/1000000 [4:40:04<3048:08:48, 11.14s/it, lr=1e-5, step_loss=0.0543][RANK-0]: Step: [14836], local_loss=0.009044593200087547, train_loss=0.05978616327047348, time_cost=5.127327919006348
+
Steps: 1%|▏ | 14836/1000000 [4:40:04<3048:08:48, 11.14s/it, lr=1e-5, step_loss=0.00904]
Steps: 1%|▏ | 14837/1000000 [4:40:19<3357:18:08, 12.27s/it, lr=1e-5, step_loss=0.00904][RANK-0]: Step: [14837], local_loss=0.036407601088285446, train_loss=0.09661763906478882, time_cost=5.8117735385894775
+
Steps: 1%|▏ | 14837/1000000 [4:40:19<3357:18:08, 12.27s/it, lr=1e-5, step_loss=0.0364]
Steps: 1%|▏ | 14838/1000000 [4:40:27<2964:39:31, 10.83s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [14838], local_loss=0.026945089921355247, train_loss=0.018328092992305756, time_cost=3.2910406589508057
+
Steps: 1%|▏ | 14838/1000000 [4:40:27<2964:39:31, 10.83s/it, lr=1e-5, step_loss=0.0269]
Steps: 1%|▏ | 14839/1000000 [4:40:41<3257:04:37, 11.90s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [14839], local_loss=0.09669850021600723, train_loss=8.161358833312988, time_cost=2.2960116863250732
+
Steps: 1%|▏ | 14839/1000000 [4:40:41<3257:04:37, 11.90s/it, lr=1e-5, step_loss=0.0967]
Steps: 1%|▏ | 14840/1000000 [4:40:46<2683:38:29, 9.81s/it, lr=1e-5, step_loss=0.0967][RANK-0]: Step: [14840], local_loss=0.0648893415927887, train_loss=0.017864849418401718, time_cost=1.319504737854004
+
Steps: 1%|▏ | 14840/1000000 [4:40:46<2683:38:29, 9.81s/it, lr=1e-5, step_loss=0.0649]
Steps: 1%|▏ | 14841/1000000 [4:40:59<2924:16:13, 10.69s/it, lr=1e-5, step_loss=0.0649][RANK-0]: Step: [14841], local_loss=0.02936621569097042, train_loss=0.037061143666505814, time_cost=1.268625020980835
+
Steps: 1%|▏ | 14841/1000000 [4:40:59<2924:16:13, 10.69s/it, lr=1e-5, step_loss=0.0294]
Steps: 1%|▏ | 14842/1000000 [4:41:03<2411:50:40, 8.81s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [14842], local_loss=0.05391436070203781, train_loss=0.05640628933906555, time_cost=1.3912291526794434
+
Steps: 1%|▏ | 14842/1000000 [4:41:03<2411:50:40, 8.81s/it, lr=1e-5, step_loss=0.0539]
Steps: 1%|▏ | 14843/1000000 [4:41:09<2162:17:56, 7.90s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [14843], local_loss=0.9915362000465393, train_loss=0.14502274990081787, time_cost=1.3004136085510254
+
Steps: 1%|▏ | 14843/1000000 [4:41:09<2162:17:56, 7.90s/it, lr=1e-5, step_loss=0.992]
Steps: 1%|▏ | 14844/1000000 [4:41:26<2932:05:45, 10.71s/it, lr=1e-5, step_loss=0.992][RANK-0]: Step: [14844], local_loss=0.01064026728272438, train_loss=0.0230155810713768, time_cost=5.1924803256988525
+
Steps: 1%|▏ | 14844/1000000 [4:41:26<2932:05:45, 10.71s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%|▏ | 14845/1000000 [4:41:34<2701:13:13, 9.87s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [14845], local_loss=0.004120511468499899, train_loss=0.02271994575858116, time_cost=3.84908127784729
+
Steps: 1%|▏ | 14845/1000000 [4:41:34<2701:13:13, 9.87s/it, lr=1e-5, step_loss=0.00412]
Steps: 1%|▏ | 14846/1000000 [4:41:40<2420:47:04, 8.85s/it, lr=1e-5, step_loss=0.00412][RANK-0]: Step: [14846], local_loss=0.10944651067256927, train_loss=0.10224325954914093, time_cost=2.684401512145996
+
Steps: 1%|▏ | 14846/1000000 [4:41:40<2420:47:04, 8.85s/it, lr=1e-5, step_loss=0.109]
Steps: 1%|▏ | 14847/1000000 [4:41:51<2533:45:28, 9.26s/it, lr=1e-5, step_loss=0.109][RANK-0]: Step: [14847], local_loss=0.009811503812670708, train_loss=0.05238841846585274, time_cost=2.2114155292510986
+
Steps: 1%|▏ | 14847/1000000 [4:41:51<2533:45:28, 9.26s/it, lr=1e-5, step_loss=0.00981]
Steps: 1%|▏ | 14848/1000000 [4:42:01<2656:19:53, 9.71s/it, lr=1e-5, step_loss=0.00981][RANK-0]: Step: [14848], local_loss=0.008690925315022469, train_loss=0.1441684365272522, time_cost=3.019426107406616
+
Steps: 1%|▏ | 14848/1000000 [4:42:01<2656:19:53, 9.71s/it, lr=1e-5, step_loss=0.00869]
Steps: 1%|▏ | 14849/1000000 [4:42:12<2692:50:00, 9.84s/it, lr=1e-5, step_loss=0.00869][RANK-0]: Step: [14849], local_loss=0.004557411652058363, train_loss=0.054172225296497345, time_cost=3.872861385345459
+
Steps: 1%|▏ | 14849/1000000 [4:42:12<2692:50:00, 9.84s/it, lr=1e-5, step_loss=0.00456]
Steps: 1%|▏ | 14850/1000000 [4:42:24<2917:13:33, 10.66s/it, lr=1e-5, step_loss=0.00456][RANK-0]: Step: [14850], local_loss=0.020394615828990936, train_loss=0.039784468710422516, time_cost=8.588949918746948
+
Steps: 1%|▏ | 14850/1000000 [4:42:24<2917:13:33, 10.66s/it, lr=1e-5, step_loss=0.0204]
Steps: 1%|▏ | 14851/1000000 [4:42:29<2435:52:46, 8.90s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [14851], local_loss=0.029995985329151154, train_loss=0.0241068247705698, time_cost=2.3414998054504395
+
Steps: 1%|▏ | 14851/1000000 [4:42:29<2435:52:46, 8.90s/it, lr=1e-5, step_loss=0.03]
Steps: 1%|▏ | 14852/1000000 [4:42:46<3105:40:13, 11.35s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [14852], local_loss=0.03151186555624008, train_loss=0.02915005013346672, time_cost=9.544118404388428
+
Steps: 1%|▏ | 14852/1000000 [4:42:46<3105:40:13, 11.35s/it, lr=1e-5, step_loss=0.0315]
Steps: 1%|▏ | 14853/1000000 [4:42:50<2525:11:39, 9.23s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [14853], local_loss=0.01286035031080246, train_loss=0.03426756337285042, time_cost=1.4227573871612549
+
Steps: 1%|▏ | 14853/1000000 [4:42:50<2525:11:39, 9.23s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%|▏ | 14854/1000000 [4:42:55<2135:35:50, 7.80s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [14854], local_loss=0.04536835104227066, train_loss=0.03085096925497055, time_cost=1.5889694690704346
+
Steps: 1%|▏ | 14854/1000000 [4:42:55<2135:35:50, 7.80s/it, lr=1e-5, step_loss=0.0454]
Steps: 1%|▏ | 14855/1000000 [4:43:03<2163:01:27, 7.90s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [14855], local_loss=0.018957801163196564, train_loss=0.060819678008556366, time_cost=4.62339186668396
+
Steps: 1%|▏ | 14855/1000000 [4:43:03<2163:01:27, 7.90s/it, lr=1e-5, step_loss=0.019]
Steps: 1%|▏ | 14856/1000000 [4:43:13<2374:34:44, 8.68s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [14856], local_loss=0.3215089738368988, train_loss=0.07736272364854813, time_cost=1.8554739952087402
+
Steps: 1%|▏ | 14856/1000000 [4:43:13<2374:34:44, 8.68s/it, lr=1e-5, step_loss=0.322]
Steps: 1%|▏ | 14857/1000000 [4:43:19<2085:54:59, 7.62s/it, lr=1e-5, step_loss=0.322][RANK-0]: Step: [14857], local_loss=0.010412374511361122, train_loss=0.026415303349494934, time_cost=2.189988136291504
+
Steps: 1%|▏ | 14857/1000000 [4:43:19<2085:54:59, 7.62s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%|▏ | 14858/1000000 [4:43:27<2187:26:42, 7.99s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [14858], local_loss=0.0193818099796772, train_loss=0.05466897785663605, time_cost=2.930114269256592
+
Steps: 1%|▏ | 14858/1000000 [4:43:27<2187:26:42, 7.99s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%|▏ | 14859/1000000 [4:43:38<2437:12:17, 8.91s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [14859], local_loss=0.011665517464280128, train_loss=0.02344628795981407, time_cost=1.72764253616333
+
Steps: 1%|▏ | 14859/1000000 [4:43:38<2437:12:17, 8.91s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%|▏ | 14860/1000000 [4:43:43<2056:13:13, 7.51s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [14860], local_loss=0.014888815581798553, train_loss=0.05460862070322037, time_cost=1.2703399658203125
+
Steps: 1%|▏ | 14860/1000000 [4:43:43<2056:13:13, 7.51s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%|▏ | 14861/1000000 [4:43:51<2083:24:32, 7.61s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [14861], local_loss=0.02450750209391117, train_loss=0.16053202748298645, time_cost=2.1387102603912354
+
Steps: 1%|▏ | 14861/1000000 [4:43:51<2083:24:32, 7.61s/it, lr=1e-5, step_loss=0.0245]
Steps: 1%|▏ | 14862/1000000 [4:43:59<2167:38:38, 7.92s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [14862], local_loss=0.03185471147298813, train_loss=0.147830992937088, time_cost=1.2181892395019531
+
Steps: 1%|▏ | 14862/1000000 [4:43:59<2167:38:38, 7.92s/it, lr=1e-5, step_loss=0.0319]
Steps: 1%|▏ | 14863/1000000 [4:44:05<1978:17:14, 7.23s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [14863], local_loss=0.01168286893516779, train_loss=0.1738828718662262, time_cost=2.9378087520599365
+
Steps: 1%|▏ | 14863/1000000 [4:44:05<1978:17:14, 7.23s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%|▏ | 14864/1000000 [4:44:12<1950:37:36, 7.13s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [14864], local_loss=0.23220379650592804, train_loss=0.06015945225954056, time_cost=2.933713674545288
+
Steps: 1%|▏ | 14864/1000000 [4:44:12<1950:37:36, 7.13s/it, lr=1e-5, step_loss=0.232]
Steps: 1%|▏ | 14865/1000000 [4:44:24<2395:07:43, 8.75s/it, lr=1e-5, step_loss=0.232][RANK-0]: Step: [14865], local_loss=0.018582938238978386, train_loss=0.031575486063957214, time_cost=4.049297571182251
+
Steps: 1%|▏ | 14865/1000000 [4:44:24<2395:07:43, 8.75s/it, lr=1e-5, step_loss=0.0186]
Steps: 1%|▏ | 14866/1000000 [4:44:31<2234:56:27, 8.17s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [14866], local_loss=0.010844404809176922, train_loss=0.01850321516394615, time_cost=1.991137981414795
+
Steps: 1%|▏ | 14866/1000000 [4:44:31<2234:56:27, 8.17s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%|▏ | 14867/1000000 [4:44:36<1984:46:09, 7.25s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [14867], local_loss=0.09466363489627838, train_loss=0.038755107671022415, time_cost=2.0824391841888428
+
Steps: 1%|▏ | 14867/1000000 [4:44:36<1984:46:09, 7.25s/it, lr=1e-5, step_loss=0.0947]
Steps: 1%|▏ | 14868/1000000 [4:44:47<2313:11:28, 8.45s/it, lr=1e-5, step_loss=0.0947][RANK-0]: Step: [14868], local_loss=0.07670803368091583, train_loss=0.05899801850318909, time_cost=3.045818567276001
+
Steps: 1%|▏ | 14868/1000000 [4:44:47<2313:11:28, 8.45s/it, lr=1e-5, step_loss=0.0767]
Steps: 1%|▏ | 14869/1000000 [4:45:02<2815:00:41, 10.29s/it, lr=1e-5, step_loss=0.0767][RANK-0]: Step: [14869], local_loss=0.014161542989313602, train_loss=0.012871958315372467, time_cost=6.033977031707764
+
Steps: 1%|▏ | 14869/1000000 [4:45:02<2815:00:41, 10.29s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%|▏ | 14870/1000000 [4:45:07<2395:00:50, 8.75s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [14870], local_loss=0.008356627076864243, train_loss=31.253324508666992, time_cost=2.3718066215515137
+
Steps: 1%|▏ | 14870/1000000 [4:45:07<2395:00:50, 8.75s/it, lr=1e-5, step_loss=0.00836]
Steps: 1%|▏ | 14871/1000000 [4:45:18<2540:08:13, 9.28s/it, lr=1e-5, step_loss=0.00836][RANK-0]: Step: [14871], local_loss=0.01099596917629242, train_loss=43.17817306518555, time_cost=2.065843105316162
+
Steps: 1%|▏ | 14871/1000000 [4:45:18<2540:08:13, 9.28s/it, lr=1e-5, step_loss=0.011]
Steps: 1%|▏ | 14872/1000000 [4:45:22<2152:45:23, 7.87s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [14872], local_loss=0.011485685594379902, train_loss=0.07860848307609558, time_cost=2.1320953369140625
+
Steps: 1%|▏ | 14872/1000000 [4:45:22<2152:45:23, 7.87s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%|▏ | 14873/1000000 [4:45:33<2359:03:36, 8.62s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [14873], local_loss=0.00956951268017292, train_loss=0.059486325830221176, time_cost=3.9265198707580566
+
Steps: 1%|▏ | 14873/1000000 [4:45:33<2359:03:36, 8.62s/it, lr=1e-5, step_loss=0.00957]
Steps: 1%|▏ | 14874/1000000 [4:45:44<2586:00:45, 9.45s/it, lr=1e-5, step_loss=0.00957][RANK-0]: Step: [14874], local_loss=0.09483054280281067, train_loss=0.032478779554367065, time_cost=1.213463306427002
+
Steps: 1%|▏ | 14874/1000000 [4:45:44<2586:00:45, 9.45s/it, lr=1e-5, step_loss=0.0948]
Steps: 1%|▏ | 14875/1000000 [4:45:57<2868:36:33, 10.48s/it, lr=1e-5, step_loss=0.0948][RANK-0]: Step: [14875], local_loss=0.004477085545659065, train_loss=0.029415035620331764, time_cost=1.2315211296081543
+
Steps: 1%|▏ | 14875/1000000 [4:45:57<2868:36:33, 10.48s/it, lr=1e-5, step_loss=0.00448]
Steps: 1%|▏ | 14876/1000000 [4:46:04<2630:25:03, 9.61s/it, lr=1e-5, step_loss=0.00448][RANK-0]: Step: [14876], local_loss=0.2974267303943634, train_loss=0.06547889858484268, time_cost=2.129624128341675
+
Steps: 1%|▏ | 14876/1000000 [4:46:04<2630:25:03, 9.61s/it, lr=1e-5, step_loss=0.297]
Steps: 1%|▏ | 14877/1000000 [4:46:09<2255:21:02, 8.24s/it, lr=1e-5, step_loss=0.297][RANK-0]: Step: [14877], local_loss=0.006797281093895435, train_loss=0.08902861922979355, time_cost=2.277390718460083
+
Steps: 1%|▏ | 14877/1000000 [4:46:10<2255:21:02, 8.24s/it, lr=1e-5, step_loss=0.0068]
Steps: 1%|▏ | 14878/1000000 [4:46:24<2783:56:21, 10.17s/it, lr=1e-5, step_loss=0.0068][RANK-0]: Step: [14878], local_loss=0.008729508146643639, train_loss=0.03729673847556114, time_cost=6.384620904922485
+
Steps: 1%|▏ | 14878/1000000 [4:46:24<2783:56:21, 10.17s/it, lr=1e-5, step_loss=0.00873]
Steps: 1%|▏ | 14879/1000000 [4:46:38<3113:17:46, 11.38s/it, lr=1e-5, step_loss=0.00873][RANK-0]: Step: [14879], local_loss=0.003822814440354705, train_loss=0.07443229854106903, time_cost=5.392857313156128
+
Steps: 1%|▏ | 14879/1000000 [4:46:38<3113:17:46, 11.38s/it, lr=1e-5, step_loss=0.00382]
Steps: 1%|▏ | 14880/1000000 [4:46:51<3244:08:25, 11.86s/it, lr=1e-5, step_loss=0.00382][RANK-0]: Step: [14880], local_loss=0.023162072524428368, train_loss=0.07429881393909454, time_cost=1.8364183902740479
+
Steps: 1%|▏ | 14880/1000000 [4:46:51<3244:08:25, 11.86s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%|▏ | 14881/1000000 [4:46:57<2775:57:32, 10.14s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [14881], local_loss=0.0033265589736402035, train_loss=0.045768268406391144, time_cost=2.000793695449829
+
Steps: 1%|▏ | 14881/1000000 [4:46:57<2775:57:32, 10.14s/it, lr=1e-5, step_loss=0.00333]
Steps: 1%|▏ | 14882/1000000 [4:47:12<3161:07:09, 11.55s/it, lr=1e-5, step_loss=0.00333][RANK-0]: Step: [14882], local_loss=0.20817150175571442, train_loss=0.058191344141960144, time_cost=5.534129619598389
+
Steps: 1%|▏ | 14882/1000000 [4:47:12<3161:07:09, 11.55s/it, lr=1e-5, step_loss=0.208]
Steps: 1%|▏ | 14883/1000000 [4:47:22<3025:14:58, 11.06s/it, lr=1e-5, step_loss=0.208][RANK-0]: Step: [14883], local_loss=0.004683848936110735, train_loss=0.016470635309815407, time_cost=8.400119304656982
+
Steps: 1%|▏ | 14883/1000000 [4:47:22<3025:14:58, 11.06s/it, lr=1e-5, step_loss=0.00468]
Steps: 1%|▏ | 14884/1000000 [4:47:27<2471:32:35, 9.03s/it, lr=1e-5, step_loss=0.00468][RANK-0]: Step: [14884], local_loss=0.0325736328959465, train_loss=0.03558166325092316, time_cost=1.3867528438568115
+
Steps: 1%|▏ | 14884/1000000 [4:47:27<2471:32:35, 9.03s/it, lr=1e-5, step_loss=0.0326]
Steps: 1%|▏ | 14885/1000000 [4:47:31<2082:57:43, 7.61s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [14885], local_loss=0.06649735569953918, train_loss=0.02406466379761696, time_cost=1.2034509181976318
+
Steps: 1%|▏ | 14885/1000000 [4:47:31<2082:57:43, 7.61s/it, lr=1e-5, step_loss=0.0665]
Steps: 1%|▏ | 14886/1000000 [4:47:35<1802:04:12, 6.59s/it, lr=1e-5, step_loss=0.0665][RANK-0]: Step: [14886], local_loss=0.012897252105176449, train_loss=0.019658301025629044, time_cost=1.5552630424499512
+
Steps: 1%|▏ | 14886/1000000 [4:47:35<1802:04:12, 6.59s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%|▏ | 14887/1000000 [4:47:41<1739:37:22, 6.36s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [14887], local_loss=0.035976752638816833, train_loss=0.03292878717184067, time_cost=2.5460376739501953
+
Steps: 1%|▏ | 14887/1000000 [4:47:41<1739:37:22, 6.36s/it, lr=1e-5, step_loss=0.036]
Steps: 1%|▏ | 14888/1000000 [4:47:52<2131:11:20, 7.79s/it, lr=1e-5, step_loss=0.036][RANK-0]: Step: [14888], local_loss=0.03590158745646477, train_loss=0.039994653314352036, time_cost=2.4313416481018066
+
Steps: 1%|▏ | 14888/1000000 [4:47:52<2131:11:20, 7.79s/it, lr=1e-5, step_loss=0.0359]
Steps: 1%|▏ | 14889/1000000 [4:48:05<2583:21:46, 9.44s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [14889], local_loss=0.07831653952598572, train_loss=0.07773766666650772, time_cost=4.103408336639404
+
Steps: 1%|▏ | 14889/1000000 [4:48:05<2583:21:46, 9.44s/it, lr=1e-5, step_loss=0.0783]
Steps: 1%|▏ | 14890/1000000 [4:48:13<2454:40:28, 8.97s/it, lr=1e-5, step_loss=0.0783][RANK-0]: Step: [14890], local_loss=0.011599493212997913, train_loss=0.1914181113243103, time_cost=2.2278833389282227
+
Steps: 1%|▏ | 14890/1000000 [4:48:13<2454:40:28, 8.97s/it, lr=1e-5, step_loss=0.0116]
Steps: 1%|▏ | 14891/1000000 [4:48:18<2136:16:04, 7.81s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [14891], local_loss=0.03936706855893135, train_loss=0.04050739109516144, time_cost=2.1330044269561768
+
Steps: 1%|▏ | 14891/1000000 [4:48:18<2136:16:04, 7.81s/it, lr=1e-5, step_loss=0.0394]
Steps: 1%|▏ | 14892/1000000 [4:48:24<1964:44:22, 7.18s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [14892], local_loss=0.010369147174060345, train_loss=15.301007270812988, time_cost=2.7393319606781006
+
Steps: 1%|▏ | 14892/1000000 [4:48:24<1964:44:22, 7.18s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%|▏ | 14893/1000000 [4:48:38<2526:15:54, 9.23s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [14893], local_loss=0.03423941880464554, train_loss=0.08997899293899536, time_cost=5.656130313873291
+
Steps: 1%|▏ | 14893/1000000 [4:48:38<2526:15:54, 9.23s/it, lr=1e-5, step_loss=0.0342]
Steps: 1%|▏ | 14894/1000000 [4:48:50<2719:09:53, 9.94s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [14894], local_loss=0.007291649002581835, train_loss=0.020896239206194878, time_cost=3.0508487224578857
+
Steps: 1%|▏ | 14894/1000000 [4:48:50<2719:09:53, 9.94s/it, lr=1e-5, step_loss=0.00729]
Steps: 1%|▏ | 14895/1000000 [4:49:00<2776:04:17, 10.14s/it, lr=1e-5, step_loss=0.00729][RANK-0]: Step: [14895], local_loss=0.010838943533599377, train_loss=0.020304029807448387, time_cost=1.2332305908203125
+
Steps: 1%|▏ | 14895/1000000 [4:49:00<2776:04:17, 10.14s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%|▏ | 14896/1000000 [4:49:11<2866:01:10, 10.47s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [14896], local_loss=0.008966566063463688, train_loss=0.01889868453145027, time_cost=1.9922287464141846
+
Steps: 1%|▏ | 14896/1000000 [4:49:11<2866:01:10, 10.47s/it, lr=1e-5, step_loss=0.00897]
Steps: 1%|▏ | 14897/1000000 [4:49:18<2553:52:27, 9.33s/it, lr=1e-5, step_loss=0.00897][RANK-0]: Step: [14897], local_loss=0.012420487590134144, train_loss=0.01929599791765213, time_cost=1.9348959922790527
+
Steps: 1%|▏ | 14897/1000000 [4:49:18<2553:52:27, 9.33s/it, lr=1e-5, step_loss=0.0124]
Steps: 1%|▏ | 14898/1000000 [4:49:24<2231:18:45, 8.15s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [14898], local_loss=0.07140347361564636, train_loss=0.03905991464853287, time_cost=1.2125623226165771
+
Steps: 1%|▏ | 14898/1000000 [4:49:24<2231:18:45, 8.15s/it, lr=1e-5, step_loss=0.0714]
Steps: 1%|▏ | 14899/1000000 [4:49:28<1908:27:10, 6.97s/it, lr=1e-5, step_loss=0.0714][RANK-0]: Step: [14899], local_loss=0.08165682107210159, train_loss=0.030667323619127274, time_cost=1.5373280048370361
+
Steps: 1%|▏ | 14899/1000000 [4:49:28<1908:27:10, 6.97s/it, lr=1e-5, step_loss=0.0817]
Steps: 1%|▏ | 14900/1000000 [4:49:44<2638:58:39, 9.64s/it, lr=1e-5, step_loss=0.0817][RANK-0]: Step: [14900], local_loss=0.19004124402999878, train_loss=0.09126138687133789, time_cost=9.005156517028809
+
Steps: 1%|▏ | 14900/1000000 [4:49:44<2638:58:39, 9.64s/it, lr=1e-5, step_loss=0.19]
Steps: 1%|▏ | 14901/1000000 [4:49:51<2486:10:20, 9.09s/it, lr=1e-5, step_loss=0.19][RANK-0]: Step: [14901], local_loss=0.009863552637398243, train_loss=0.02184509113430977, time_cost=4.028268098831177
+
Steps: 1%|▏ | 14901/1000000 [4:49:51<2486:10:20, 9.09s/it, lr=1e-5, step_loss=0.00986]
Steps: 1%|▏ | 14902/1000000 [4:50:05<2821:20:05, 10.31s/it, lr=1e-5, step_loss=0.00986][RANK-0]: Step: [14902], local_loss=0.044554226100444794, train_loss=0.05196381360292435, time_cost=4.623145341873169
+
Steps: 1%|▏ | 14902/1000000 [4:50:05<2821:20:05, 10.31s/it, lr=1e-5, step_loss=0.0446]
Steps: 1%|▏ | 14903/1000000 [4:50:19<3159:11:46, 11.55s/it, lr=1e-5, step_loss=0.0446][RANK-0]: Step: [14903], local_loss=0.014319363050162792, train_loss=0.0698741152882576, time_cost=4.879827976226807
+
Steps: 1%|▏ | 14903/1000000 [4:50:19<3159:11:46, 11.55s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%|▏ | 14904/1000000 [4:50:25<2693:16:46, 9.84s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [14904], local_loss=0.004913366865366697, train_loss=0.015425808727741241, time_cost=1.9921445846557617
+
Steps: 1%|▏ | 14904/1000000 [4:50:25<2693:16:46, 9.84s/it, lr=1e-5, step_loss=0.00491]
Steps: 1%|▏ | 14905/1000000 [4:50:33<2574:00:25, 9.41s/it, lr=1e-5, step_loss=0.00491][RANK-0]: Step: [14905], local_loss=0.011673720553517342, train_loss=0.04447215795516968, time_cost=2.658564805984497
+
Steps: 1%|▏ | 14905/1000000 [4:50:33<2574:00:25, 9.41s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%|▏ | 14906/1000000 [4:50:41<2452:43:21, 8.96s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [14906], local_loss=0.007256001699715853, train_loss=0.05185055732727051, time_cost=6.359926462173462
+
Steps: 1%|▏ | 14906/1000000 [4:50:41<2452:43:21, 8.96s/it, lr=1e-5, step_loss=0.00726]
Steps: 1%|▏ | 14907/1000000 [4:50:54<2749:47:28, 10.05s/it, lr=1e-5, step_loss=0.00726][RANK-0]: Step: [14907], local_loss=0.025612955912947655, train_loss=0.029528824612498283, time_cost=3.260500431060791
+
Steps: 1%|▏ | 14907/1000000 [4:50:54<2749:47:28, 10.05s/it, lr=1e-5, step_loss=0.0256]
Steps: 1%|▏ | 14908/1000000 [4:51:08<3068:12:25, 11.21s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [14908], local_loss=0.009396294131875038, train_loss=0.01834207773208618, time_cost=1.266603708267212
+
Steps: 1%|▏ | 14908/1000000 [4:51:08<3068:12:25, 11.21s/it, lr=1e-5, step_loss=0.0094]
Steps: 1%|▏ | 14909/1000000 [4:51:14<2700:12:26, 9.87s/it, lr=1e-5, step_loss=0.0094][RANK-0]: Step: [14909], local_loss=0.0065124304965138435, train_loss=0.03529401123523712, time_cost=2.8514115810394287
+
Steps: 1%|▏ | 14909/1000000 [4:51:14<2700:12:26, 9.87s/it, lr=1e-5, step_loss=0.00651]
Steps: 1%|▏ | 14910/1000000 [4:51:23<2620:38:36, 9.58s/it, lr=1e-5, step_loss=0.00651][RANK-0]: Step: [14910], local_loss=0.03219713270664215, train_loss=0.057958174496889114, time_cost=2.807530164718628
+
Steps: 1%|▏ | 14910/1000000 [4:51:23<2620:38:36, 9.58s/it, lr=1e-5, step_loss=0.0322]
Steps: 1%|▏ | 14911/1000000 [4:51:30<2405:32:38, 8.79s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [14911], local_loss=0.008725691586732864, train_loss=0.04367058724164963, time_cost=1.4010860919952393
+
Steps: 1%|▏ | 14911/1000000 [4:51:30<2405:32:38, 8.79s/it, lr=1e-5, step_loss=0.00873]
Steps: 1%|▏ | 14912/1000000 [4:51:36<2119:30:23, 7.75s/it, lr=1e-5, step_loss=0.00873][RANK-0]: Step: [14912], local_loss=0.029493872076272964, train_loss=0.03331279754638672, time_cost=2.188317060470581
+
Steps: 1%|▏ | 14912/1000000 [4:51:36<2119:30:23, 7.75s/it, lr=1e-5, step_loss=0.0295]
Steps: 1%|▏ | 14913/1000000 [4:51:51<2749:19:03, 10.05s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [14913], local_loss=0.0455271378159523, train_loss=0.022016607224941254, time_cost=3.363168954849243
+
Steps: 1%|▏ | 14913/1000000 [4:51:51<2749:19:03, 10.05s/it, lr=1e-5, step_loss=0.0455]
Steps: 1%|▏ | 14914/1000000 [4:52:02<2793:59:48, 10.21s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [14914], local_loss=0.017495330423116684, train_loss=0.013129323720932007, time_cost=4.987297534942627
+
Steps: 1%|▏ | 14914/1000000 [4:52:02<2793:59:48, 10.21s/it, lr=1e-5, step_loss=0.0175]
Steps: 1%|▏ | 14915/1000000 [4:52:12<2770:56:06, 10.13s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [14915], local_loss=0.01388215646147728, train_loss=0.05900370329618454, time_cost=4.380313158035278
+
Steps: 1%|▏ | 14915/1000000 [4:52:12<2770:56:06, 10.13s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%|▏ | 14916/1000000 [4:52:26<3151:05:09, 11.52s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [14916], local_loss=0.009191063232719898, train_loss=0.05086424946784973, time_cost=11.204217433929443
+
Steps: 1%|▏ | 14916/1000000 [4:52:26<3151:05:09, 11.52s/it, lr=1e-5, step_loss=0.00919]
Steps: 1%|▏ | 14917/1000000 [4:52:32<2648:13:40, 9.68s/it, lr=1e-5, step_loss=0.00919][RANK-0]: Step: [14917], local_loss=0.06794412434101105, train_loss=0.021165715530514717, time_cost=1.9440968036651611
+
Steps: 1%|▏ | 14917/1000000 [4:52:32<2648:13:40, 9.68s/it, lr=1e-5, step_loss=0.0679]
Steps: 1%|▏ | 14918/1000000 [4:52:41<2651:57:33, 9.69s/it, lr=1e-5, step_loss=0.0679][RANK-0]: Step: [14918], local_loss=0.009100977331399918, train_loss=0.09711330384016037, time_cost=1.200270414352417
+
Steps: 1%|▏ | 14918/1000000 [4:52:41<2651:57:33, 9.69s/it, lr=1e-5, step_loss=0.0091]
Steps: 1%|▏ | 14919/1000000 [4:52:46<2268:12:22, 8.29s/it, lr=1e-5, step_loss=0.0091][RANK-0]: Step: [14919], local_loss=0.13404610753059387, train_loss=0.040609899908304214, time_cost=2.076732873916626
+
Steps: 1%|▏ | 14919/1000000 [4:52:46<2268:12:22, 8.29s/it, lr=1e-5, step_loss=0.134]
Steps: 1%|▏ | 14920/1000000 [4:52:53<2163:25:09, 7.91s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [14920], local_loss=0.017527960240840912, train_loss=0.01893514394760132, time_cost=3.3025062084198
+
Steps: 1%|▏ | 14920/1000000 [4:52:53<2163:25:09, 7.91s/it, lr=1e-5, step_loss=0.0175]
Steps: 1%|▏ | 14921/1000000 [4:53:07<2635:42:57, 9.63s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [14921], local_loss=0.989104688167572, train_loss=0.25178465247154236, time_cost=3.6236562728881836
+
Steps: 1%|▏ | 14921/1000000 [4:53:07<2635:42:57, 9.63s/it, lr=1e-5, step_loss=0.989]
Steps: 1%|▏ | 14922/1000000 [4:53:12<2263:40:09, 8.27s/it, lr=1e-5, step_loss=0.989][RANK-0]: Step: [14922], local_loss=0.005381550639867783, train_loss=0.03906988352537155, time_cost=2.274428129196167
+
Steps: 1%|▏ | 14922/1000000 [4:53:12<2263:40:09, 8.27s/it, lr=1e-5, step_loss=0.00538]
Steps: 1%|▏ | 14923/1000000 [4:53:20<2241:50:00, 8.19s/it, lr=1e-5, step_loss=0.00538][RANK-0]: Step: [14923], local_loss=0.06275425851345062, train_loss=0.07923828065395355, time_cost=2.518515110015869
+
Steps: 1%|▏ | 14923/1000000 [4:53:20<2241:50:00, 8.19s/it, lr=1e-5, step_loss=0.0628]
Steps: 1%|▏ | 14924/1000000 [4:53:31<2475:14:00, 9.05s/it, lr=1e-5, step_loss=0.0628][RANK-0]: Step: [14924], local_loss=0.022150453180074692, train_loss=0.03947938233613968, time_cost=1.2339355945587158
+
Steps: 1%|▏ | 14924/1000000 [4:53:31<2475:14:00, 9.05s/it, lr=1e-5, step_loss=0.0222]
Steps: 1%|▏ | 14925/1000000 [4:53:42<2628:14:40, 9.61s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [14925], local_loss=0.05998176336288452, train_loss=0.031015988439321518, time_cost=1.931788682937622
+
Steps: 1%|▏ | 14925/1000000 [4:53:42<2628:14:40, 9.61s/it, lr=1e-5, step_loss=0.06]
Steps: 1%|▏ | 14926/1000000 [4:53:47<2201:53:16, 8.05s/it, lr=1e-5, step_loss=0.06][RANK-0]: Step: [14926], local_loss=0.008434812538325787, train_loss=0.056858599185943604, time_cost=1.7905807495117188
+
Steps: 1%|▏ | 14926/1000000 [4:53:47<2201:53:16, 8.05s/it, lr=1e-5, step_loss=0.00843]
Steps: 1%|▏ | 14927/1000000 [4:54:02<2830:13:38, 10.34s/it, lr=1e-5, step_loss=0.00843][RANK-0]: Step: [14927], local_loss=0.07816984504461288, train_loss=35.160770416259766, time_cost=4.850481033325195
+
Steps: 1%|▏ | 14927/1000000 [4:54:02<2830:13:38, 10.34s/it, lr=1e-5, step_loss=0.0782]
Steps: 1%|▏ | 14928/1000000 [4:54:10<2618:14:20, 9.57s/it, lr=1e-5, step_loss=0.0782][RANK-0]: Step: [14928], local_loss=0.014941250905394554, train_loss=0.020625313743948936, time_cost=1.1936264038085938
+
Steps: 1%|▏ | 14928/1000000 [4:54:10<2618:14:20, 9.57s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%|▏ | 14929/1000000 [4:54:17<2396:11:43, 8.76s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [14929], local_loss=0.5079153776168823, train_loss=0.14052200317382812, time_cost=1.2382590770721436
+
Steps: 1%|▏ | 14929/1000000 [4:54:17<2396:11:43, 8.76s/it, lr=1e-5, step_loss=0.508]
Steps: 1%|▏ | 14930/1000000 [4:54:27<2545:23:16, 9.30s/it, lr=1e-5, step_loss=0.508][RANK-0]: Step: [14930], local_loss=0.01729588769376278, train_loss=0.023306023329496384, time_cost=2.6332755088806152
+
Steps: 1%|▏ | 14930/1000000 [4:54:27<2545:23:16, 9.30s/it, lr=1e-5, step_loss=0.0173]
Steps: 1%|▏ | 14931/1000000 [4:54:35<2415:18:37, 8.83s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [14931], local_loss=0.00836565438657999, train_loss=0.01968005858361721, time_cost=3.4842073917388916
+
Steps: 1%|▏ | 14931/1000000 [4:54:35<2415:18:37, 8.83s/it, lr=1e-5, step_loss=0.00837]
Steps: 1%|▏ | 14932/1000000 [4:54:41<2181:55:25, 7.97s/it, lr=1e-5, step_loss=0.00837][RANK-0]: Step: [14932], local_loss=0.06856310367584229, train_loss=0.032497040927410126, time_cost=4.563480615615845
+
Steps: 1%|▏ | 14932/1000000 [4:54:41<2181:55:25, 7.97s/it, lr=1e-5, step_loss=0.0686]
Steps: 1%|▏ | 14933/1000000 [4:54:46<1949:15:21, 7.12s/it, lr=1e-5, step_loss=0.0686][RANK-0]: Step: [14933], local_loss=0.04252425208687782, train_loss=0.027926165610551834, time_cost=2.0531885623931885
+
Steps: 1%|▏ | 14933/1000000 [4:54:46<1949:15:21, 7.12s/it, lr=1e-5, step_loss=0.0425]
Steps: 1%|▏ | 14934/1000000 [4:54:54<2018:46:24, 7.38s/it, lr=1e-5, step_loss=0.0425][RANK-0]: Step: [14934], local_loss=0.012267474085092545, train_loss=0.015946129336953163, time_cost=6.782374143600464
+
Steps: 1%|▏ | 14934/1000000 [4:54:54<2018:46:24, 7.38s/it, lr=1e-5, step_loss=0.0123]
Steps: 1%|▏ | 14935/1000000 [4:54:59<1827:20:35, 6.68s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [14935], local_loss=0.036382049322128296, train_loss=0.041997164487838745, time_cost=3.8158047199249268
+
Steps: 1%|▏ | 14935/1000000 [4:54:59<1827:20:35, 6.68s/it, lr=1e-5, step_loss=0.0364]
Steps: 1%|▏ | 14936/1000000 [4:55:04<1706:02:30, 6.23s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [14936], local_loss=0.020432639867067337, train_loss=0.013049746863543987, time_cost=4.370388746261597
+
Steps: 1%|▏ | 14936/1000000 [4:55:04<1706:02:30, 6.23s/it, lr=1e-5, step_loss=0.0204]
Steps: 1%|▏ | 14937/1000000 [4:55:13<1872:46:34, 6.84s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [14937], local_loss=0.12094493210315704, train_loss=0.05454851686954498, time_cost=4.454131364822388
+
Steps: 1%|▏ | 14937/1000000 [4:55:13<1872:46:34, 6.84s/it, lr=1e-5, step_loss=0.121]
Steps: 1%|▏ | 14938/1000000 [4:55:25<2331:25:48, 8.52s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [14938], local_loss=0.24203114211559296, train_loss=0.04931742697954178, time_cost=1.9732308387756348
+
Steps: 1%|▏ | 14938/1000000 [4:55:25<2331:25:48, 8.52s/it, lr=1e-5, step_loss=0.242]
Steps: 1%|▏ | 14939/1000000 [4:55:33<2288:57:22, 8.37s/it, lr=1e-5, step_loss=0.242][RANK-0]: Step: [14939], local_loss=0.006249512080103159, train_loss=0.15129759907722473, time_cost=3.748800754547119
+
Steps: 1%|▏ | 14939/1000000 [4:55:33<2288:57:22, 8.37s/it, lr=1e-5, step_loss=0.00625]
Steps: 1%|▏ | 14940/1000000 [4:55:46<2686:38:39, 9.82s/it, lr=1e-5, step_loss=0.00625][RANK-0]: Step: [14940], local_loss=0.016809917986392975, train_loss=0.04817280173301697, time_cost=1.2977352142333984
+
Steps: 1%|▏ | 14940/1000000 [4:55:46<2686:38:39, 9.82s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%|▏ | 14941/1000000 [4:55:58<2813:22:42, 10.28s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [14941], local_loss=0.008415818214416504, train_loss=0.016546715050935745, time_cost=3.2643468379974365
+
Steps: 1%|▏ | 14941/1000000 [4:55:58<2813:22:42, 10.28s/it, lr=1e-5, step_loss=0.00842]
Steps: 1%|▏ | 14942/1000000 [4:56:04<2507:07:51, 9.16s/it, lr=1e-5, step_loss=0.00842][RANK-0]: Step: [14942], local_loss=0.027566513046622276, train_loss=0.030062297359108925, time_cost=2.958451747894287
+
Steps: 1%|▏ | 14942/1000000 [4:56:04<2507:07:51, 9.16s/it, lr=1e-5, step_loss=0.0276]
Steps: 1%|▏ | 14943/1000000 [4:56:16<2746:30:43, 10.04s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [14943], local_loss=0.009646369144320488, train_loss=0.14589770138263702, time_cost=4.544978618621826
+
Steps: 1%|▏ | 14943/1000000 [4:56:16<2746:30:43, 10.04s/it, lr=1e-5, step_loss=0.00965]
Steps: 1%|▏ | 14944/1000000 [4:56:22<2353:33:36, 8.60s/it, lr=1e-5, step_loss=0.00965][RANK-0]: Step: [14944], local_loss=0.005416546948254108, train_loss=0.09972669184207916, time_cost=2.234903335571289
+
Steps: 1%|▏ | 14944/1000000 [4:56:22<2353:33:36, 8.60s/it, lr=1e-5, step_loss=0.00542]
Steps: 1%|▏ | 14945/1000000 [4:56:27<2076:35:09, 7.59s/it, lr=1e-5, step_loss=0.00542][RANK-0]: Step: [14945], local_loss=0.08491826802492142, train_loss=0.06360266357660294, time_cost=2.6016452312469482
+
Steps: 1%|▏ | 14945/1000000 [4:56:27<2076:35:09, 7.59s/it, lr=1e-5, step_loss=0.0849]
Steps: 1%|▏ | 14946/1000000 [4:56:40<2551:49:47, 9.33s/it, lr=1e-5, step_loss=0.0849][RANK-0]: Step: [14946], local_loss=0.02817263826727867, train_loss=0.06260805577039719, time_cost=1.2272987365722656
+
Steps: 1%|▏ | 14946/1000000 [4:56:40<2551:49:47, 9.33s/it, lr=1e-5, step_loss=0.0282]
Steps: 1%|▏ | 14947/1000000 [4:56:51<2672:32:28, 9.77s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [14947], local_loss=0.00894880574196577, train_loss=0.023342406377196312, time_cost=3.4099974632263184
+
Steps: 1%|▏ | 14947/1000000 [4:56:51<2672:32:28, 9.77s/it, lr=1e-5, step_loss=0.00895]
Steps: 1%|▏ | 14948/1000000 [4:57:05<3042:50:21, 11.12s/it, lr=1e-5, step_loss=0.00895][RANK-0]: Step: [14948], local_loss=0.04030827432870865, train_loss=0.04945296049118042, time_cost=4.3191444873809814
+
Steps: 1%|▏ | 14948/1000000 [4:57:05<3042:50:21, 11.12s/it, lr=1e-5, step_loss=0.0403]
Steps: 1%|▏ | 14949/1000000 [4:57:18<3186:09:02, 11.64s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [14949], local_loss=0.007608009502291679, train_loss=0.08632191270589828, time_cost=3.775759220123291
+
Steps: 1%|▏ | 14949/1000000 [4:57:18<3186:09:02, 11.64s/it, lr=1e-5, step_loss=0.00761]
Steps: 1%|▏ | 14950/1000000 [4:57:31<3267:10:24, 11.94s/it, lr=1e-5, step_loss=0.00761][RANK-0]: Step: [14950], local_loss=0.03574813902378082, train_loss=0.030340904369950294, time_cost=3.2026710510253906
+
Steps: 1%|▏ | 14950/1000000 [4:57:31<3267:10:24, 11.94s/it, lr=1e-5, step_loss=0.0357]
Steps: 1%|▏ | 14951/1000000 [4:57:38<2880:29:44, 10.53s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [14951], local_loss=0.02318013831973076, train_loss=0.018386226147413254, time_cost=6.159324407577515
+
Steps: 1%|▏ | 14951/1000000 [4:57:38<2880:29:44, 10.53s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%|▏ | 14952/1000000 [4:57:44<2498:59:50, 9.13s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [14952], local_loss=0.01058135461062193, train_loss=15.130525588989258, time_cost=3.077847957611084
+
Steps: 1%|▏ | 14952/1000000 [4:57:44<2498:59:50, 9.13s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%|▏ | 14953/1000000 [4:57:51<2336:38:14, 8.54s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [14953], local_loss=0.1592450588941574, train_loss=0.10258179157972336, time_cost=2.5939691066741943
+
Steps: 1%|▏ | 14953/1000000 [4:57:51<2336:38:14, 8.54s/it, lr=1e-5, step_loss=0.159]
Steps: 1%|▏ | 14954/1000000 [4:57:55<1979:58:28, 7.24s/it, lr=1e-5, step_loss=0.159][RANK-0]: Step: [14954], local_loss=0.045372866094112396, train_loss=0.02127995528280735, time_cost=1.2246425151824951
+
Steps: 1%|▏ | 14954/1000000 [4:57:55<1979:58:28, 7.24s/it, lr=1e-5, step_loss=0.0454]
Steps: 1%|▏ | 14955/1000000 [4:58:06<2276:15:11, 8.32s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [14955], local_loss=0.027692899107933044, train_loss=0.021026823669672012, time_cost=3.3313424587249756
+
Steps: 1%|▏ | 14955/1000000 [4:58:06<2276:15:11, 8.32s/it, lr=1e-5, step_loss=0.0277]
Steps: 1%|▏ | 14956/1000000 [4:58:18<2546:54:36, 9.31s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [14956], local_loss=0.008913141675293446, train_loss=0.045163676142692566, time_cost=8.511451482772827
+
Steps: 1%|▏ | 14956/1000000 [4:58:18<2546:54:36, 9.31s/it, lr=1e-5, step_loss=0.00891]
Steps: 1%|▏ | 14957/1000000 [4:58:31<2890:34:35, 10.56s/it, lr=1e-5, step_loss=0.00891][RANK-0]: Step: [14957], local_loss=0.3675192594528198, train_loss=0.21551942825317383, time_cost=5.136701345443726
+
Steps: 1%|▏ | 14957/1000000 [4:58:31<2890:34:35, 10.56s/it, lr=1e-5, step_loss=0.368]
Steps: 1%|▏ | 14958/1000000 [4:58:37<2459:15:26, 8.99s/it, lr=1e-5, step_loss=0.368][RANK-0]: Step: [14958], local_loss=0.0027796360664069653, train_loss=0.01196802593767643, time_cost=1.7749042510986328
+
Steps: 1%|▏ | 14958/1000000 [4:58:37<2459:15:26, 8.99s/it, lr=1e-5, step_loss=0.00278]
Steps: 1%|▏ | 14959/1000000 [4:58:48<2651:08:09, 9.69s/it, lr=1e-5, step_loss=0.00278][RANK-0]: Step: [14959], local_loss=0.006381568964570761, train_loss=0.10289501398801804, time_cost=2.1055212020874023
+
Steps: 1%|▏ | 14959/1000000 [4:58:48<2651:08:09, 9.69s/it, lr=1e-5, step_loss=0.00638]
Steps: 1%|▏ | 14960/1000000 [4:58:54<2371:02:58, 8.67s/it, lr=1e-5, step_loss=0.00638][RANK-0]: Step: [14960], local_loss=0.09144099801778793, train_loss=8.827804565429688, time_cost=1.2809526920318604
+
Steps: 1%|▏ | 14960/1000000 [4:58:54<2371:02:58, 8.67s/it, lr=1e-5, step_loss=0.0914]
Steps: 1%|▏ | 14961/1000000 [4:59:05<2552:45:48, 9.33s/it, lr=1e-5, step_loss=0.0914][RANK-0]: Step: [14961], local_loss=0.01751765049993992, train_loss=0.03839963674545288, time_cost=1.2013821601867676
+
Steps: 1%|▏ | 14961/1000000 [4:59:05<2552:45:48, 9.33s/it, lr=1e-5, step_loss=0.0175]
Steps: 1%|▏ | 14962/1000000 [4:59:14<2548:30:02, 9.31s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [14962], local_loss=0.005634406581521034, train_loss=0.010674634017050266, time_cost=1.778726577758789
+
Steps: 1%|▏ | 14962/1000000 [4:59:14<2548:30:02, 9.31s/it, lr=1e-5, step_loss=0.00563]
Steps: 1%|▏ | 14963/1000000 [4:59:23<2528:32:45, 9.24s/it, lr=1e-5, step_loss=0.00563][RANK-0]: Step: [14963], local_loss=0.018023649230599403, train_loss=0.021402690559625626, time_cost=2.508854866027832
+
Steps: 1%|▏ | 14963/1000000 [4:59:23<2528:32:45, 9.24s/it, lr=1e-5, step_loss=0.018]
Steps: 1%|▏ | 14964/1000000 [4:59:36<2803:11:14, 10.24s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [14964], local_loss=0.014684857800602913, train_loss=0.07730332762002945, time_cost=1.2960100173950195
+
Steps: 1%|▏ | 14964/1000000 [4:59:36<2803:11:14, 10.24s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%|▏ | 14965/1000000 [4:59:43<2561:54:21, 9.36s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [14965], local_loss=0.053594063967466354, train_loss=0.02234571799635887, time_cost=2.924541473388672
+
Steps: 1%|▏ | 14965/1000000 [4:59:43<2561:54:21, 9.36s/it, lr=1e-5, step_loss=0.0536]
Steps: 1%|▏ | 14966/1000000 [4:59:49<2296:02:21, 8.39s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [14966], local_loss=0.01851709373295307, train_loss=0.14847558736801147, time_cost=3.299039125442505
+
Steps: 1%|▏ | 14966/1000000 [4:59:49<2296:02:21, 8.39s/it, lr=1e-5, step_loss=0.0185]
Steps: 1%|▏ | 14967/1000000 [5:00:01<2523:23:58, 9.22s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [14967], local_loss=0.03385856747627258, train_loss=0.043002352118492126, time_cost=2.7067456245422363
+
Steps: 1%|▏ | 14967/1000000 [5:00:01<2523:23:58, 9.22s/it, lr=1e-5, step_loss=0.0339]
Steps: 1%|▏ | 14968/1000000 [5:00:17<3132:08:53, 11.45s/it, lr=1e-5, step_loss=0.0339][RANK-0]: Step: [14968], local_loss=0.005345182493329048, train_loss=0.07653853297233582, time_cost=8.044683933258057
+
Steps: 1%|▏ | 14968/1000000 [5:00:17<3132:08:53, 11.45s/it, lr=1e-5, step_loss=0.00535]
Steps: 1%|▏ | 14969/1000000 [5:00:31<3326:28:42, 12.16s/it, lr=1e-5, step_loss=0.00535][RANK-0]: Step: [14969], local_loss=0.011761806905269623, train_loss=0.046815820038318634, time_cost=4.3030009269714355
+
Steps: 1%|▏ | 14969/1000000 [5:00:31<3326:28:42, 12.16s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%|▏ | 14970/1000000 [5:00:42<3261:14:03, 11.92s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [14970], local_loss=0.029513167217373848, train_loss=0.06903909891843796, time_cost=1.7578997611999512
+
Steps: 1%|▏ | 14970/1000000 [5:00:42<3261:14:03, 11.92s/it, lr=1e-5, step_loss=0.0295]
Steps: 1%|▏ | 14971/1000000 [5:00:56<3414:46:39, 12.48s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [14971], local_loss=0.022537769749760628, train_loss=0.0354018434882164, time_cost=4.771652698516846
+
Steps: 1%|▏ | 14971/1000000 [5:00:56<3414:46:39, 12.48s/it, lr=1e-5, step_loss=0.0225]
Steps: 1%|▏ | 14972/1000000 [5:01:01<2758:23:56, 10.08s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [14972], local_loss=0.006871397607028484, train_loss=0.012170154601335526, time_cost=1.6899082660675049
+
Steps: 1%|▏ | 14972/1000000 [5:01:01<2758:23:56, 10.08s/it, lr=1e-5, step_loss=0.00687]
Steps: 1%|▏ | 14973/1000000 [5:01:08<2531:45:11, 9.25s/it, lr=1e-5, step_loss=0.00687][RANK-0]: Step: [14973], local_loss=0.010027647018432617, train_loss=58.16704559326172, time_cost=3.421003580093384
+
Steps: 1%|▏ | 14973/1000000 [5:01:08<2531:45:11, 9.25s/it, lr=1e-5, step_loss=0.01]
Steps: 1%|▏ | 14974/1000000 [5:01:17<2518:33:20, 9.20s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [14974], local_loss=0.0604991689324379, train_loss=0.05613723397254944, time_cost=3.5785183906555176
+
Steps: 1%|▏ | 14974/1000000 [5:01:17<2518:33:20, 9.20s/it, lr=1e-5, step_loss=0.0605]
Steps: 1%|▏ | 14975/1000000 [5:01:30<2803:39:59, 10.25s/it, lr=1e-5, step_loss=0.0605][RANK-0]: Step: [14975], local_loss=0.03658410161733627, train_loss=0.035375092178583145, time_cost=2.8893849849700928
+
Steps: 1%|▏ | 14975/1000000 [5:01:30<2803:39:59, 10.25s/it, lr=1e-5, step_loss=0.0366]
Steps: 1%|▏ | 14976/1000000 [5:01:35<2394:34:49, 8.75s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [14976], local_loss=0.01704169437289238, train_loss=0.1769600361585617, time_cost=2.260026216506958
+
Steps: 1%|▏ | 14976/1000000 [5:01:35<2394:34:49, 8.75s/it, lr=1e-5, step_loss=0.017]
Steps: 1%|▏ | 14977/1000000 [5:01:43<2354:59:37, 8.61s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [14977], local_loss=0.015411701053380966, train_loss=0.05635450780391693, time_cost=7.292575836181641
+
Steps: 1%|▏ | 14977/1000000 [5:01:43<2354:59:37, 8.61s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%|▏ | 14978/1000000 [5:01:52<2364:39:58, 8.64s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [14978], local_loss=0.04235377162694931, train_loss=0.06105021387338638, time_cost=4.598029613494873
+
+Steps: 1%|▏ | 14978/1000000 [5:01:52<2364:39:58, 8.64s/it, lr=1e-5, step_loss=0.0424]
+[RANK-0]: Step: [14979], local_loss=0.0157538503408432, train_loss=0.05819885805249214, time_cost=4.056572914123535
+[... duplicated tqdm re-renders omitted; steps 14980-14999 continue at lr=1e-5 and ~7-12 s/it, with local_loss mostly in 0.004-0.18 and outliers 0.992 (step 14981) and 0.628 (step 14995) ...]
+[RANK-0]: Step: [15000], local_loss=0.010953709483146667, train_loss=0.1530054658651352, time_cost=1.2270545959472656
+Steps: 2%|▏ | 15000/1000000 [5:05:15<2465:38:18, 9.01s/it, lr=1e-5, step_loss=0.011]
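In the `[RANK-0]` lines above, `local_loss` is the loss on rank 0's own micro-batch while `train_loss` is the mean over all data-parallel ranks, which is why the two values can differ sharply at the same step. A minimal sketch of this logging pattern with HF Accelerate follows; this is an assumption about the training script (which is not part of this log), and `log_step`, `loss`, and `time_cost` are placeholder names:

```python
# Hedged sketch of the per-step "[RANK-0]: Step: [...]" logging seen above.
# Assumes the common HF Accelerate pattern; the actual script may differ.
import logging
import torch
from accelerate import Accelerator

logger = logging.getLogger(__name__)
accelerator = Accelerator()

def log_step(step: int, loss: torch.Tensor, time_cost: float) -> None:
    local_loss = loss.detach().reshape(1)
    # Cross-rank mean: every rank contributes its micro-batch loss.
    train_loss = accelerator.gather(local_loss).mean()
    if accelerator.is_main_process:  # only rank 0 writes the log line
        logger.info(
            f"[RANK-0]: Step: [{step}], local_loss={local_loss.item()}, "
            f"train_loss={train_loss.item()}, time_cost={time_cost}"
        )
```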
+09/18/2024 14:29:18 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1/checkpoint-15000
+09/18/2024 14:29:18 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-18 14:29:18,474] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-18 14:29:18,503] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-18 14:29:18,504] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 14:29:35,249] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-18 14:29:35,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-18 14:30:07,892] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:07,893] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:07,893] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:10,786] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:10,787] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:10,787] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:10,859] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:10,859] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:10,859] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:11,258] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:11,315] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:11,316] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:11,571] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:11,572] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:11,572] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:11,646] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:11,647] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:11,647] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:11,722] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:11,722] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:11,722] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 14:30:11,811] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-18 14:30:11,812] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-18 14:30:11,812] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/18/2024 14:30:11 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/pytorch_model
+{'norm_num_groups', 'dropout', 'use_additional_conditions'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/model/diffusion_pytorch_model.safetensors
+09/18/2024 14:31:18 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/scheduler.bin
+09/18/2024 14:31:18 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/sampler.bin
+09/18/2024 14:31:18 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-15000/random_states_0.pkl
+09/18/2024 14:31:18 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1/checkpoint-15000
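The block above is the checkpoint path at step 15000: Accelerate hands the model to DeepSpeed, which writes `mp_rank_00_model_states.pt` plus one bf16 ZeRO optimizer shard per rank, and Accelerate then persists the scheduler, dataloader sampler, and RNG states (the `model/` and `model_ema/` safetensors saves are typically produced by a registered save hook). A minimal sketch of the single call that triggers this whole sequence, assuming the standard `Accelerator.save_state` flow (the surrounding training script is not shown in this log):

```python
# Hedged sketch: one save_state() call produces the entire log block above
# (DeepSpeed model/optimizer shards, scheduler.bin, sampler.bin, random_states).
import os
from accelerate import Accelerator

accelerator = Accelerator()  # DeepSpeed config is supplied via `accelerate launch`

def save_checkpoint(output_dir: str, global_step: int) -> str:
    save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
    accelerator.save_state(save_path)  # logs "Saving current state to <save_path>"
    return save_path

# e.g. the save at step 15000 above would correspond to:
# save_checkpoint("/home/save_dir/runs/allinpaint_stage1", 15000)
```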
+
+[RANK-0]: Step: [15001], local_loss=0.0664018914103508, train_loss=0.04341889172792435, time_cost=1.1887142658233643
+Steps: 2%|▏ | 15001/1000000 [5:07:22<12112:19:51, 44.27s/it, lr=1e-5, step_loss=0.0664]
+[... duplicated tqdm re-renders omitted; steps 15002-15169 continue at lr=1e-5, settling back to ~6-13 s/it after the checkpoint stall at step 15001; local_loss stays mostly in 0.004-0.2 with spikes near 1.0 at steps 15014, 15116, 15119, 15127, and 15140, while train_loss shows outliers of 12-31 at steps 15007, 15018, 15039, 15077, 15086, 15134, 15160, and 15166 ...]
+[RANK-0]: Step: [15170], local_loss=0.008962679654359818, train_loss=0.027034275233745575, time_cost=3.6095755100250244
+Steps: 2%|▏ | 15170/1000000 [5:32:49<2532:01:19, 9.26s/it, lr=1e-5, step_loss=0.00896]
Steps: 2%|▏ | 15171/1000000 [5:33:00<2651:02:56, 9.69s/it, lr=1e-5, step_loss=0.00896][RANK-0]: Step: [15171], local_loss=0.028322404250502586, train_loss=0.0288159791380167, time_cost=1.8995249271392822
+
Steps: 2%|▏ | 15171/1000000 [5:33:00<2651:02:56, 9.69s/it, lr=1e-5, step_loss=0.0283]
Steps: 2%|▏ | 15172/1000000 [5:33:11<2735:45:34, 10.00s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [15172], local_loss=0.00914264190942049, train_loss=0.035716570913791656, time_cost=1.44366455078125
+
Steps: 2%|▏ | 15172/1000000 [5:33:11<2735:45:34, 10.00s/it, lr=1e-5, step_loss=0.00914]
Steps: 2%|▏ | 15173/1000000 [5:33:20<2687:39:43, 9.82s/it, lr=1e-5, step_loss=0.00914][RANK-0]: Step: [15173], local_loss=0.006927039939910173, train_loss=0.04654480889439583, time_cost=3.456570863723755
+
Steps: 2%|▏ | 15173/1000000 [5:33:20<2687:39:43, 9.82s/it, lr=1e-5, step_loss=0.00693]
Steps: 2%|▏ | 15174/1000000 [5:33:30<2665:59:21, 9.75s/it, lr=1e-5, step_loss=0.00693][RANK-0]: Step: [15174], local_loss=0.024618180468678474, train_loss=0.027984213083982468, time_cost=2.020331859588623
+
Steps: 2%|▏ | 15174/1000000 [5:33:30<2665:59:21, 9.75s/it, lr=1e-5, step_loss=0.0246]
Steps: 2%|▏ | 15175/1000000 [5:33:38<2520:03:41, 9.21s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [15175], local_loss=0.048460703343153, train_loss=0.10409383475780487, time_cost=3.8115010261535645
+
Steps: 2%|▏ | 15175/1000000 [5:33:38<2520:03:41, 9.21s/it, lr=1e-5, step_loss=0.0485]
Steps: 2%|▏ | 15176/1000000 [5:33:51<2906:18:46, 10.62s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [15176], local_loss=0.045053452253341675, train_loss=0.05686505138874054, time_cost=1.4634225368499756
+
Steps: 2%|▏ | 15176/1000000 [5:33:51<2906:18:46, 10.62s/it, lr=1e-5, step_loss=0.0451]
Steps: 2%|▏ | 15177/1000000 [5:34:06<3213:47:39, 11.75s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [15177], local_loss=0.01458188146352768, train_loss=0.04402550682425499, time_cost=5.8519580364227295
+
Steps: 2%|▏ | 15177/1000000 [5:34:06<3213:47:39, 11.75s/it, lr=1e-5, step_loss=0.0146]
Steps: 2%|▏ | 15178/1000000 [5:34:13<2841:11:58, 10.39s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [15178], local_loss=0.00983906164765358, train_loss=0.036247655749320984, time_cost=3.515047550201416
+
Steps: 2%|▏ | 15178/1000000 [5:34:13<2841:11:58, 10.39s/it, lr=1e-5, step_loss=0.00984]
Steps: 2%|▏ | 15179/1000000 [5:34:24<2917:27:54, 10.66s/it, lr=1e-5, step_loss=0.00984][RANK-0]: Step: [15179], local_loss=0.009765232913196087, train_loss=0.15756183862686157, time_cost=8.34230089187622
+
Steps: 2%|▏ | 15179/1000000 [5:34:24<2917:27:54, 10.66s/it, lr=1e-5, step_loss=0.00977]
Steps: 2%|▏ | 15180/1000000 [5:34:39<3210:29:25, 11.74s/it, lr=1e-5, step_loss=0.00977][RANK-0]: Step: [15180], local_loss=0.03726229444146156, train_loss=0.02318856120109558, time_cost=2.9249556064605713
+
Steps: 2%|▏ | 15180/1000000 [5:34:39<3210:29:25, 11.74s/it, lr=1e-5, step_loss=0.0373]
Steps: 2%|▏ | 15181/1000000 [5:34:46<2852:42:58, 10.43s/it, lr=1e-5, step_loss=0.0373][RANK-0]: Step: [15181], local_loss=0.012715840712189674, train_loss=0.015527591109275818, time_cost=3.1409389972686768
+
Steps: 2%|▏ | 15181/1000000 [5:34:46<2852:42:58, 10.43s/it, lr=1e-5, step_loss=0.0127]
Steps: 2%|▏ | 15182/1000000 [5:34:58<2955:37:48, 10.80s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [15182], local_loss=0.016313951462507248, train_loss=0.06141190230846405, time_cost=6.7720983028411865
+
Steps: 2%|▏ | 15182/1000000 [5:34:58<2955:37:48, 10.80s/it, lr=1e-5, step_loss=0.0163]
Steps: 2%|▏ | 15183/1000000 [5:35:13<3367:36:43, 12.31s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [15183], local_loss=0.04331691935658455, train_loss=0.06086983159184456, time_cost=6.472963094711304
+
Steps: 2%|▏ | 15183/1000000 [5:35:13<3367:36:43, 12.31s/it, lr=1e-5, step_loss=0.0433]
Steps: 2%|▏ | 15184/1000000 [5:35:23<3116:34:01, 11.39s/it, lr=1e-5, step_loss=0.0433][RANK-0]: Step: [15184], local_loss=0.009478139691054821, train_loss=0.025729544460773468, time_cost=2.293891668319702
+
Steps: 2%|▏ | 15184/1000000 [5:35:23<3116:34:01, 11.39s/it, lr=1e-5, step_loss=0.00948]
Steps: 2%|▏ | 15185/1000000 [5:35:31<2881:45:00, 10.53s/it, lr=1e-5, step_loss=0.00948][RANK-0]: Step: [15185], local_loss=0.03810734674334526, train_loss=0.0990571603178978, time_cost=2.661994695663452
+
Steps: 2%|▏ | 15185/1000000 [5:35:31<2881:45:00, 10.53s/it, lr=1e-5, step_loss=0.0381]
Steps: 2%|▏ | 15186/1000000 [5:35:42<2877:08:42, 10.52s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [15186], local_loss=0.105421282351017, train_loss=0.035978659987449646, time_cost=1.6541736125946045
+
Steps: 2%|▏ | 15186/1000000 [5:35:42<2877:08:42, 10.52s/it, lr=1e-5, step_loss=0.105]
Steps: 2%|▏ | 15187/1000000 [5:35:54<2988:02:17, 10.92s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [15187], local_loss=0.02134859375655651, train_loss=0.0747358426451683, time_cost=3.0093085765838623
+
Steps: 2%|▏ | 15187/1000000 [5:35:54<2988:02:17, 10.92s/it, lr=1e-5, step_loss=0.0213]
Steps: 2%|▏ | 15188/1000000 [5:36:02<2797:24:50, 10.23s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [15188], local_loss=0.06758356094360352, train_loss=0.04358226805925369, time_cost=3.509063720703125
+
Steps: 2%|▏ | 15188/1000000 [5:36:02<2797:24:50, 10.23s/it, lr=1e-5, step_loss=0.0676]
Steps: 2%|▏ | 15189/1000000 [5:36:10<2581:07:26, 9.44s/it, lr=1e-5, step_loss=0.0676][RANK-0]: Step: [15189], local_loss=0.018805289641022682, train_loss=0.05665828660130501, time_cost=1.6696372032165527
+
Steps: 2%|▏ | 15189/1000000 [5:36:10<2581:07:26, 9.44s/it, lr=1e-5, step_loss=0.0188]
Steps: 2%|▏ | 15190/1000000 [5:36:23<2867:47:11, 10.48s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [15190], local_loss=0.04624297097325325, train_loss=0.03237428516149521, time_cost=4.420888423919678
+
Steps: 2%|▏ | 15190/1000000 [5:36:23<2867:47:11, 10.48s/it, lr=1e-5, step_loss=0.0462]
Steps: 2%|▏ | 15191/1000000 [5:36:28<2475:09:08, 9.05s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [15191], local_loss=0.03666863590478897, train_loss=0.013891342096030712, time_cost=1.6290006637573242
+
Steps: 2%|▏ | 15191/1000000 [5:36:28<2475:09:08, 9.05s/it, lr=1e-5, step_loss=0.0367]
Steps: 2%|▏ | 15192/1000000 [5:36:33<2140:53:43, 7.83s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [15192], local_loss=0.008287575095891953, train_loss=0.08476525545120239, time_cost=2.1823344230651855
+
Steps: 2%|▏ | 15192/1000000 [5:36:33<2140:53:43, 7.83s/it, lr=1e-5, step_loss=0.00829]
Steps: 2%|▏ | 15193/1000000 [5:36:43<2251:21:28, 8.23s/it, lr=1e-5, step_loss=0.00829][RANK-0]: Step: [15193], local_loss=0.3014282286167145, train_loss=0.06258010119199753, time_cost=1.2854886054992676
+
Steps: 2%|▏ | 15193/1000000 [5:36:43<2251:21:28, 8.23s/it, lr=1e-5, step_loss=0.301]
Steps: 2%|▏ | 15194/1000000 [5:36:58<2842:14:36, 10.39s/it, lr=1e-5, step_loss=0.301][RANK-0]: Step: [15194], local_loss=69.9351577758789, train_loss=8.76233196258545, time_cost=7.129266977310181
+
Steps: 2%|▏ | 15194/1000000 [5:36:58<2842:14:36, 10.39s/it, lr=1e-5, step_loss=69.9]
Steps: 2%|▏ | 15195/1000000 [5:37:10<2988:17:13, 10.92s/it, lr=1e-5, step_loss=69.9][RANK-0]: Step: [15195], local_loss=0.0135454460978508, train_loss=0.04715046286582947, time_cost=10.099836111068726
+
Steps: 2%|▏ | 15195/1000000 [5:37:10<2988:17:13, 10.92s/it, lr=1e-5, step_loss=0.0135]
Steps: 2%|▏ | 15196/1000000 [5:37:19<2822:01:30, 10.32s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [15196], local_loss=0.08087650686502457, train_loss=0.10107819736003876, time_cost=1.2043519020080566
+
Steps: 2%|▏ | 15196/1000000 [5:37:19<2822:01:30, 10.32s/it, lr=1e-5, step_loss=0.0809]
Steps: 2%|▏ | 15197/1000000 [5:37:24<2413:34:12, 8.82s/it, lr=1e-5, step_loss=0.0809][RANK-0]: Step: [15197], local_loss=0.2052057534456253, train_loss=0.05822635814547539, time_cost=2.024273633956909
+
Steps: 2%|▏ | 15197/1000000 [5:37:24<2413:34:12, 8.82s/it, lr=1e-5, step_loss=0.205]
Steps: 2%|▏ | 15198/1000000 [5:37:39<2874:09:09, 10.51s/it, lr=1e-5, step_loss=0.205][RANK-0]: Step: [15198], local_loss=0.009508047252893448, train_loss=0.07107191532850266, time_cost=6.232437610626221
+
Steps: 2%|▏ | 15198/1000000 [5:37:39<2874:09:09, 10.51s/it, lr=1e-5, step_loss=0.00951]
Steps: 2%|▏ | 15199/1000000 [5:37:44<2435:10:25, 8.90s/it, lr=1e-5, step_loss=0.00951][RANK-0]: Step: [15199], local_loss=0.12934617698192596, train_loss=0.038295626640319824, time_cost=1.1975739002227783
+
Steps: 2%|▏ | 15199/1000000 [5:37:44<2435:10:25, 8.90s/it, lr=1e-5, step_loss=0.129]
Steps: 2%|▏ | 15200/1000000 [5:37:58<2867:58:26, 10.48s/it, lr=1e-5, step_loss=0.129][RANK-0]: Step: [15200], local_loss=0.023097429424524307, train_loss=0.06770718842744827, time_cost=4.4223175048828125
+
Steps: 2%|▏ | 15200/1000000 [5:37:58<2867:58:26, 10.48s/it, lr=1e-5, step_loss=0.0231]
Steps: 2%|▏ | 15201/1000000 [5:38:07<2741:25:08, 10.02s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [15201], local_loss=0.007887626998126507, train_loss=0.0794617235660553, time_cost=3.9585492610931396
+
Steps: 2%|▏ | 15201/1000000 [5:38:07<2741:25:08, 10.02s/it, lr=1e-5, step_loss=0.00789]
Steps: 2%|▏ | 15202/1000000 [5:38:15<2528:06:19, 9.24s/it, lr=1e-5, step_loss=0.00789][RANK-0]: Step: [15202], local_loss=0.14386716485023499, train_loss=0.182656928896904, time_cost=1.9262645244598389
+
Steps: 2%|▏ | 15202/1000000 [5:38:15<2528:06:19, 9.24s/it, lr=1e-5, step_loss=0.144]
Steps: 2%|▏ | 15203/1000000 [5:38:20<2250:22:31, 8.23s/it, lr=1e-5, step_loss=0.144][RANK-0]: Step: [15203], local_loss=0.06824550777673721, train_loss=0.07407598197460175, time_cost=4.232650518417358
+
Steps: 2%|▏ | 15203/1000000 [5:38:20<2250:22:31, 8.23s/it, lr=1e-5, step_loss=0.0682]
Steps: 2%|▏ | 15204/1000000 [5:38:30<2337:47:50, 8.55s/it, lr=1e-5, step_loss=0.0682][RANK-0]: Step: [15204], local_loss=0.06159352511167526, train_loss=0.07007521390914917, time_cost=2.1681904792785645
+
Steps: 2%|▏ | 15204/1000000 [5:38:30<2337:47:50, 8.55s/it, lr=1e-5, step_loss=0.0616]
Steps: 2%|▏ | 15205/1000000 [5:38:34<1984:11:09, 7.25s/it, lr=1e-5, step_loss=0.0616][RANK-0]: Step: [15205], local_loss=0.0070409090258181095, train_loss=0.06842397898435593, time_cost=1.397249698638916
+
Steps: 2%|▏ | 15205/1000000 [5:38:34<1984:11:09, 7.25s/it, lr=1e-5, step_loss=0.00704]
Steps: 2%|▏ | 15206/1000000 [5:38:39<1791:02:27, 6.55s/it, lr=1e-5, step_loss=0.00704][RANK-0]: Step: [15206], local_loss=0.17783884704113007, train_loss=0.04209878668189049, time_cost=2.221247911453247
+
Steps: 2%|▏ | 15206/1000000 [5:38:39<1791:02:27, 6.55s/it, lr=1e-5, step_loss=0.178]
Steps: 2%|▏ | 15207/1000000 [5:38:51<2292:37:39, 8.38s/it, lr=1e-5, step_loss=0.178][RANK-0]: Step: [15207], local_loss=0.058191150426864624, train_loss=0.05219346284866333, time_cost=6.01226806640625
+
Steps: 2%|▏ | 15207/1000000 [5:38:51<2292:37:39, 8.38s/it, lr=1e-5, step_loss=0.0582]
Steps: 2%|▏ | 15208/1000000 [5:39:03<2510:13:05, 9.18s/it, lr=1e-5, step_loss=0.0582][RANK-0]: Step: [15208], local_loss=0.014070832170546055, train_loss=0.04028625413775444, time_cost=1.536982774734497
+
Steps: 2%|▏ | 15208/1000000 [5:39:03<2510:13:05, 9.18s/it, lr=1e-5, step_loss=0.0141]
Steps: 2%|▏ | 15209/1000000 [5:39:07<2109:06:05, 7.71s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [15209], local_loss=0.015225458890199661, train_loss=0.14950627088546753, time_cost=1.2307276725769043
+
Steps: 2%|▏ | 15209/1000000 [5:39:07<2109:06:05, 7.71s/it, lr=1e-5, step_loss=0.0152]
Steps: 2%|▏ | 15210/1000000 [5:39:12<1869:59:12, 6.84s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [15210], local_loss=0.006702923681586981, train_loss=0.07176066935062408, time_cost=1.9992821216583252
+
Steps: 2%|▏ | 15210/1000000 [5:39:12<1869:59:12, 6.84s/it, lr=1e-5, step_loss=0.0067]
Steps: 2%|▏ | 15211/1000000 [5:39:16<1674:05:22, 6.12s/it, lr=1e-5, step_loss=0.0067][RANK-0]: Step: [15211], local_loss=0.03778911754488945, train_loss=0.017559539526700974, time_cost=1.5744214057922363
+
Steps: 2%|▏ | 15211/1000000 [5:39:16<1674:05:22, 6.12s/it, lr=1e-5, step_loss=0.0378]
Steps: 2%|▏ | 15212/1000000 [5:39:21<1577:24:20, 5.77s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [15212], local_loss=0.01605781726539135, train_loss=0.14867688715457916, time_cost=2.062649726867676
+
Steps: 2%|▏ | 15212/1000000 [5:39:21<1577:24:20, 5.77s/it, lr=1e-5, step_loss=0.0161]
Steps: 2%|▏ | 15213/1000000 [5:39:34<2202:50:33, 8.05s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [15213], local_loss=0.005109273362904787, train_loss=0.11619602143764496, time_cost=4.1560564041137695
+
Steps: 2%|▏ | 15213/1000000 [5:39:34<2202:50:33, 8.05s/it, lr=1e-5, step_loss=0.00511]
Steps: 2%|▏ | 15214/1000000 [5:39:51<2870:02:11, 10.49s/it, lr=1e-5, step_loss=0.00511][RANK-0]: Step: [15214], local_loss=0.007527779787778854, train_loss=0.03594150394201279, time_cost=7.250680208206177
+
Steps: 2%|▏ | 15214/1000000 [5:39:51<2870:02:11, 10.49s/it, lr=1e-5, step_loss=0.00753]
Steps: 2%|▏ | 15215/1000000 [5:40:02<2908:50:14, 10.63s/it, lr=1e-5, step_loss=0.00753][RANK-0]: Step: [15215], local_loss=0.2021263688802719, train_loss=0.04275989904999733, time_cost=3.2150747776031494
+
Steps: 2%|▏ | 15215/1000000 [5:40:02<2908:50:14, 10.63s/it, lr=1e-5, step_loss=0.202]
Steps: 2%|▏ | 15216/1000000 [5:40:14<3089:20:07, 11.29s/it, lr=1e-5, step_loss=0.202][RANK-0]: Step: [15216], local_loss=0.02015993744134903, train_loss=0.01332058198750019, time_cost=3.7774250507354736
+
Steps: 2%|▏ | 15216/1000000 [5:40:14<3089:20:07, 11.29s/it, lr=1e-5, step_loss=0.0202]
Steps: 2%|▏ | 15217/1000000 [5:40:26<3078:25:56, 11.25s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [15217], local_loss=0.005847585387527943, train_loss=0.03986024856567383, time_cost=4.085529565811157
+
Steps: 2%|▏ | 15217/1000000 [5:40:26<3078:25:56, 11.25s/it, lr=1e-5, step_loss=0.00585]
Steps: 2%|▏ | 15218/1000000 [5:40:31<2564:15:17, 9.37s/it, lr=1e-5, step_loss=0.00585][RANK-0]: Step: [15218], local_loss=0.01366999838501215, train_loss=0.024778248742222786, time_cost=2.2406561374664307
+
Steps: 2%|▏ | 15218/1000000 [5:40:31<2564:15:17, 9.37s/it, lr=1e-5, step_loss=0.0137]
Steps: 2%|▏ | 15219/1000000 [5:40:46<3028:37:48, 11.07s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [15219], local_loss=0.01572684943675995, train_loss=0.042419351637363434, time_cost=6.119369268417358
+
Steps: 2%|▏ | 15219/1000000 [5:40:46<3028:37:48, 11.07s/it, lr=1e-5, step_loss=0.0157]
Steps: 2%|▏ | 15220/1000000 [5:40:56<2984:00:59, 10.91s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [15220], local_loss=0.01203209813684225, train_loss=0.018611792474985123, time_cost=2.9872701168060303
+
Steps: 2%|▏ | 15220/1000000 [5:40:56<2984:00:59, 10.91s/it, lr=1e-5, step_loss=0.012]
Steps: 2%|▏ | 15221/1000000 [5:41:01<2499:13:36, 9.14s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [15221], local_loss=0.007350284140557051, train_loss=19.776559829711914, time_cost=2.301642656326294
+
Steps: 2%|▏ | 15221/1000000 [5:41:01<2499:13:36, 9.14s/it, lr=1e-5, step_loss=0.00735]
Steps: 2%|▏ | 15222/1000000 [5:41:07<2237:57:50, 8.18s/it, lr=1e-5, step_loss=0.00735][RANK-0]: Step: [15222], local_loss=0.01546271052211523, train_loss=0.07384216040372849, time_cost=1.572967767715454
+
Steps: 2%|▏ | 15222/1000000 [5:41:07<2237:57:50, 8.18s/it, lr=1e-5, step_loss=0.0155]
Steps: 2%|▏ | 15223/1000000 [5:41:17<2353:35:53, 8.60s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [15223], local_loss=0.020152052864432335, train_loss=0.023269234225153923, time_cost=3.325291156768799
+
Steps: 2%|▏ | 15223/1000000 [5:41:17<2353:35:53, 8.60s/it, lr=1e-5, step_loss=0.0202]
Steps: 2%|▏ | 15224/1000000 [5:41:26<2451:14:04, 8.96s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [15224], local_loss=0.027099337428808212, train_loss=0.025388743728399277, time_cost=2.4651975631713867
+
Steps: 2%|▏ | 15224/1000000 [5:41:26<2451:14:04, 8.96s/it, lr=1e-5, step_loss=0.0271]
Steps: 2%|▏ | 15225/1000000 [5:41:45<3230:28:17, 11.81s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [15225], local_loss=0.1294175535440445, train_loss=0.09525781869888306, time_cost=11.616614580154419
+
Steps: 2%|▏ | 15225/1000000 [5:41:45<3230:28:17, 11.81s/it, lr=1e-5, step_loss=0.129]
Steps: 2%|▏ | 15226/1000000 [5:42:00<3506:18:54, 12.82s/it, lr=1e-5, step_loss=0.129][RANK-0]: Step: [15226], local_loss=0.12157920002937317, train_loss=0.03183118253946304, time_cost=6.875264644622803
+
Steps: 2%|▏ | 15226/1000000 [5:42:00<3506:18:54, 12.82s/it, lr=1e-5, step_loss=0.122]
Steps: 2%|▏ | 15227/1000000 [5:42:14<3603:10:24, 13.17s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [15227], local_loss=0.01477152481675148, train_loss=0.017500881105661392, time_cost=5.7784247398376465
+
Steps: 2%|▏ | 15227/1000000 [5:42:14<3603:10:24, 13.17s/it, lr=1e-5, step_loss=0.0148]
Steps: 2%|▏ | 15228/1000000 [5:42:20<3007:35:13, 10.99s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [15228], local_loss=0.05651038885116577, train_loss=0.07303743064403534, time_cost=2.0019443035125732
+
Steps: 2%|▏ | 15228/1000000 [5:42:20<3007:35:13, 10.99s/it, lr=1e-5, step_loss=0.0565]
Steps: 2%|▏ | 15229/1000000 [5:42:26<2602:31:27, 9.51s/it, lr=1e-5, step_loss=0.0565][RANK-0]: Step: [15229], local_loss=0.12639716267585754, train_loss=0.08748462051153183, time_cost=1.981229543685913
+
Steps: 2%|▏ | 15229/1000000 [5:42:26<2602:31:27, 9.51s/it, lr=1e-5, step_loss=0.126]
Steps: 2%|▏ | 15230/1000000 [5:42:33<2424:54:28, 8.86s/it, lr=1e-5, step_loss=0.126][RANK-0]: Step: [15230], local_loss=0.044681526720523834, train_loss=0.0486118346452713, time_cost=3.1248385906219482
+
Steps: 2%|▏ | 15230/1000000 [5:42:33<2424:54:28, 8.86s/it, lr=1e-5, step_loss=0.0447]
Steps: 2%|▏ | 15231/1000000 [5:42:38<2068:16:37, 7.56s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [15231], local_loss=0.018408412113785744, train_loss=0.16749908030033112, time_cost=1.5343503952026367
+
Steps: 2%|▏ | 15231/1000000 [5:42:38<2068:16:37, 7.56s/it, lr=1e-5, step_loss=0.0184]
Steps: 2%|▏ | 15232/1000000 [5:42:53<2694:52:10, 9.85s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [15232], local_loss=0.05279424041509628, train_loss=0.17847350239753723, time_cost=6.407045602798462
+
Steps: 2%|▏ | 15232/1000000 [5:42:53<2694:52:10, 9.85s/it, lr=1e-5, step_loss=0.0528]
Steps: 2%|▏ | 15233/1000000 [5:43:01<2510:37:42, 9.18s/it, lr=1e-5, step_loss=0.0528][RANK-0]: Step: [15233], local_loss=0.005949496757239103, train_loss=0.056183360517024994, time_cost=1.7194147109985352
+
Steps: 2%|▏ | 15233/1000000 [5:43:01<2510:37:42, 9.18s/it, lr=1e-5, step_loss=0.00595]
Steps: 2%|▏ | 15234/1000000 [5:43:15<2925:36:15, 10.70s/it, lr=1e-5, step_loss=0.00595][RANK-0]: Step: [15234], local_loss=0.10481604933738708, train_loss=0.03445139527320862, time_cost=10.083882331848145
+
Steps: 2%|▏ | 15234/1000000 [5:43:15<2925:36:15, 10.70s/it, lr=1e-5, step_loss=0.105]
Steps: 2%|▏ | 15235/1000000 [5:43:28<3125:57:47, 11.43s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [15235], local_loss=0.022756166756153107, train_loss=0.016140496358275414, time_cost=9.308237075805664
+
Steps: 2%|▏ | 15235/1000000 [5:43:28<3125:57:47, 11.43s/it, lr=1e-5, step_loss=0.0228]
Steps: 2%|▏ | 15236/1000000 [5:43:35<2745:29:07, 10.04s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [15236], local_loss=0.057957619428634644, train_loss=0.016317201778292656, time_cost=1.3098316192626953
+
Steps: 2%|▏ | 15236/1000000 [5:43:35<2745:29:07, 10.04s/it, lr=1e-5, step_loss=0.058]
Steps: 2%|▏ | 15237/1000000 [5:43:40<2385:40:06, 8.72s/it, lr=1e-5, step_loss=0.058][RANK-0]: Step: [15237], local_loss=0.01144244521856308, train_loss=0.0743449330329895, time_cost=1.9263026714324951
+
Steps: 2%|▏ | 15237/1000000 [5:43:40<2385:40:06, 8.72s/it, lr=1e-5, step_loss=0.0114]
Steps: 2%|▏ | 15238/1000000 [5:43:53<2716:59:05, 9.93s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [15238], local_loss=0.01650075800716877, train_loss=0.03149612620472908, time_cost=6.202775716781616
+
Steps: 2%|▏ | 15238/1000000 [5:43:53<2716:59:05, 9.93s/it, lr=1e-5, step_loss=0.0165]
Steps: 2%|▏ | 15239/1000000 [5:44:04<2824:01:14, 10.32s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [15239], local_loss=0.07426878809928894, train_loss=0.029097173362970352, time_cost=1.6746923923492432
+
Steps: 2%|▏ | 15239/1000000 [5:44:04<2824:01:14, 10.32s/it, lr=1e-5, step_loss=0.0743]
Steps: 2%|▏ | 15240/1000000 [5:44:09<2385:37:47, 8.72s/it, lr=1e-5, step_loss=0.0743][RANK-0]: Step: [15240], local_loss=0.005953907035291195, train_loss=0.06126444414258003, time_cost=1.8573863506317139
+
Steps: 2%|▏ | 15240/1000000 [5:44:09<2385:37:47, 8.72s/it, lr=1e-5, step_loss=0.00595]
Steps: 2%|▏ | 15241/1000000 [5:44:20<2547:05:26, 9.31s/it, lr=1e-5, step_loss=0.00595][RANK-0]: Step: [15241], local_loss=0.00906499195843935, train_loss=0.06770963966846466, time_cost=8.65982961654663
+
Steps: 2%|▏ | 15241/1000000 [5:44:20<2547:05:26, 9.31s/it, lr=1e-5, step_loss=0.00906]
Steps: 2%|▏ | 15242/1000000 [5:44:32<2772:43:45, 10.14s/it, lr=1e-5, step_loss=0.00906][RANK-0]: Step: [15242], local_loss=0.06894955784082413, train_loss=0.11562816798686981, time_cost=2.7545368671417236
+
Steps: 2%|▏ | 15242/1000000 [5:44:32<2772:43:45, 10.14s/it, lr=1e-5, step_loss=0.0689]
Steps: 2%|▏ | 15243/1000000 [5:44:49<3307:22:40, 12.09s/it, lr=1e-5, step_loss=0.0689][RANK-0]: Step: [15243], local_loss=0.00467841187492013, train_loss=0.11650915443897247, time_cost=13.138053894042969
+
Steps: 2%|▏ | 15243/1000000 [5:44:49<3307:22:40, 12.09s/it, lr=1e-5, step_loss=0.00468]
Steps: 2%|▏ | 15244/1000000 [5:45:02<3364:36:22, 12.30s/it, lr=1e-5, step_loss=0.00468][RANK-0]: Step: [15244], local_loss=0.011546612717211246, train_loss=0.07856851816177368, time_cost=1.2429587841033936
+
Steps: 2%|▏ | 15244/1000000 [5:45:02<3364:36:22, 12.30s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 15245/1000000 [5:45:16<3546:35:13, 12.97s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [15245], local_loss=0.010694194585084915, train_loss=0.031592950224876404, time_cost=1.2561516761779785
+
Steps: 2%|▏ | 15245/1000000 [5:45:16<3546:35:13, 12.97s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 15246/1000000 [5:45:29<3512:42:44, 12.84s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [15246], local_loss=0.06649832427501678, train_loss=0.05939285084605217, time_cost=4.987425327301025
+
Steps: 2%|▏ | 15246/1000000 [5:45:29<3512:42:44, 12.84s/it, lr=1e-5, step_loss=0.0665]
Steps: 2%|▏ | 15247/1000000 [5:45:34<2895:43:05, 10.59s/it, lr=1e-5, step_loss=0.0665][RANK-0]: Step: [15247], local_loss=0.038376640528440475, train_loss=1.3777083158493042, time_cost=3.174675464630127
+
Steps: 2%|▏ | 15247/1000000 [5:45:34<2895:43:05, 10.59s/it, lr=1e-5, step_loss=0.0384]
Steps: 2%|▏ | 15248/1000000 [5:45:45<2960:51:00, 10.82s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [15248], local_loss=0.09223302453756332, train_loss=0.07511521875858307, time_cost=2.1523244380950928
+
Steps: 2%|▏ | 15248/1000000 [5:45:45<2960:51:00, 10.82s/it, lr=1e-5, step_loss=0.0922]
Steps: 2%|▏ | 15249/1000000 [5:45:58<3127:21:00, 11.43s/it, lr=1e-5, step_loss=0.0922][RANK-0]: Step: [15249], local_loss=0.013258686289191246, train_loss=0.014208251610398293, time_cost=3.690105676651001
+
Steps: 2%|▏ | 15249/1000000 [5:45:58<3127:21:00, 11.43s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 15250/1000000 [5:46:12<3285:34:30, 12.01s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [15250], local_loss=0.008020970039069653, train_loss=0.012710459530353546, time_cost=3.4611620903015137
+
Steps: 2%|▏ | 15250/1000000 [5:46:12<3285:34:30, 12.01s/it, lr=1e-5, step_loss=0.00802]
Steps: 2%|▏ | 15251/1000000 [5:46:18<2793:02:19, 10.21s/it, lr=1e-5, step_loss=0.00802][RANK-0]: Step: [15251], local_loss=0.059971101582050323, train_loss=0.016827702522277832, time_cost=4.566795110702515
+
Steps: 2%|▏ | 15251/1000000 [5:46:18<2793:02:19, 10.21s/it, lr=1e-5, step_loss=0.06]
Steps: 2%|▏ | 15252/1000000 [5:46:33<3218:32:07, 11.77s/it, lr=1e-5, step_loss=0.06][RANK-0]: Step: [15252], local_loss=0.034339141100645065, train_loss=0.016727034002542496, time_cost=12.038364887237549
+
Steps: 2%|▏ | 15252/1000000 [5:46:33<3218:32:07, 11.77s/it, lr=1e-5, step_loss=0.0343]
Steps: 2%|▏ | 15253/1000000 [5:46:46<3330:34:10, 12.18s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [15253], local_loss=0.01321214996278286, train_loss=0.030708663165569305, time_cost=9.828892230987549
+
Steps: 2%|▏ | 15253/1000000 [5:46:46<3330:34:10, 12.18s/it, lr=1e-5, step_loss=0.0132]
Steps: 2%|▏ | 15254/1000000 [5:46:57<3217:39:38, 11.76s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [15254], local_loss=0.010463452897965908, train_loss=0.025326959788799286, time_cost=1.6400723457336426
+
Steps: 2%|▏ | 15254/1000000 [5:46:57<3217:39:38, 11.76s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 15255/1000000 [5:47:03<2732:48:35, 9.99s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [15255], local_loss=0.02717454917728901, train_loss=0.07004307210445404, time_cost=1.6945605278015137
+
Steps: 2%|▏ | 15255/1000000 [5:47:03<2732:48:35, 9.99s/it, lr=1e-5, step_loss=0.0272]
Steps: 2%|▏ | 15256/1000000 [5:47:10<2517:10:50, 9.20s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [15256], local_loss=0.07131586968898773, train_loss=0.0679238811135292, time_cost=1.4989261627197266
+
Steps: 2%|▏ | 15256/1000000 [5:47:10<2517:10:50, 9.20s/it, lr=1e-5, step_loss=0.0713]
Steps: 2%|▏ | 15257/1000000 [5:47:24<2857:46:54, 10.45s/it, lr=1e-5, step_loss=0.0713][RANK-0]: Step: [15257], local_loss=0.02195580303668976, train_loss=0.1188274696469307, time_cost=5.872691869735718
+
Steps: 2%|▏ | 15257/1000000 [5:47:24<2857:46:54, 10.45s/it, lr=1e-5, step_loss=0.022]
Steps: 2%|▏ | 15258/1000000 [5:47:30<2518:35:10, 9.21s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [15258], local_loss=0.00827851239591837, train_loss=0.0261665191501379, time_cost=2.1396381855010986
+
Steps: 2%|▏ | 15258/1000000 [5:47:30<2518:35:10, 9.21s/it, lr=1e-5, step_loss=0.00828]
Steps: 2%|▏ | 15259/1000000 [5:47:43<2809:31:00, 10.27s/it, lr=1e-5, step_loss=0.00828][RANK-0]: Step: [15259], local_loss=0.01549747958779335, train_loss=0.023082610219717026, time_cost=7.201918125152588
+
Steps: 2%|▏ | 15259/1000000 [5:47:43<2809:31:00, 10.27s/it, lr=1e-5, step_loss=0.0155]
Steps: 2%|▏ | 15260/1000000 [5:47:47<2350:44:58, 8.59s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [15260], local_loss=0.008099707774817944, train_loss=0.17553915083408356, time_cost=2.351553440093994
+
Steps: 2%|▏ | 15260/1000000 [5:47:47<2350:44:58, 8.59s/it, lr=1e-5, step_loss=0.0081]
Steps: 2%|▏ | 15261/1000000 [5:48:02<2860:28:58, 10.46s/it, lr=1e-5, step_loss=0.0081][RANK-0]: Step: [15261], local_loss=0.014649753458797932, train_loss=0.01344786211848259, time_cost=3.091904878616333
+
Steps: 2%|▏ | 15261/1000000 [5:48:02<2860:28:58, 10.46s/it, lr=1e-5, step_loss=0.0146]
Steps: 2%|▏ | 15262/1000000 [5:48:13<2881:11:58, 10.53s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [15262], local_loss=0.009214555844664574, train_loss=0.024689795449376106, time_cost=5.3506622314453125
+
Steps: 2%|▏ | 15262/1000000 [5:48:13<2881:11:58, 10.53s/it, lr=1e-5, step_loss=0.00921]
Steps: 2%|▏ | 15263/1000000 [5:48:25<3012:51:19, 11.01s/it, lr=1e-5, step_loss=0.00921][RANK-0]: Step: [15263], local_loss=0.013151979073882103, train_loss=0.019166581332683563, time_cost=1.191321611404419
+
Steps: 2%|▏ | 15263/1000000 [5:48:25<3012:51:19, 11.01s/it, lr=1e-5, step_loss=0.0132]
Steps: 2%|▏ | 15264/1000000 [5:48:32<2652:35:43, 9.70s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [15264], local_loss=0.011464335955679417, train_loss=0.03488190472126007, time_cost=2.451080799102783
+
Steps: 2%|▏ | 15264/1000000 [5:48:32<2652:35:43, 9.70s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 15265/1000000 [5:48:38<2352:48:59, 8.60s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [15265], local_loss=0.0815545991063118, train_loss=0.03062470443546772, time_cost=1.5005254745483398
+
Steps: 2%|▏ | 15265/1000000 [5:48:38<2352:48:59, 8.60s/it, lr=1e-5, step_loss=0.0816]
Steps: 2%|▏ | 15266/1000000 [5:48:43<2124:22:00, 7.77s/it, lr=1e-5, step_loss=0.0816][RANK-0]: Step: [15266], local_loss=0.01974170096218586, train_loss=0.02677132561802864, time_cost=3.1034696102142334
+
Steps: 2%|▏ | 15266/1000000 [5:48:43<2124:22:00, 7.77s/it, lr=1e-5, step_loss=0.0197]
Steps: 2%|▏ | 15267/1000000 [5:48:49<1927:06:58, 7.05s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [15267], local_loss=0.07903293520212173, train_loss=0.17049893736839294, time_cost=2.988617420196533
+
Steps: 2%|▏ | 15267/1000000 [5:48:49<1927:06:58, 7.05s/it, lr=1e-5, step_loss=0.079]
Steps: 2%|▏ | 15268/1000000 [5:48:57<2028:44:22, 7.42s/it, lr=1e-5, step_loss=0.079][RANK-0]: Step: [15268], local_loss=0.051280006766319275, train_loss=0.08559753000736237, time_cost=1.5084826946258545
+
Steps: 2%|▏ | 15268/1000000 [5:48:57<2028:44:22, 7.42s/it, lr=1e-5, step_loss=0.0513]
Steps: 2%|▏ | 15269/1000000 [5:49:07<2219:04:02, 8.11s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [15269], local_loss=0.01965266279876232, train_loss=0.03811858966946602, time_cost=1.556236982345581
+
Steps: 2%|▏ | 15269/1000000 [5:49:07<2219:04:02, 8.11s/it, lr=1e-5, step_loss=0.0197]
Steps: 2%|▏ | 15270/1000000 [5:49:13<2082:16:06, 7.61s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [15270], local_loss=0.0076354132033884525, train_loss=0.05702075734734535, time_cost=2.713766098022461
+
Steps: 2%|▏ | 15270/1000000 [5:49:13<2082:16:06, 7.61s/it, lr=1e-5, step_loss=0.00764]
Steps: 2%|▏ | 15271/1000000 [5:49:24<2310:04:29, 8.45s/it, lr=1e-5, step_loss=0.00764][RANK-0]: Step: [15271], local_loss=0.018066566437482834, train_loss=0.034855883568525314, time_cost=2.045468330383301
+
Steps: 2%|▏ | 15271/1000000 [5:49:24<2310:04:29, 8.45s/it, lr=1e-5, step_loss=0.0181]
Steps: 2%|▏ | 15272/1000000 [5:49:29<2042:07:04, 7.47s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [15272], local_loss=0.11676866561174393, train_loss=0.032570548355579376, time_cost=2.6256091594696045
+
Steps: 2%|▏ | 15272/1000000 [5:49:29<2042:07:04, 7.47s/it, lr=1e-5, step_loss=0.117]
Steps: 2%|▏ | 15273/1000000 [5:49:37<2102:56:07, 7.69s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [15273], local_loss=0.00509064132347703, train_loss=0.03006152994930744, time_cost=2.141860008239746
+
Steps: 2%|▏ | 15273/1000000 [5:49:37<2102:56:07, 7.69s/it, lr=1e-5, step_loss=0.00509]
Steps: 2%|▏ | 15274/1000000 [5:49:46<2229:11:02, 8.15s/it, lr=1e-5, step_loss=0.00509][RANK-0]: Step: [15274], local_loss=0.08358165621757507, train_loss=0.04412420094013214, time_cost=1.9254674911499023
+
Steps: 2%|▏ | 15274/1000000 [5:49:46<2229:11:02, 8.15s/it, lr=1e-5, step_loss=0.0836]
Steps: 2%|▏ | 15275/1000000 [5:49:51<1916:38:42, 7.01s/it, lr=1e-5, step_loss=0.0836][RANK-0]: Step: [15275], local_loss=0.11679213494062424, train_loss=0.05710044130682945, time_cost=1.4215023517608643
+
Steps: 2%|▏ | 15275/1000000 [5:49:51<1916:38:42, 7.01s/it, lr=1e-5, step_loss=0.117]
Steps: 2%|▏ | 15276/1000000 [5:49:57<1832:47:46, 6.70s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [15276], local_loss=0.05687938630580902, train_loss=0.2685467004776001, time_cost=1.2616755962371826
+
Steps: 2%|▏ | 15276/1000000 [5:49:57<1832:47:46, 6.70s/it, lr=1e-5, step_loss=0.0569]
Steps: 2%|▏ | 15277/1000000 [5:50:08<2258:20:06, 8.26s/it, lr=1e-5, step_loss=0.0569][RANK-0]: Step: [15277], local_loss=0.055110760033130646, train_loss=0.04448753222823143, time_cost=3.8280017375946045
+
Steps: 2%|▏ | 15277/1000000 [5:50:08<2258:20:06, 8.26s/it, lr=1e-5, step_loss=0.0551]
Steps: 2%|▏ | 15278/1000000 [5:50:22<2698:37:26, 9.87s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [15278], local_loss=0.03737388923764229, train_loss=0.02500176802277565, time_cost=6.5483832359313965
+
Steps: 2%|▏ | 15278/1000000 [5:50:22<2698:37:26, 9.87s/it, lr=1e-5, step_loss=0.0374]
Steps: 2%|▏ | 15279/1000000 [5:50:26<2243:11:11, 8.20s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [15279], local_loss=0.034127309918403625, train_loss=0.04286620765924454, time_cost=1.4932303428649902
+
Steps: 2%|▏ | 15279/1000000 [5:50:26<2243:11:11, 8.20s/it, lr=1e-5, step_loss=0.0341]
Steps: 2%|▏ | 15280/1000000 [5:50:31<1985:54:12, 7.26s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [15280], local_loss=0.03363844007253647, train_loss=0.04086345434188843, time_cost=3.8557441234588623
+
Steps: 2%|▏ | 15280/1000000 [5:50:31<1985:54:12, 7.26s/it, lr=1e-5, step_loss=0.0336]
Steps: 2%|▏ | 15281/1000000 [5:50:42<2271:11:00, 8.30s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [15281], local_loss=0.07169462740421295, train_loss=0.1587938815355301, time_cost=3.4972004890441895
+
Steps: 2%|▏ | 15281/1000000 [5:50:42<2271:11:00, 8.30s/it, lr=1e-5, step_loss=0.0717]
Steps: 2%|▏ | 15282/1000000 [5:50:47<1962:47:40, 7.18s/it, lr=1e-5, step_loss=0.0717][RANK-0]: Step: [15282], local_loss=0.08109157532453537, train_loss=0.07918614149093628, time_cost=3.5709877014160156
+
Steps: 2%|▏ | 15282/1000000 [5:50:47<1962:47:40, 7.18s/it, lr=1e-5, step_loss=0.0811]
Steps: 2%|▏ | 15283/1000000 [5:50:58<2277:17:29, 8.33s/it, lr=1e-5, step_loss=0.0811][RANK-0]: Step: [15283], local_loss=0.009303824976086617, train_loss=0.08086326718330383, time_cost=5.9352381229400635
+
Steps: 2%|▏ | 15283/1000000 [5:50:58<2277:17:29, 8.33s/it, lr=1e-5, step_loss=0.0093]
Steps: 2%|▏ | 15284/1000000 [5:51:03<2011:21:59, 7.35s/it, lr=1e-5, step_loss=0.0093][RANK-0]: Step: [15284], local_loss=0.02283334918320179, train_loss=0.022283945232629776, time_cost=2.1989200115203857
+
Steps: 2%|▏ | 15284/1000000 [5:51:03<2011:21:59, 7.35s/it, lr=1e-5, step_loss=0.0228]
Steps: 2%|▏ | 15285/1000000 [5:51:12<2191:17:31, 8.01s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [15285], local_loss=0.06355015933513641, train_loss=0.029000241309404373, time_cost=1.7926442623138428
+
Steps: 2%|▏ | 15285/1000000 [5:51:12<2191:17:31, 8.01s/it, lr=1e-5, step_loss=0.0636]
Steps: 2%|▏ | 15286/1000000 [5:51:21<2240:56:44, 8.19s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [15286], local_loss=0.06440769135951996, train_loss=0.03141311556100845, time_cost=2.5977487564086914
+
Steps: 2%|▏ | 15286/1000000 [5:51:21<2240:56:44, 8.19s/it, lr=1e-5, step_loss=0.0644]
Steps: 2%|▏ | 15287/1000000 [5:51:29<2205:38:38, 8.06s/it, lr=1e-5, step_loss=0.0644][RANK-0]: Step: [15287], local_loss=0.06438306719064713, train_loss=0.029122933745384216, time_cost=2.1215503215789795
+
Steps: 2%|▏ | 15287/1000000 [5:51:29<2205:38:38, 8.06s/it, lr=1e-5, step_loss=0.0644]
Steps: 2%|▏ | 15288/1000000 [5:51:45<2875:10:47, 10.51s/it, lr=1e-5, step_loss=0.0644][RANK-0]: Step: [15288], local_loss=0.02017032355070114, train_loss=0.04480239376425743, time_cost=7.883184909820557
+
Steps: 2%|▏ | 15288/1000000 [5:51:45<2875:10:47, 10.51s/it, lr=1e-5, step_loss=0.0202]
Steps: 2%|▏ | 15289/1000000 [5:52:00<3282:16:42, 12.00s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [15289], local_loss=0.007854829542338848, train_loss=0.0471104197204113, time_cost=7.1412131786346436
+
Steps: 2%|▏ | 15289/1000000 [5:52:00<3282:16:42, 12.00s/it, lr=1e-5, step_loss=0.00785]
Steps: 2%|▏ | 15290/1000000 [5:52:08<2916:19:41, 10.66s/it, lr=1e-5, step_loss=0.00785][RANK-0]: Step: [15290], local_loss=0.004460892174392939, train_loss=0.1011827290058136, time_cost=2.0511298179626465
+
Steps: 2%|▏ | 15290/1000000 [5:52:08<2916:19:41, 10.66s/it, lr=1e-5, step_loss=0.00446]
Steps: 2%|▏ | 15291/1000000 [5:52:22<3228:03:47, 11.80s/it, lr=1e-5, step_loss=0.00446][RANK-0]: Step: [15291], local_loss=0.02249283157289028, train_loss=0.050847768783569336, time_cost=2.4055333137512207
+
Steps: 2%|▏ | 15291/1000000 [5:52:22<3228:03:47, 11.80s/it, lr=1e-5, step_loss=0.0225]
Steps: 2%|▏ | 15292/1000000 [5:52:29<2831:58:28, 10.35s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [15292], local_loss=0.007056397385895252, train_loss=0.024315085262060165, time_cost=2.2517192363739014
+
Steps: 2%|▏ | 15292/1000000 [5:52:29<2831:58:28, 10.35s/it, lr=1e-5, step_loss=0.00706]
Steps: 2%|▏ | 15293/1000000 [5:52:40<2855:58:58, 10.44s/it, lr=1e-5, step_loss=0.00706][RANK-0]: Step: [15293], local_loss=0.05130894109606743, train_loss=0.07480023801326752, time_cost=3.1086266040802
+
Steps: 2%|▏ | 15293/1000000 [5:52:40<2855:58:58, 10.44s/it, lr=1e-5, step_loss=0.0513]
Steps: 2%|▏ | 15294/1000000 [5:52:50<2783:53:09, 10.18s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [15294], local_loss=0.006808177102357149, train_loss=0.02777996100485325, time_cost=3.46628999710083
+
Steps: 2%|▏ | 15294/1000000 [5:52:50<2783:53:09, 10.18s/it, lr=1e-5, step_loss=0.00681]
Steps: 2%|▏ | 15295/1000000 [5:53:00<2834:29:30, 10.36s/it, lr=1e-5, step_loss=0.00681][RANK-0]: Step: [15295], local_loss=0.013071935623884201, train_loss=0.03950648009777069, time_cost=2.347947597503662
+
Steps: 2%|▏ | 15295/1000000 [5:53:00<2834:29:30, 10.36s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 15296/1000000 [5:53:11<2880:31:02, 10.53s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [15296], local_loss=0.015466425567865372, train_loss=0.010798681527376175, time_cost=3.616881847381592
+
Steps: 2%|▏ | 15296/1000000 [5:53:11<2880:31:02, 10.53s/it, lr=1e-5, step_loss=0.0155]
Steps: 2%|▏ | 15297/1000000 [5:53:22<2909:20:47, 10.64s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [15297], local_loss=0.0679171085357666, train_loss=0.03456979617476463, time_cost=2.0849545001983643
+
Steps: 2%|▏ | 15297/1000000 [5:53:22<2909:20:47, 10.64s/it, lr=1e-5, step_loss=0.0679]
Steps: 2%|▏ | 15298/1000000 [5:53:28<2490:15:08, 9.10s/it, lr=1e-5, step_loss=0.0679][RANK-0]: Step: [15298], local_loss=0.006998933851718903, train_loss=0.1445566564798355, time_cost=1.3097915649414062
+
Steps: 2%|▏ | 15298/1000000 [5:53:28<2490:15:08, 9.10s/it, lr=1e-5, step_loss=0.007]
Steps: 2%|▏ | 15299/1000000 [5:53:36<2415:29:12, 8.83s/it, lr=1e-5, step_loss=0.007][RANK-0]: Step: [15299], local_loss=0.024910226464271545, train_loss=0.043273262679576874, time_cost=1.7948648929595947
+
Steps: 2%|▏ | 15299/1000000 [5:53:36<2415:29:12, 8.83s/it, lr=1e-5, step_loss=0.0249]
Steps: 2%|▏ | 15300/1000000 [5:53:47<2590:26:34, 9.47s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [15300], local_loss=0.027875572443008423, train_loss=0.02272452786564827, time_cost=3.1198272705078125
+
Steps: 2%|▏ | 15300/1000000 [5:53:47<2590:26:34, 9.47s/it, lr=1e-5, step_loss=0.0279]
Steps: 2%|▏ | 15301/1000000 [5:53:53<2328:11:26, 8.51s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [15301], local_loss=0.04183923453092575, train_loss=0.032830625772476196, time_cost=5.305126190185547
+
Steps: 2%|▏ | 15301/1000000 [5:53:53<2328:11:26, 8.51s/it, lr=1e-5, step_loss=0.0418]
Steps: 2%|▏ | 15302/1000000 [5:54:01<2273:21:12, 8.31s/it, lr=1e-5, step_loss=0.0418][RANK-0]: Step: [15302], local_loss=0.014374983496963978, train_loss=0.06333004683256149, time_cost=3.830860137939453
+
Steps: 2%|▏ | 15302/1000000 [5:54:01<2273:21:12, 8.31s/it, lr=1e-5, step_loss=0.0144]
Steps: 2%|▏ | 15303/1000000 [5:54:14<2675:22:30, 9.78s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [15303], local_loss=0.03494882583618164, train_loss=0.044781576842069626, time_cost=1.2746524810791016
+
Steps: 2%|▏ | 15303/1000000 [5:54:14<2675:22:30, 9.78s/it, lr=1e-5, step_loss=0.0349]
Steps: 2%|▏ | 15304/1000000 [5:54:21<2438:16:45, 8.91s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [15304], local_loss=0.010258381254971027, train_loss=0.07487604767084122, time_cost=1.2122905254364014
+
Steps: 2%|▏ | 15304/1000000 [5:54:21<2438:16:45, 8.91s/it, lr=1e-5, step_loss=0.0103]
Steps: 2%|▏ | 15305/1000000 [5:54:26<2082:25:39, 7.61s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [15305], local_loss=0.022896237671375275, train_loss=0.02322724461555481, time_cost=1.6911070346832275
+
Steps: 2%|▏ | 15305/1000000 [5:54:26<2082:25:39, 7.61s/it, lr=1e-5, step_loss=0.0229]
Steps: 2%|▏ | 15306/1000000 [5:54:33<2040:34:04, 7.46s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [15306], local_loss=0.016572726890444756, train_loss=0.032581448554992676, time_cost=2.9643454551696777
+
Steps: 2%|▏ | 15306/1000000 [5:54:33<2040:34:04, 7.46s/it, lr=1e-5, step_loss=0.0166]
Steps: 2%|▏ | 15307/1000000 [5:54:40<2015:25:08, 7.37s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [15307], local_loss=0.028631504625082016, train_loss=0.08092457801103592, time_cost=2.807715892791748
+
Steps: 2%|▏ | 15307/1000000 [5:54:40<2015:25:08, 7.37s/it, lr=1e-5, step_loss=0.0286]
Steps: 2%|▏ | 15308/1000000 [5:54:47<1985:57:23, 7.26s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [15308], local_loss=0.004865780007094145, train_loss=0.08226785063743591, time_cost=5.118542194366455
+
Steps: 2%|▏ | 15308/1000000 [5:54:47<1985:57:23, 7.26s/it, lr=1e-5, step_loss=0.00487]
Steps: 2%|▏ | 15309/1000000 [5:54:58<2288:03:03, 8.37s/it, lr=1e-5, step_loss=0.00487][RANK-0]: Step: [15309], local_loss=0.019838932901620865, train_loss=0.03388840705156326, time_cost=2.32498836517334
+
Steps: 2%|▏ | 15309/1000000 [5:54:58<2288:03:03, 8.37s/it, lr=1e-5, step_loss=0.0198]
Steps: 2%|▏ | 15310/1000000 [5:55:11<2701:30:37, 9.88s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [15310], local_loss=0.007497042417526245, train_loss=0.03645123541355133, time_cost=3.500624895095825
+
Steps: 2%|▏ | 15310/1000000 [5:55:11<2701:30:37, 9.88s/it, lr=1e-5, step_loss=0.0075]
Steps: 2%|▏ | 15311/1000000 [5:55:22<2778:07:17, 10.16s/it, lr=1e-5, step_loss=0.0075][RANK-0]: Step: [15311], local_loss=0.009162215515971184, train_loss=0.04312540218234062, time_cost=3.5191619396209717
+
Steps: 2%|▏ | 15311/1000000 [5:55:22<2778:07:17, 10.16s/it, lr=1e-5, step_loss=0.00916]
Steps: 2%|▏ | 15312/1000000 [5:55:27<2354:01:52, 8.61s/it, lr=1e-5, step_loss=0.00916][RANK-0]: Step: [15312], local_loss=0.006171697285026312, train_loss=0.032483115792274475, time_cost=1.9405462741851807
+
Steps: 2%|▏ | 15312/1000000 [5:55:27<2354:01:52, 8.61s/it, lr=1e-5, step_loss=0.00617]
Steps: 2%|▏ | 15313/1000000 [5:55:33<2116:17:39, 7.74s/it, lr=1e-5, step_loss=0.00617][RANK-0]: Step: [15313], local_loss=0.06980247795581818, train_loss=0.027521474286913872, time_cost=3.0527212619781494
+
Steps: 2%|▏ | 15313/1000000 [5:55:33<2116:17:39, 7.74s/it, lr=1e-5, step_loss=0.0698]
Steps: 2%|▏ | 15314/1000000 [5:55:40<2079:05:50, 7.60s/it, lr=1e-5, step_loss=0.0698][RANK-0]: Step: [15314], local_loss=0.05327258259057999, train_loss=0.06273241341114044, time_cost=2.672556161880493
+
Steps: 2%|▏ | 15314/1000000 [5:55:40<2079:05:50, 7.60s/it, lr=1e-5, step_loss=0.0533]
Steps: 2%|▏ | 15315/1000000 [5:55:50<2269:38:51, 8.30s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [15315], local_loss=0.043576519936323166, train_loss=0.031066522002220154, time_cost=7.563086748123169
+
Steps: 2%|▏ | 15315/1000000 [5:55:50<2269:38:51, 8.30s/it, lr=1e-5, step_loss=0.0436]
Steps: 2%|▏ | 15316/1000000 [5:56:05<2839:43:58, 10.38s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [15316], local_loss=0.02802613191306591, train_loss=0.027050698176026344, time_cost=6.290494680404663
+
Steps: 2%|▏ | 15316/1000000 [5:56:05<2839:43:58, 10.38s/it, lr=1e-5, step_loss=0.028]
Steps: 2%|▏ | 15317/1000000 [5:56:10<2408:50:02, 8.81s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [15317], local_loss=0.17624862492084503, train_loss=0.03699300438165665, time_cost=2.0215578079223633
+
Steps: 2%|▏ | 15317/1000000 [5:56:10<2408:50:02, 8.81s/it, lr=1e-5, step_loss=0.176]
Steps: 2%|▏ | 15318/1000000 [5:56:22<2600:31:45, 9.51s/it, lr=1e-5, step_loss=0.176][RANK-0]: Step: [15318], local_loss=0.0077639296650886536, train_loss=0.02087133750319481, time_cost=3.122394561767578
+
Steps: 2%|▏ | 15318/1000000 [5:56:22<2600:31:45, 9.51s/it, lr=1e-5, step_loss=0.00776]
Steps: 2%|▏ | 15319/1000000 [5:56:29<2466:27:21, 9.02s/it, lr=1e-5, step_loss=0.00776][RANK-0]: Step: [15319], local_loss=0.006706406362354755, train_loss=0.03267788514494896, time_cost=4.107684850692749
+
Steps: 2%|▏ | 15319/1000000 [5:56:29<2466:27:21, 9.02s/it, lr=1e-5, step_loss=0.00671]
Steps: 2%|▏ | 15320/1000000 [5:56:43<2860:23:55, 10.46s/it, lr=1e-5, step_loss=0.00671][RANK-0]: Step: [15320], local_loss=0.0065233176574110985, train_loss=0.025553414598107338, time_cost=3.576780080795288
+
Steps: 2%|▏ | 15320/1000000 [5:56:43<2860:23:55, 10.46s/it, lr=1e-5, step_loss=0.00652]
Steps: 2%|▏ | 15321/1000000 [5:56:55<2930:17:56, 10.71s/it, lr=1e-5, step_loss=0.00652][RANK-0]: Step: [15321], local_loss=0.20547644793987274, train_loss=0.04854102432727814, time_cost=2.149524688720703
+
Steps: 2%|▏ | 15321/1000000 [5:56:55<2930:17:56, 10.71s/it, lr=1e-5, step_loss=0.205]
Steps: 2%|▏ | 15322/1000000 [5:57:03<2702:19:54, 9.88s/it, lr=1e-5, step_loss=0.205][RANK-0]: Step: [15322], local_loss=0.020331542938947678, train_loss=0.03284945338964462, time_cost=1.9798107147216797
+
Steps: 2%|▏ | 15322/1000000 [5:57:03<2702:19:54, 9.88s/it, lr=1e-5, step_loss=0.0203]
Steps: 2%|▏ | 15323/1000000 [5:57:10<2507:14:15, 9.17s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [15323], local_loss=0.06275781989097595, train_loss=0.02149946615099907, time_cost=1.4032089710235596
+
Steps: 2%|▏ | 15323/1000000 [5:57:10<2507:14:15, 9.17s/it, lr=1e-5, step_loss=0.0628]
Steps: 2%|▏ | 15324/1000000 [5:57:20<2555:44:45, 9.34s/it, lr=1e-5, step_loss=0.0628][RANK-0]: Step: [15324], local_loss=0.037505730986595154, train_loss=0.05241885408759117, time_cost=2.058678388595581
+
Steps: 2%|▏ | 15324/1000000 [5:57:20<2555:44:45, 9.34s/it, lr=1e-5, step_loss=0.0375]
Steps: 2%|▏ | 15325/1000000 [5:57:25<2208:13:10, 8.07s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [15325], local_loss=0.10661431401968002, train_loss=0.07080511748790741, time_cost=2.1644041538238525
+
Steps: 2%|▏ | 15325/1000000 [5:57:25<2208:13:10, 8.07s/it, lr=1e-5, step_loss=0.107]
Steps: 2%|▏ | 15326/1000000 [5:57:34<2323:56:06, 8.50s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [15326], local_loss=0.007566102780401707, train_loss=0.01996595598757267, time_cost=7.187986135482788
+
Steps: 2%|▏ | 15326/1000000 [5:57:34<2323:56:06, 8.50s/it, lr=1e-5, step_loss=0.00757]
Steps: 2%|▏ | 15327/1000000 [5:57:45<2505:35:41, 9.16s/it, lr=1e-5, step_loss=0.00757][RANK-0]: Step: [15327], local_loss=0.02268272638320923, train_loss=0.028634535148739815, time_cost=3.2768473625183105
+
Steps: 2%|▏ | 15327/1000000 [5:57:45<2505:35:41, 9.16s/it, lr=1e-5, step_loss=0.0227]
Steps: 2%|▏ | 15328/1000000 [5:57:55<2565:08:53, 9.38s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [15328], local_loss=0.022514797747135162, train_loss=0.050334177911281586, time_cost=2.747821092605591
+
Steps: 2%|▏ | 15328/1000000 [5:57:55<2565:08:53, 9.38s/it, lr=1e-5, step_loss=0.0225]
Steps: 2%|▏ | 15329/1000000 [5:57:59<2167:06:03, 7.92s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [15329], local_loss=0.007900478318333626, train_loss=0.02924269251525402, time_cost=1.670705795288086
+
Steps: 2%|▏ | 15329/1000000 [5:58:00<2167:06:03, 7.92s/it, lr=1e-5, step_loss=0.0079]
Steps: 2%|▏ | 15330/1000000 [5:58:11<2428:33:04, 8.88s/it, lr=1e-5, step_loss=0.0079][RANK-0]: Step: [15330], local_loss=0.03592239320278168, train_loss=0.0745973289012909, time_cost=1.8068995475769043
+
Steps: 2%|▏ | 15330/1000000 [5:58:11<2428:33:04, 8.88s/it, lr=1e-5, step_loss=0.0359]
Steps: 2%|▏ | 15331/1000000 [5:58:17<2223:08:37, 8.13s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [15331], local_loss=0.008925136178731918, train_loss=0.02362661063671112, time_cost=1.637796401977539
+
Steps: 2%|▏ | 15331/1000000 [5:58:17<2223:08:37, 8.13s/it, lr=1e-5, step_loss=0.00893]
Steps: 2%|▏ | 15332/1000000 [5:58:30<2611:59:13, 9.55s/it, lr=1e-5, step_loss=0.00893][RANK-0]: Step: [15332], local_loss=0.0066665904596447945, train_loss=0.03430383279919624, time_cost=1.2187395095825195
+
Steps: 2%|▏ | 15332/1000000 [5:58:30<2611:59:13, 9.55s/it, lr=1e-5, step_loss=0.00667]
Steps: 2%|▏ | 15333/1000000 [5:58:35<2254:45:50, 8.24s/it, lr=1e-5, step_loss=0.00667][RANK-0]: Step: [15333], local_loss=0.09720294177532196, train_loss=0.02753867395222187, time_cost=2.402263641357422
+
Steps: 2%|▏ | 15333/1000000 [5:58:35<2254:45:50, 8.24s/it, lr=1e-5, step_loss=0.0972]
Steps: 2%|▏ | 15334/1000000 [5:58:46<2482:09:41, 9.07s/it, lr=1e-5, step_loss=0.0972][RANK-0]: Step: [15334], local_loss=0.016757993027567863, train_loss=0.06692814826965332, time_cost=2.4935340881347656
+
Steps: 2%|▏ | 15334/1000000 [5:58:46<2482:09:41, 9.07s/it, lr=1e-5, step_loss=0.0168]
Steps: 2%|▏ | 15335/1000000 [5:58:51<2144:17:03, 7.84s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [15335], local_loss=0.0134225869551301, train_loss=0.03741401806473732, time_cost=2.2502057552337646
+
Steps: 2%|▏ | 15335/1000000 [5:58:51<2144:17:03, 7.84s/it, lr=1e-5, step_loss=0.0134]
Steps: 2%|▏ | 15336/1000000 [5:58:56<1911:18:59, 6.99s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [15336], local_loss=1.007583737373352, train_loss=0.1586928814649582, time_cost=2.1176598072052
+
Steps: 2%|▏ | 15336/1000000 [5:58:56<1911:18:59, 6.99s/it, lr=1e-5, step_loss=1.01]
Steps: 2%|▏ | 15337/1000000 [5:59:01<1772:51:54, 6.48s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [15337], local_loss=0.007759863045066595, train_loss=0.03523707389831543, time_cost=2.5753328800201416
+
Steps: 2%|▏ | 15337/1000000 [5:59:01<1772:51:54, 6.48s/it, lr=1e-5, step_loss=0.00776]
Steps: 2%|▏ | 15338/1000000 [5:59:08<1824:51:26, 6.67s/it, lr=1e-5, step_loss=0.00776][RANK-0]: Step: [15338], local_loss=0.10165981948375702, train_loss=0.055190976709127426, time_cost=1.5700387954711914
+
Steps: 2%|▏ | 15338/1000000 [5:59:08<1824:51:26, 6.67s/it, lr=1e-5, step_loss=0.102]
Steps: 2%|▏ | 15339/1000000 [5:59:12<1603:11:14, 5.86s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [15339], local_loss=0.007225803565233946, train_loss=0.14799028635025024, time_cost=1.2531273365020752
+
Steps: 2%|▏ | 15339/1000000 [5:59:12<1603:11:14, 5.86s/it, lr=1e-5, step_loss=0.00723]
Steps: 2%|▏ | 15340/1000000 [5:59:18<1559:45:31, 5.70s/it, lr=1e-5, step_loss=0.00723][RANK-0]: Step: [15340], local_loss=0.015506362542510033, train_loss=0.036880653351545334, time_cost=2.2565813064575195
+
Steps: 2%|▏ | 15340/1000000 [5:59:18<1559:45:31, 5.70s/it, lr=1e-5, step_loss=0.0155]
[Training-log excerpt condensed for readability. The original span repeated each step three times (a tqdm progress line, a stray `+` diff marker, and the same progress line refreshed), covering steps 15341–15561 of 1,000,000 (2%), wall time 5:59:23 → 6:31:28, lr=1e-5 throughout. One representative record, kept verbatim:

Steps: 2%|▏ | 15341/1000000 [5:59:23<1509:57:34, 5.52s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [15341], local_loss=0.016303427517414093, train_loss=0.0383000373840332, time_cost=2.397414445877075

Each step logs a RANK-0 record of local_loss, train_loss, and time_cost. Over this window, step_loss stays mostly in the 0.003–0.47 range, with isolated outliers near 1.0 (steps 15448, 15464, 15466, 15523); train_loss occasionally spikes far above its typical 0.01–0.2 band (e.g. 46.68 at step 15448, 44.08 at step 15538, 17.34 at step 15447, 17.02 at step 15421, 11.45 at step 15488). Per-step throughput varies between roughly 5.5 and 12.5 s/it, and time_cost ranges from about 1.2 to 12.2 s.]
+
Steps: 2%|▏ | 15561/1000000 [6:31:28<2176:23:31, 7.96s/it, lr=1e-5, step_loss=0.198]
Steps: 2%|▏ | 15562/1000000 [6:31:40<2431:54:00, 8.89s/it, lr=1e-5, step_loss=0.198][RANK-0]: Step: [15562], local_loss=0.006673290394246578, train_loss=0.06353043019771576, time_cost=3.7003724575042725
+
Steps: 2%|▏ | 15562/1000000 [6:31:40<2431:54:00, 8.89s/it, lr=1e-5, step_loss=0.00667]
Steps: 2%|▏ | 15563/1000000 [6:31:50<2600:19:25, 9.51s/it, lr=1e-5, step_loss=0.00667][RANK-0]: Step: [15563], local_loss=0.006343237590044737, train_loss=0.1206965446472168, time_cost=4.578745365142822
+
Steps: 2%|▏ | 15563/1000000 [6:31:50<2600:19:25, 9.51s/it, lr=1e-5, step_loss=0.00634]
Steps: 2%|▏ | 15564/1000000 [6:32:04<2924:38:30, 10.70s/it, lr=1e-5, step_loss=0.00634][RANK-0]: Step: [15564], local_loss=0.04787604138255119, train_loss=0.06690176576375961, time_cost=1.3334267139434814
+
Steps: 2%|▏ | 15564/1000000 [6:32:04<2924:38:30, 10.70s/it, lr=1e-5, step_loss=0.0479]
Steps: 2%|▏ | 15565/1000000 [6:32:10<2547:49:36, 9.32s/it, lr=1e-5, step_loss=0.0479][RANK-0]: Step: [15565], local_loss=0.010800402611494064, train_loss=0.03718475252389908, time_cost=2.501281261444092
+
Steps: 2%|▏ | 15565/1000000 [6:32:10<2547:49:36, 9.32s/it, lr=1e-5, step_loss=0.0108]
Steps: 2%|▏ | 15566/1000000 [6:32:18<2399:42:56, 8.78s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [15566], local_loss=0.013113096356391907, train_loss=0.042285315692424774, time_cost=2.994142770767212
+
Steps: 2%|▏ | 15566/1000000 [6:32:18<2399:42:56, 8.78s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 15567/1000000 [6:32:33<2926:48:44, 10.70s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [15567], local_loss=0.07536033540964127, train_loss=0.17866122722625732, time_cost=5.408966541290283
+
Steps: 2%|▏ | 15567/1000000 [6:32:33<2926:48:44, 10.70s/it, lr=1e-5, step_loss=0.0754]
Steps: 2%|▏ | 15568/1000000 [6:32:41<2685:58:48, 9.82s/it, lr=1e-5, step_loss=0.0754][RANK-0]: Step: [15568], local_loss=0.01268046349287033, train_loss=0.14358928799629211, time_cost=1.75813627243042
+
Steps: 2%|▏ | 15568/1000000 [6:32:41<2685:58:48, 9.82s/it, lr=1e-5, step_loss=0.0127]
Steps: 2%|▏ | 15569/1000000 [6:32:50<2625:38:12, 9.60s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [15569], local_loss=0.012281329371035099, train_loss=0.042702481150627136, time_cost=2.633920907974243
+
Steps: 2%|▏ | 15569/1000000 [6:32:50<2625:38:12, 9.60s/it, lr=1e-5, step_loss=0.0123]
Steps: 2%|▏ | 15570/1000000 [6:32:56<2323:54:45, 8.50s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [15570], local_loss=0.1916908621788025, train_loss=0.05405258387327194, time_cost=3.09161114692688
+
Steps: 2%|▏ | 15570/1000000 [6:32:56<2323:54:45, 8.50s/it, lr=1e-5, step_loss=0.192]
Steps: 2%|▏ | 15571/1000000 [6:33:03<2208:16:44, 8.08s/it, lr=1e-5, step_loss=0.192][RANK-0]: Step: [15571], local_loss=0.022606918588280678, train_loss=0.03401722386479378, time_cost=2.654572010040283
+
Steps: 2%|▏ | 15571/1000000 [6:33:03<2208:16:44, 8.08s/it, lr=1e-5, step_loss=0.0226]
Steps: 2%|▏ | 15572/1000000 [6:33:18<2778:01:19, 10.16s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [15572], local_loss=0.05559545382857323, train_loss=0.2226513922214508, time_cost=11.615381717681885
+
Steps: 2%|▏ | 15572/1000000 [6:33:18<2778:01:19, 10.16s/it, lr=1e-5, step_loss=0.0556]
Steps: 2%|▏ | 15573/1000000 [6:33:26<2636:53:13, 9.64s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [15573], local_loss=0.016054796054959297, train_loss=0.022661369293928146, time_cost=4.425785779953003
+
Steps: 2%|▏ | 15573/1000000 [6:33:26<2636:53:13, 9.64s/it, lr=1e-5, step_loss=0.0161]
Steps: 2%|▏ | 15574/1000000 [6:33:30<2176:26:40, 7.96s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [15574], local_loss=0.049927420914173126, train_loss=0.05243149399757385, time_cost=1.2772550582885742
+
Steps: 2%|▏ | 15574/1000000 [6:33:30<2176:26:40, 7.96s/it, lr=1e-5, step_loss=0.0499]
Steps: 2%|▏ | 15575/1000000 [6:33:39<2235:58:10, 8.18s/it, lr=1e-5, step_loss=0.0499][RANK-0]: Step: [15575], local_loss=0.149429053068161, train_loss=0.05734359845519066, time_cost=2.5420544147491455
+
Steps: 2%|▏ | 15575/1000000 [6:33:39<2235:58:10, 8.18s/it, lr=1e-5, step_loss=0.149]
Steps: 2%|▏ | 15576/1000000 [6:33:53<2764:04:50, 10.11s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [15576], local_loss=0.012900306843221188, train_loss=0.017599325627088547, time_cost=1.2132213115692139
+
Steps: 2%|▏ | 15576/1000000 [6:33:53<2764:04:50, 10.11s/it, lr=1e-5, step_loss=0.0129]
Steps: 2%|▏ | 15577/1000000 [6:34:00<2478:16:43, 9.06s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [15577], local_loss=0.011746319942176342, train_loss=0.03314206004142761, time_cost=1.21431303024292
+
Steps: 2%|▏ | 15577/1000000 [6:34:00<2478:16:43, 9.06s/it, lr=1e-5, step_loss=0.0117]
Steps: 2%|▏ | 15578/1000000 [6:34:09<2502:08:11, 9.15s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [15578], local_loss=0.026706332340836525, train_loss=0.04863418638706207, time_cost=5.78933310508728
+
Steps: 2%|▏ | 15578/1000000 [6:34:09<2502:08:11, 9.15s/it, lr=1e-5, step_loss=0.0267]
Steps: 2%|▏ | 15579/1000000 [6:34:21<2696:47:00, 9.86s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [15579], local_loss=0.0503714494407177, train_loss=0.0348631851375103, time_cost=3.317157030105591
+
Steps: 2%|▏ | 15579/1000000 [6:34:21<2696:47:00, 9.86s/it, lr=1e-5, step_loss=0.0504]
Steps: 2%|▏ | 15580/1000000 [6:34:25<2245:10:57, 8.21s/it, lr=1e-5, step_loss=0.0504][RANK-0]: Step: [15580], local_loss=0.3712567985057831, train_loss=0.07858584821224213, time_cost=1.7320306301116943
+
Steps: 2%|▏ | 15580/1000000 [6:34:25<2245:10:57, 8.21s/it, lr=1e-5, step_loss=0.371]
Steps: 2%|▏ | 15581/1000000 [6:34:38<2637:36:01, 9.65s/it, lr=1e-5, step_loss=0.371][RANK-0]: Step: [15581], local_loss=0.006090318318456411, train_loss=0.018079636618494987, time_cost=5.955573081970215
+
Steps: 2%|▏ | 15581/1000000 [6:34:38<2637:36:01, 9.65s/it, lr=1e-5, step_loss=0.00609]
Steps: 2%|▏ | 15582/1000000 [6:34:44<2277:45:47, 8.33s/it, lr=1e-5, step_loss=0.00609][RANK-0]: Step: [15582], local_loss=0.043347008526325226, train_loss=0.026008455082774162, time_cost=2.8124232292175293
+
Steps: 2%|▏ | 15582/1000000 [6:34:44<2277:45:47, 8.33s/it, lr=1e-5, step_loss=0.0433]
Steps: 2%|▏ | 15583/1000000 [6:34:55<2503:57:38, 9.16s/it, lr=1e-5, step_loss=0.0433][RANK-0]: Step: [15583], local_loss=0.0074723404832184315, train_loss=0.040040042251348495, time_cost=3.8093655109405518
+
Steps: 2%|▏ | 15583/1000000 [6:34:55<2503:57:38, 9.16s/it, lr=1e-5, step_loss=0.00747]
Steps: 2%|▏ | 15584/1000000 [6:35:07<2759:47:08, 10.09s/it, lr=1e-5, step_loss=0.00747][RANK-0]: Step: [15584], local_loss=0.3844812214374542, train_loss=0.06715899705886841, time_cost=1.2423732280731201
+
Steps: 2%|▏ | 15584/1000000 [6:35:07<2759:47:08, 10.09s/it, lr=1e-5, step_loss=0.384]
Steps: 2%|▏ | 15585/1000000 [6:35:11<2283:19:58, 8.35s/it, lr=1e-5, step_loss=0.384][RANK-0]: Step: [15585], local_loss=0.013897492550313473, train_loss=0.023951534181833267, time_cost=2.475778102874756
+
Steps: 2%|▏ | 15585/1000000 [6:35:11<2283:19:58, 8.35s/it, lr=1e-5, step_loss=0.0139]
Steps: 2%|▏ | 15586/1000000 [6:35:16<2007:43:36, 7.34s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [15586], local_loss=0.026483118534088135, train_loss=0.024334076792001724, time_cost=2.311784267425537
+
Steps: 2%|▏ | 15586/1000000 [6:35:16<2007:43:36, 7.34s/it, lr=1e-5, step_loss=0.0265]
Steps: 2%|▏ | 15587/1000000 [6:35:27<2291:34:03, 8.38s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [15587], local_loss=0.018080271780490875, train_loss=0.026960015296936035, time_cost=4.209447383880615
+
Steps: 2%|▏ | 15587/1000000 [6:35:27<2291:34:03, 8.38s/it, lr=1e-5, step_loss=0.0181]
Steps: 2%|▏ | 15588/1000000 [6:35:34<2216:04:15, 8.10s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [15588], local_loss=0.008617171086370945, train_loss=0.0173588115721941, time_cost=5.226836204528809
+
Steps: 2%|▏ | 15588/1000000 [6:35:34<2216:04:15, 8.10s/it, lr=1e-5, step_loss=0.00862]
Steps: 2%|▏ | 15589/1000000 [6:35:41<2125:47:48, 7.77s/it, lr=1e-5, step_loss=0.00862][RANK-0]: Step: [15589], local_loss=0.008895151317119598, train_loss=0.011015417985618114, time_cost=2.4730255603790283
+
Steps: 2%|▏ | 15589/1000000 [6:35:41<2125:47:48, 7.77s/it, lr=1e-5, step_loss=0.0089]
Steps: 2%|▏ | 15590/1000000 [6:35:49<2072:52:19, 7.58s/it, lr=1e-5, step_loss=0.0089][RANK-0]: Step: [15590], local_loss=0.05033746361732483, train_loss=0.03402918577194214, time_cost=2.888990640640259
+
Steps: 2%|▏ | 15590/1000000 [6:35:49<2072:52:19, 7.58s/it, lr=1e-5, step_loss=0.0503]
Steps: 2%|▏ | 15591/1000000 [6:35:57<2116:29:40, 7.74s/it, lr=1e-5, step_loss=0.0503][RANK-0]: Step: [15591], local_loss=0.02543313428759575, train_loss=0.0537930466234684, time_cost=2.4388153553009033
+
Steps: 2%|▏ | 15591/1000000 [6:35:57<2116:29:40, 7.74s/it, lr=1e-5, step_loss=0.0254]
Steps: 2%|▏ | 15592/1000000 [6:36:08<2433:22:07, 8.90s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [15592], local_loss=0.005518237128853798, train_loss=1.090060830116272, time_cost=5.454664945602417
+
Steps: 2%|▏ | 15592/1000000 [6:36:08<2433:22:07, 8.90s/it, lr=1e-5, step_loss=0.00552]
Steps: 2%|▏ | 15593/1000000 [6:36:17<2401:44:59, 8.78s/it, lr=1e-5, step_loss=0.00552][RANK-0]: Step: [15593], local_loss=0.005295829847455025, train_loss=0.16278886795043945, time_cost=3.2239441871643066
+
Steps: 2%|▏ | 15593/1000000 [6:36:17<2401:44:59, 8.78s/it, lr=1e-5, step_loss=0.0053]
Steps: 2%|▏ | 15594/1000000 [6:36:31<2828:11:37, 10.34s/it, lr=1e-5, step_loss=0.0053][RANK-0]: Step: [15594], local_loss=0.007142487447708845, train_loss=0.38645321130752563, time_cost=6.207723617553711
+
Steps: 2%|▏ | 15594/1000000 [6:36:31<2828:11:37, 10.34s/it, lr=1e-5, step_loss=0.00714]
Steps: 2%|▏ | 15595/1000000 [6:36:45<3108:10:39, 11.37s/it, lr=1e-5, step_loss=0.00714][RANK-0]: Step: [15595], local_loss=0.005621781572699547, train_loss=0.1397213339805603, time_cost=4.736748456954956
+
Steps: 2%|▏ | 15595/1000000 [6:36:45<3108:10:39, 11.37s/it, lr=1e-5, step_loss=0.00562]
Steps: 2%|▏ | 15596/1000000 [6:36:55<3060:38:12, 11.19s/it, lr=1e-5, step_loss=0.00562][RANK-0]: Step: [15596], local_loss=0.013542700558900833, train_loss=0.08756927400827408, time_cost=3.81042742729187
+
Steps: 2%|▏ | 15596/1000000 [6:36:55<3060:38:12, 11.19s/it, lr=1e-5, step_loss=0.0135]
Steps: 2%|▏ | 15597/1000000 [6:37:04<2836:28:36, 10.37s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [15597], local_loss=0.007058187387883663, train_loss=0.09724284708499908, time_cost=3.301635980606079
+
Steps: 2%|▏ | 15597/1000000 [6:37:04<2836:28:36, 10.37s/it, lr=1e-5, step_loss=0.00706]
Steps: 2%|▏ | 15598/1000000 [6:37:09<2414:16:54, 8.83s/it, lr=1e-5, step_loss=0.00706][RANK-0]: Step: [15598], local_loss=0.03264898434281349, train_loss=0.026556510478258133, time_cost=3.9708094596862793
+
Steps: 2%|▏ | 15598/1000000 [6:37:09<2414:16:54, 8.83s/it, lr=1e-5, step_loss=0.0326]
Steps: 2%|▏ | 15599/1000000 [6:37:18<2419:29:43, 8.85s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [15599], local_loss=0.03587358072400093, train_loss=0.07484135031700134, time_cost=2.945590019226074
+
Steps: 2%|▏ | 15599/1000000 [6:37:18<2419:29:43, 8.85s/it, lr=1e-5, step_loss=0.0359]
Steps: 2%|▏ | 15600/1000000 [6:37:29<2582:35:11, 9.44s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [15600], local_loss=0.012744734063744545, train_loss=0.19200938940048218, time_cost=2.135711431503296
+
Steps: 2%|▏ | 15600/1000000 [6:37:29<2582:35:11, 9.44s/it, lr=1e-5, step_loss=0.0127]
Steps: 2%|▏ | 15601/1000000 [6:37:33<2168:14:44, 7.93s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [15601], local_loss=0.039301253855228424, train_loss=0.02455110102891922, time_cost=3.165303945541382
+
Steps: 2%|▏ | 15601/1000000 [6:37:33<2168:14:44, 7.93s/it, lr=1e-5, step_loss=0.0393]
Steps: 2%|▏ | 15602/1000000 [6:37:48<2738:18:35, 10.01s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [15602], local_loss=0.010992239229381084, train_loss=0.03173946216702461, time_cost=5.639688730239868
+
Steps: 2%|▏ | 15602/1000000 [6:37:48<2738:18:35, 10.01s/it, lr=1e-5, step_loss=0.011]
Steps: 2%|▏ | 15603/1000000 [6:37:56<2533:07:06, 9.26s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [15603], local_loss=0.02064868062734604, train_loss=0.07780296355485916, time_cost=3.45090913772583
+
Steps: 2%|▏ | 15603/1000000 [6:37:56<2533:07:06, 9.26s/it, lr=1e-5, step_loss=0.0206]
Steps: 2%|▏ | 15604/1000000 [6:38:08<2757:19:56, 10.08s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [15604], local_loss=0.05113076791167259, train_loss=0.11063607782125473, time_cost=1.294283151626587
+
Steps: 2%|▏ | 15604/1000000 [6:38:08<2757:19:56, 10.08s/it, lr=1e-5, step_loss=0.0511]
Steps: 2%|▏ | 15605/1000000 [6:38:22<3080:44:46, 11.27s/it, lr=1e-5, step_loss=0.0511][RANK-0]: Step: [15605], local_loss=0.008525251410901546, train_loss=0.07597243785858154, time_cost=4.464200496673584
+
Steps: 2%|▏ | 15605/1000000 [6:38:22<3080:44:46, 11.27s/it, lr=1e-5, step_loss=0.00853]
Steps: 2%|▏ | 15606/1000000 [6:38:29<2769:05:03, 10.13s/it, lr=1e-5, step_loss=0.00853][RANK-0]: Step: [15606], local_loss=0.14463676512241364, train_loss=0.2750568985939026, time_cost=1.7013812065124512
+
Steps: 2%|▏ | 15606/1000000 [6:38:29<2769:05:03, 10.13s/it, lr=1e-5, step_loss=0.145]
Steps: 2%|▏ | 15607/1000000 [6:38:43<3060:51:29, 11.19s/it, lr=1e-5, step_loss=0.145][RANK-0]: Step: [15607], local_loss=0.004201638977974653, train_loss=8.871679306030273, time_cost=1.9214184284210205
+
Steps: 2%|▏ | 15607/1000000 [6:38:43<3060:51:29, 11.19s/it, lr=1e-5, step_loss=0.0042]
Steps: 2%|▏ | 15608/1000000 [6:38:49<2687:11:17, 9.83s/it, lr=1e-5, step_loss=0.0042][RANK-0]: Step: [15608], local_loss=0.006328396499156952, train_loss=0.02136976830661297, time_cost=1.4622957706451416
+
Steps: 2%|▏ | 15608/1000000 [6:38:49<2687:11:17, 9.83s/it, lr=1e-5, step_loss=0.00633]
Steps: 2%|▏ | 15609/1000000 [6:38:55<2352:16:30, 8.60s/it, lr=1e-5, step_loss=0.00633][RANK-0]: Step: [15609], local_loss=0.0042337835766375065, train_loss=0.11116281896829605, time_cost=2.9886441230773926
+
Steps: 2%|▏ | 15609/1000000 [6:38:55<2352:16:30, 8.60s/it, lr=1e-5, step_loss=0.00423]
Steps: 2%|▏ | 15610/1000000 [6:39:05<2493:16:37, 9.12s/it, lr=1e-5, step_loss=0.00423][RANK-0]: Step: [15610], local_loss=0.046790916472673416, train_loss=17.047578811645508, time_cost=2.3905487060546875
+
Steps: 2%|▏ | 15610/1000000 [6:39:05<2493:16:37, 9.12s/it, lr=1e-5, step_loss=0.0468]
Steps: 2%|▏ | 15611/1000000 [6:39:14<2414:16:59, 8.83s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [15611], local_loss=0.007991538383066654, train_loss=28.26620101928711, time_cost=3.7519643306732178
+
Steps: 2%|▏ | 15611/1000000 [6:39:14<2414:16:59, 8.83s/it, lr=1e-5, step_loss=0.00799]
Steps: 2%|▏ | 15612/1000000 [6:39:26<2730:39:06, 9.99s/it, lr=1e-5, step_loss=0.00799][RANK-0]: Step: [15612], local_loss=0.06366720050573349, train_loss=0.05664185434579849, time_cost=4.919127702713013
+
Steps: 2%|▏ | 15612/1000000 [6:39:26<2730:39:06, 9.99s/it, lr=1e-5, step_loss=0.0637]
Steps: 2%|▏ | 15613/1000000 [6:39:32<2344:28:26, 8.57s/it, lr=1e-5, step_loss=0.0637][RANK-0]: Step: [15613], local_loss=0.016027506440877914, train_loss=0.02307170256972313, time_cost=2.6094136238098145
+
Steps: 2%|▏ | 15613/1000000 [6:39:32<2344:28:26, 8.57s/it, lr=1e-5, step_loss=0.016]
Steps: 2%|▏ | 15614/1000000 [6:39:37<2075:39:00, 7.59s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [15614], local_loss=0.004649835173040628, train_loss=0.04769787937402725, time_cost=2.886439800262451
+
Steps: 2%|▏ | 15614/1000000 [6:39:37<2075:39:00, 7.59s/it, lr=1e-5, step_loss=0.00465]
Steps: 2%|▏ | 15615/1000000 [6:39:50<2571:10:47, 9.40s/it, lr=1e-5, step_loss=0.00465][RANK-0]: Step: [15615], local_loss=0.003752528689801693, train_loss=0.012232628650963306, time_cost=5.304409503936768
+
Steps: 2%|▏ | 15615/1000000 [6:39:50<2571:10:47, 9.40s/it, lr=1e-5, step_loss=0.00375]
Steps: 2%|▏ | 15616/1000000 [6:39:55<2209:42:17, 8.08s/it, lr=1e-5, step_loss=0.00375][RANK-0]: Step: [15616], local_loss=0.010436407290399075, train_loss=0.04751342162489891, time_cost=3.728815793991089
+
Steps: 2%|▏ | 15616/1000000 [6:39:55<2209:42:17, 8.08s/it, lr=1e-5, step_loss=0.0104]
Steps: 2%|▏ | 15617/1000000 [6:40:06<2429:46:23, 8.89s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [15617], local_loss=0.012801036238670349, train_loss=0.047039248049259186, time_cost=2.505244493484497
+
Steps: 2%|▏ | 15617/1000000 [6:40:06<2429:46:23, 8.89s/it, lr=1e-5, step_loss=0.0128]
Steps: 2%|▏ | 15618/1000000 [6:40:17<2563:34:31, 9.38s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [15618], local_loss=0.04565777629613876, train_loss=0.021426718682050705, time_cost=9.176487922668457
+
Steps: 2%|▏ | 15618/1000000 [6:40:17<2563:34:31, 9.38s/it, lr=1e-5, step_loss=0.0457]
Steps: 2%|▏ | 15619/1000000 [6:40:22<2263:12:46, 8.28s/it, lr=1e-5, step_loss=0.0457][RANK-0]: Step: [15619], local_loss=0.053146157413721085, train_loss=0.039102062582969666, time_cost=1.7702860832214355
+
Steps: 2%|▏ | 15619/1000000 [6:40:22<2263:12:46, 8.28s/it, lr=1e-5, step_loss=0.0531]
Steps: 2%|▏ | 15620/1000000 [6:40:30<2177:49:01, 7.96s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [15620], local_loss=0.0601961649954319, train_loss=0.038689881563186646, time_cost=2.6213414669036865
+
Steps: 2%|▏ | 15620/1000000 [6:40:30<2177:49:01, 7.96s/it, lr=1e-5, step_loss=0.0602]
Steps: 2%|▏ | 15621/1000000 [6:40:38<2242:17:32, 8.20s/it, lr=1e-5, step_loss=0.0602][RANK-0]: Step: [15621], local_loss=0.006757800001651049, train_loss=0.11310890316963196, time_cost=1.3062562942504883
+
Steps: 2%|▏ | 15621/1000000 [6:40:38<2242:17:32, 8.20s/it, lr=1e-5, step_loss=0.00676]
Steps: 2%|▏ | 15622/1000000 [6:40:51<2596:37:18, 9.50s/it, lr=1e-5, step_loss=0.00676][RANK-0]: Step: [15622], local_loss=0.2994726002216339, train_loss=0.07614775002002716, time_cost=4.078927278518677
+
Steps: 2%|▏ | 15622/1000000 [6:40:51<2596:37:18, 9.50s/it, lr=1e-5, step_loss=0.299]
Steps: 2%|▏ | 15623/1000000 [6:41:05<2985:24:51, 10.92s/it, lr=1e-5, step_loss=0.299][RANK-0]: Step: [15623], local_loss=0.0050330692902207375, train_loss=0.011291499249637127, time_cost=7.4817280769348145
+
Steps: 2%|▏ | 15623/1000000 [6:41:05<2985:24:51, 10.92s/it, lr=1e-5, step_loss=0.00503]
Steps: 2%|▏ | 15624/1000000 [6:41:18<3149:06:59, 11.52s/it, lr=1e-5, step_loss=0.00503][RANK-0]: Step: [15624], local_loss=0.005977982189506292, train_loss=0.018085112795233727, time_cost=3.696650981903076
+
Steps: 2%|▏ | 15624/1000000 [6:41:18<3149:06:59, 11.52s/it, lr=1e-5, step_loss=0.00598]
Steps: 2%|▏ | 15625/1000000 [6:41:31<3272:07:58, 11.97s/it, lr=1e-5, step_loss=0.00598][RANK-0]: Step: [15625], local_loss=0.014326440170407295, train_loss=0.019450737163424492, time_cost=3.680990695953369
+
Steps: 2%|▏ | 15625/1000000 [6:41:31<3272:07:58, 11.97s/it, lr=1e-5, step_loss=0.0143]
Steps: 2%|▏ | 15626/1000000 [6:41:35<2650:09:53, 9.69s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [15626], local_loss=0.07287479937076569, train_loss=0.04113946855068207, time_cost=1.581094741821289
+
Steps: 2%|▏ | 15626/1000000 [6:41:35<2650:09:53, 9.69s/it, lr=1e-5, step_loss=0.0729]
Steps: 2%|▏ | 15627/1000000 [6:41:46<2715:39:04, 9.93s/it, lr=1e-5, step_loss=0.0729][RANK-0]: Step: [15627], local_loss=0.033727340400218964, train_loss=0.02533002756536007, time_cost=2.7586843967437744
+
Steps: 2%|▏ | 15627/1000000 [6:41:46<2715:39:04, 9.93s/it, lr=1e-5, step_loss=0.0337]
Steps: 2%|▏ | 15628/1000000 [6:42:01<3162:41:50, 11.57s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [15628], local_loss=0.007161623332649469, train_loss=0.0402010902762413, time_cost=9.591418027877808
+
Steps: 2%|▏ | 15628/1000000 [6:42:01<3162:41:50, 11.57s/it, lr=1e-5, step_loss=0.00716]
Steps: 2%|▏ | 15629/1000000 [6:42:20<3708:49:43, 13.56s/it, lr=1e-5, step_loss=0.00716][RANK-0]: Step: [15629], local_loss=0.03967589884996414, train_loss=0.014140350744128227, time_cost=9.130221128463745
+
Steps: 2%|▏ | 15629/1000000 [6:42:20<3708:49:43, 13.56s/it, lr=1e-5, step_loss=0.0397]
Steps: 2%|▏ | 15630/1000000 [6:42:30<3485:34:45, 12.75s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [15630], local_loss=0.015824628993868828, train_loss=0.0225767083466053, time_cost=2.085530996322632
+
Steps: 2%|▏ | 15630/1000000 [6:42:30<3485:34:45, 12.75s/it, lr=1e-5, step_loss=0.0158]
Steps: 2%|▏ | 15631/1000000 [6:42:36<2858:50:51, 10.46s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [15631], local_loss=0.009300276637077332, train_loss=0.014972344040870667, time_cost=1.2059857845306396
+
Steps: 2%|▏ | 15631/1000000 [6:42:36<2858:50:51, 10.46s/it, lr=1e-5, step_loss=0.0093]
Steps: 2%|▏ | 15632/1000000 [6:42:41<2457:22:33, 8.99s/it, lr=1e-5, step_loss=0.0093][RANK-0]: Step: [15632], local_loss=0.012594866566359997, train_loss=0.07982127368450165, time_cost=3.225529432296753
+
Steps: 2%|▏ | 15632/1000000 [6:42:41<2457:22:33, 8.99s/it, lr=1e-5, step_loss=0.0126]
Steps: 2%|▏ | 15633/1000000 [6:42:57<3009:03:59, 11.00s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [15633], local_loss=0.004593780729919672, train_loss=0.06480157375335693, time_cost=7.446274757385254
+
Steps: 2%|▏ | 15633/1000000 [6:42:57<3009:03:59, 11.00s/it, lr=1e-5, step_loss=0.00459]
Steps: 2%|▏ | 15634/1000000 [6:43:02<2543:56:47, 9.30s/it, lr=1e-5, step_loss=0.00459][RANK-0]: Step: [15634], local_loss=0.02785247564315796, train_loss=0.029182959347963333, time_cost=1.2156565189361572
+
Steps: 2%|▏ | 15634/1000000 [6:43:02<2543:56:47, 9.30s/it, lr=1e-5, step_loss=0.0279]
Steps: 2%|▏ | 15635/1000000 [6:43:11<2535:06:17, 9.27s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [15635], local_loss=0.2431022971868515, train_loss=0.05278995633125305, time_cost=7.731976747512817
+
Steps: 2%|▏ | 15635/1000000 [6:43:11<2535:06:17, 9.27s/it, lr=1e-5, step_loss=0.243]
Steps: 2%|▏ | 15636/1000000 [6:43:22<2661:18:05, 9.73s/it, lr=1e-5, step_loss=0.243][RANK-0]: Step: [15636], local_loss=0.05294563248753548, train_loss=0.08265966922044754, time_cost=4.089928150177002
+
Steps: 2%|▏ | 15636/1000000 [6:43:22<2661:18:05, 9.73s/it, lr=1e-5, step_loss=0.0529]
Steps: 2%|▏ | 15637/1000000 [6:43:28<2327:03:45, 8.51s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [15637], local_loss=0.041153084486722946, train_loss=0.11015605181455612, time_cost=2.1393566131591797
+
Steps: 2%|▏ | 15637/1000000 [6:43:28<2327:03:45, 8.51s/it, lr=1e-5, step_loss=0.0412]
Steps: 2%|▏ | 15638/1000000 [6:43:33<2050:42:23, 7.50s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [15638], local_loss=0.003391328966245055, train_loss=0.09963500499725342, time_cost=2.206839084625244
+
Steps: 2%|▏ | 15638/1000000 [6:43:33<2050:42:23, 7.50s/it, lr=1e-5, step_loss=0.00339]
Steps: 2%|▏ | 15639/1000000 [6:43:47<2569:37:16, 9.40s/it, lr=1e-5, step_loss=0.00339][RANK-0]: Step: [15639], local_loss=0.0053998492658138275, train_loss=0.08900469541549683, time_cost=4.348053932189941
+
Steps: 2%|▏ | 15639/1000000 [6:43:47<2569:37:16, 9.40s/it, lr=1e-5, step_loss=0.0054]
Steps: 2%|▏ | 15640/1000000 [6:44:00<2921:54:03, 10.69s/it, lr=1e-5, step_loss=0.0054][RANK-0]: Step: [15640], local_loss=0.1452532261610031, train_loss=0.03453781455755234, time_cost=3.499854564666748
+
Steps: 2%|▏ | 15640/1000000 [6:44:00<2921:54:03, 10.69s/it, lr=1e-5, step_loss=0.145]
Steps: 2%|▏ | 15641/1000000 [6:44:11<2889:45:52, 10.57s/it, lr=1e-5, step_loss=0.145][RANK-0]: Step: [15641], local_loss=0.010673725977540016, train_loss=12.246957778930664, time_cost=8.768412828445435
+
Steps: 2%|▏ | 15641/1000000 [6:44:11<2889:45:52, 10.57s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 15642/1000000 [6:44:15<2374:27:32, 8.68s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [15642], local_loss=0.01109648123383522, train_loss=0.026690615341067314, time_cost=1.4800159931182861
+
Steps: 2%|▏ | 15642/1000000 [6:44:15<2374:27:32, 8.68s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 15643/1000000 [6:44:26<2560:16:32, 9.36s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [15643], local_loss=0.03695359453558922, train_loss=0.028669143095612526, time_cost=3.3781299591064453
+
Steps: 2%|▏ | 15643/1000000 [6:44:26<2560:16:32, 9.36s/it, lr=1e-5, step_loss=0.037]
Steps: 2%|▏ | 15644/1000000 [6:44:36<2597:50:00, 9.50s/it, lr=1e-5, step_loss=0.037][RANK-0]: Step: [15644], local_loss=0.02857237681746483, train_loss=0.029308756813406944, time_cost=3.6599793434143066
+
Steps: 2%|▏ | 15644/1000000 [6:44:36<2597:50:00, 9.50s/it, lr=1e-5, step_loss=0.0286]
Steps: 2%|▏ | 15645/1000000 [6:44:40<2190:48:25, 8.01s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [15645], local_loss=0.04233512654900551, train_loss=0.039807409048080444, time_cost=1.6805684566497803
+
Steps: 2%|▏ | 15645/1000000 [6:44:40<2190:48:25, 8.01s/it, lr=1e-5, step_loss=0.0423]
Steps: 2%|▏ | 15646/1000000 [6:44:50<2308:56:51, 8.44s/it, lr=1e-5, step_loss=0.0423][RANK-0]: Step: [15646], local_loss=0.006436832249164581, train_loss=0.0409255251288414, time_cost=4.128899574279785
+
Steps: 2%|▏ | 15646/1000000 [6:44:50<2308:56:51, 8.44s/it, lr=1e-5, step_loss=0.00644]
Steps: 2%|▏ | 15647/1000000 [6:44:56<2095:03:27, 7.66s/it, lr=1e-5, step_loss=0.00644][RANK-0]: Step: [15647], local_loss=0.0051559098064899445, train_loss=0.02619454264640808, time_cost=1.5304973125457764
+
Steps: 2%|▏ | 15647/1000000 [6:44:56<2095:03:27, 7.66s/it, lr=1e-5, step_loss=0.00516]
Steps: 2%|▏ | 15648/1000000 [6:45:01<1918:31:41, 7.02s/it, lr=1e-5, step_loss=0.00516][RANK-0]: Step: [15648], local_loss=0.03153437376022339, train_loss=0.024807538837194443, time_cost=3.095193862915039
+
Steps: 2%|▏ | 15648/1000000 [6:45:01<1918:31:41, 7.02s/it, lr=1e-5, step_loss=0.0315]
Steps: 2%|▏ | 15649/1000000 [6:45:10<2049:43:13, 7.50s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [15649], local_loss=0.004473587032407522, train_loss=0.023616867139935493, time_cost=3.638371706008911
+
Steps: 2%|▏ | 15649/1000000 [6:45:10<2049:43:13, 7.50s/it, lr=1e-5, step_loss=0.00447]
Steps: 2%|▏ | 15650/1000000 [6:45:23<2543:09:31, 9.30s/it, lr=1e-5, step_loss=0.00447][RANK-0]: Step: [15650], local_loss=0.01129573117941618, train_loss=0.046626217663288116, time_cost=4.566904306411743
+
Steps: 2%|▏ | 15650/1000000 [6:45:23<2543:09:31, 9.30s/it, lr=1e-5, step_loss=0.0113]
Steps: 2%|▏ | 15651/1000000 [6:45:28<2141:17:49, 7.83s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [15651], local_loss=0.0388280525803566, train_loss=0.0247315876185894, time_cost=1.4241373538970947
+
Steps: 2%|▏ | 15651/1000000 [6:45:28<2141:17:49, 7.83s/it, lr=1e-5, step_loss=0.0388]
Steps: 2%|▏ | 15652/1000000 [6:45:35<2110:42:19, 7.72s/it, lr=1e-5, step_loss=0.0388][RANK-0]: Step: [15652], local_loss=0.017375390976667404, train_loss=0.01791718602180481, time_cost=2.0077614784240723
+
Steps: 2%|▏ | 15652/1000000 [6:45:35<2110:42:19, 7.72s/it, lr=1e-5, step_loss=0.0174]
Steps: 2%|▏ | 15653/1000000 [6:45:52<2863:51:12, 10.47s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [15653], local_loss=0.007844967767596245, train_loss=0.1528894454240799, time_cost=5.247318983078003
+
Steps: 2%|▏ | 15653/1000000 [6:45:52<2863:51:12, 10.47s/it, lr=1e-5, step_loss=0.00784]
Steps: 2%|▏ | 15654/1000000 [6:46:05<3072:31:07, 11.24s/it, lr=1e-5, step_loss=0.00784][RANK-0]: Step: [15654], local_loss=0.004917331971228123, train_loss=0.011577141471207142, time_cost=1.2446284294128418
+
Steps: 2%|▏ | 15654/1000000 [6:46:05<3072:31:07, 11.24s/it, lr=1e-5, step_loss=0.00492]
Steps: 2%|▏ | 15655/1000000 [6:46:11<2616:30:06, 9.57s/it, lr=1e-5, step_loss=0.00492][RANK-0]: Step: [15655], local_loss=0.04947928711771965, train_loss=0.021230418235063553, time_cost=1.4437148571014404
+
Steps: 2%|▏ | 15655/1000000 [6:46:11<2616:30:06, 9.57s/it, lr=1e-5, step_loss=0.0495]
Steps: 2%|▏ | 15656/1000000 [6:46:15<2216:36:54, 8.11s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [15656], local_loss=0.017131106927990913, train_loss=0.03238750994205475, time_cost=1.698225975036621
+
Steps: 2%|▏ | 15656/1000000 [6:46:15<2216:36:54, 8.11s/it, lr=1e-5, step_loss=0.0171]
Steps: 2%|▏ | 15657/1000000 [6:46:21<1975:00:09, 7.22s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [15657], local_loss=0.010992604307830334, train_loss=0.10995891690254211, time_cost=2.164079427719116
+
Steps: 2%|▏ | 15657/1000000 [6:46:21<1975:00:09, 7.22s/it, lr=1e-5, step_loss=0.011]
Steps: 2%|▏ | 15658/1000000 [6:46:30<2134:49:14, 7.81s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [15658], local_loss=0.10722176730632782, train_loss=28.07648277282715, time_cost=6.981903076171875
+
Steps: 2%|▏ | 15658/1000000 [6:46:30<2134:49:14, 7.81s/it, lr=1e-5, step_loss=0.107]
Steps: 2%|▏ | 15659/1000000 [6:46:36<2037:14:35, 7.45s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [15659], local_loss=0.04421154782176018, train_loss=0.02533571794629097, time_cost=1.2109529972076416
+
Steps: 2%|▏ | 15659/1000000 [6:46:36<2037:14:35, 7.45s/it, lr=1e-5, step_loss=0.0442]
Steps: 2%|▏ | 15660/1000000 [6:46:44<2010:51:42, 7.35s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [15660], local_loss=0.3914719521999359, train_loss=0.06851722300052643, time_cost=2.7020435333251953
+
Steps: 2%|▏ | 15660/1000000 [6:46:44<2010:51:42, 7.35s/it, lr=1e-5, step_loss=0.391]
Steps: 2%|▏ | 15661/1000000 [6:46:54<2269:30:44, 8.30s/it, lr=1e-5, step_loss=0.391][RANK-0]: Step: [15661], local_loss=0.013248682953417301, train_loss=0.09105271100997925, time_cost=1.7059574127197266
+
Steps: 2%|▏ | 15661/1000000 [6:46:54<2269:30:44, 8.30s/it, lr=1e-5, step_loss=0.0132]
Steps: 2%|▏ | 15662/1000000 [6:46:58<1951:51:06, 7.14s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [15662], local_loss=0.029163561761379242, train_loss=0.02089376375079155, time_cost=1.5275278091430664
+
Steps: 2%|▏ | 15662/1000000 [6:46:58<1951:51:06, 7.14s/it, lr=1e-5, step_loss=0.0292]
Steps: 2%|▏ | 15663/1000000 [6:47:03<1730:51:46, 6.33s/it, lr=1e-5, step_loss=0.0292][RANK-0]: Step: [15663], local_loss=0.01113869808614254, train_loss=0.024256939068436623, time_cost=1.9039793014526367
+
Steps: 2%|▏ | 15663/1000000 [6:47:03<1730:51:46, 6.33s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 15664/1000000 [6:47:08<1616:01:56, 5.91s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [15664], local_loss=0.01210557296872139, train_loss=0.15684343874454498, time_cost=1.8297226428985596
+
Steps: 2%|▏ | 15664/1000000 [6:47:08<1616:01:56, 5.91s/it, lr=1e-5, step_loss=0.0121]
Steps: 2%|▏ | 15665/1000000 [6:47:19<2025:36:27, 7.41s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [15665], local_loss=0.026903629302978516, train_loss=0.02587783895432949, time_cost=2.1807262897491455
+
Steps: 2%|▏ | 15665/1000000 [6:47:19<2025:36:27, 7.41s/it, lr=1e-5, step_loss=0.0269]
Steps: 2%|▏ | 15666/1000000 [6:47:26<1987:40:44, 7.27s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [15666], local_loss=0.012737782672047615, train_loss=0.024553272873163223, time_cost=1.2453081607818604
+
Steps: 2%|▏ | 15666/1000000 [6:47:26<1987:40:44, 7.27s/it, lr=1e-5, step_loss=0.0127]
Steps: 2%|▏ | 15667/1000000 [6:47:35<2167:48:36, 7.93s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [15667], local_loss=0.034604959189891815, train_loss=0.03378631919622421, time_cost=3.2527105808258057
+
Steps: 2%|▏ | 15667/1000000 [6:47:35<2167:48:36, 7.93s/it, lr=1e-5, step_loss=0.0346]
Steps: 2%|▏ | 15668/1000000 [6:47:48<2592:24:44, 9.48s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [15668], local_loss=0.10967100411653519, train_loss=0.037471070885658264, time_cost=4.304527282714844
+
Steps: 2%|▏ | 15668/1000000 [6:47:48<2592:24:44, 9.48s/it, lr=1e-5, step_loss=0.11]
Steps: 2%|▏ | 15669/1000000 [6:47:58<2582:17:45, 9.44s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [15669], local_loss=0.013546235859394073, train_loss=0.042166709899902344, time_cost=3.201023578643799
+
Steps: 2%|▏ | 15669/1000000 [6:47:58<2582:17:45, 9.44s/it, lr=1e-5, step_loss=0.0135]
Steps: 2%|▏ | 15670/1000000 [6:48:12<2977:41:05, 10.89s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [15670], local_loss=82.70709991455078, train_loss=13.234606742858887, time_cost=5.630198955535889
+
Steps: 2%|▏ | 15670/1000000 [6:48:12<2977:41:05, 10.89s/it, lr=1e-5, step_loss=82.7]
Steps: 2%|▏ | 15671/1000000 [6:48:26<3222:19:24, 11.79s/it, lr=1e-5, step_loss=82.7][RANK-0]: Step: [15671], local_loss=0.005358794238418341, train_loss=0.026184197515249252, time_cost=4.621550798416138
+
Steps: 2%|▏ | 15671/1000000 [6:48:26<3222:19:24, 11.79s/it, lr=1e-5, step_loss=0.00536]
Steps: 2%|▏ | 15672/1000000 [6:48:32<2746:49:53, 10.05s/it, lr=1e-5, step_loss=0.00536][RANK-0]: Step: [15672], local_loss=0.007389282342046499, train_loss=0.01839141547679901, time_cost=1.4189982414245605
+
Steps: 2%|▏ | 15672/1000000 [6:48:32<2746:49:53, 10.05s/it, lr=1e-5, step_loss=0.00739]
Steps: 2%|▏ | 15673/1000000 [6:48:37<2325:27:55, 8.50s/it, lr=1e-5, step_loss=0.00739][RANK-0]: Step: [15673], local_loss=0.03565964102745056, train_loss=0.04165733605623245, time_cost=2.444023847579956
+
Steps: 2%|▏ | 15673/1000000 [6:48:37<2325:27:55, 8.50s/it, lr=1e-5, step_loss=0.0357]
Steps: 2%|▏ | 15674/1000000 [6:48:48<2519:21:41, 9.21s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [15674], local_loss=0.10628402233123779, train_loss=0.2908444106578827, time_cost=3.229315757751465
+
Steps: 2%|▏ | 15674/1000000 [6:48:48<2519:21:41, 9.21s/it, lr=1e-5, step_loss=0.106]
Steps: 2%|▏ | 15675/1000000 [6:49:02<2985:24:33, 10.92s/it, lr=1e-5, step_loss=0.106][RANK-0]: Step: [15675], local_loss=0.01575605571269989, train_loss=0.015340365469455719, time_cost=3.4201407432556152
+
Steps: 2%|▏ | 15675/1000000 [6:49:02<2985:24:33, 10.92s/it, lr=1e-5, step_loss=0.0158]
Steps: 2%|▏ | 15676/1000000 [6:49:08<2544:35:20, 9.31s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [15676], local_loss=0.004734508693218231, train_loss=0.0830637738108635, time_cost=1.2893083095550537
+
Steps: 2%|▏ | 15676/1000000 [6:49:08<2544:35:20, 9.31s/it, lr=1e-5, step_loss=0.00473]
Steps: 2%|▏ | 15677/1000000 [6:49:14<2243:23:28, 8.20s/it, lr=1e-5, step_loss=0.00473][RANK-0]: Step: [15677], local_loss=0.01014783326536417, train_loss=0.039840154349803925, time_cost=2.7911322116851807
+
Steps: 2%|▏ | 15677/1000000 [6:49:14<2243:23:28, 8.20s/it, lr=1e-5, step_loss=0.0101]
Steps: 2%|▏ | 15678/1000000 [6:49:23<2361:51:10, 8.64s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [15678], local_loss=0.014440097846090794, train_loss=0.04972486570477486, time_cost=2.729874849319458
+
Steps: 2%|▏ | 15678/1000000 [6:49:23<2361:51:10, 8.64s/it, lr=1e-5, step_loss=0.0144]
Steps: 2%|▏ | 15679/1000000 [6:49:32<2383:42:24, 8.72s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [15679], local_loss=0.052973419427871704, train_loss=0.18825995922088623, time_cost=1.7050631046295166
+
Steps: 2%|▏ | 15679/1000000 [6:49:32<2383:42:24, 8.72s/it, lr=1e-5, step_loss=0.053]
Steps: 2%|▏ | 15680/1000000 [6:49:45<2712:32:32, 9.92s/it, lr=1e-5, step_loss=0.053][RANK-0]: Step: [15680], local_loss=0.010850232094526291, train_loss=0.04376371577382088, time_cost=9.971324682235718
+
Steps: 2%|▏ | 15680/1000000 [6:49:45<2712:32:32, 9.92s/it, lr=1e-5, step_loss=0.0109]
Steps: 2%|▏ | 15681/1000000 [6:49:58<2955:25:41, 10.81s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [15681], local_loss=0.06130107119679451, train_loss=0.02167489379644394, time_cost=4.899867057800293
+
Steps: 2%|▏ | 15681/1000000 [6:49:58<2955:25:41, 10.81s/it, lr=1e-5, step_loss=0.0613]
Steps: 2%|▏ | 15682/1000000 [6:50:05<2659:45:46, 9.73s/it, lr=1e-5, step_loss=0.0613][RANK-0]: Step: [15682], local_loss=0.02896573767066002, train_loss=0.1510169506072998, time_cost=2.677920341491699
+
Steps: 2%|▏ | 15682/1000000 [6:50:05<2659:45:46, 9.73s/it, lr=1e-5, step_loss=0.029]
Steps: 2%|▏ | 15683/1000000 [6:50:17<2840:22:50, 10.39s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [15683], local_loss=0.007558157201856375, train_loss=0.018543004989624023, time_cost=4.138721942901611
+
Steps: 2%|▏ | 15683/1000000 [6:50:17<2840:22:50, 10.39s/it, lr=1e-5, step_loss=0.00756]
Steps: 2%|▏ | 15684/1000000 [6:50:26<2753:09:48, 10.07s/it, lr=1e-5, step_loss=0.00756][RANK-0]: Step: [15684], local_loss=0.007002933882176876, train_loss=0.01853599213063717, time_cost=4.2924089431762695
+
Steps: 2%|▏ | 15684/1000000 [6:50:26<2753:09:48, 10.07s/it, lr=1e-5, step_loss=0.007]
Steps: 2%|▏ | 15685/1000000 [6:50:36<2769:35:34, 10.13s/it, lr=1e-5, step_loss=0.007][RANK-0]: Step: [15685], local_loss=0.15585005283355713, train_loss=0.07891920953989029, time_cost=2.7643134593963623
+
Steps: 2%|▏ | 15685/1000000 [6:50:36<2769:35:34, 10.13s/it, lr=1e-5, step_loss=0.156]
Steps: 2%|▏ | 15686/1000000 [6:50:48<2851:09:10, 10.43s/it, lr=1e-5, step_loss=0.156][RANK-0]: Step: [15686], local_loss=0.011107044294476509, train_loss=0.02022150531411171, time_cost=1.9204509258270264
+
Steps: 2%|▏ | 15686/1000000 [6:50:48<2851:09:10, 10.43s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 15687/1000000 [6:51:01<3067:48:56, 11.22s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [15687], local_loss=0.012823470868170261, train_loss=0.06636711210012436, time_cost=5.181997060775757
+
Steps: 2%|▏ | 15687/1000000 [6:51:01<3067:48:56, 11.22s/it, lr=1e-5, step_loss=0.0128]
Steps: 2%|▏ | 15688/1000000 [6:51:10<2922:35:41, 10.69s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [15688], local_loss=0.0090238768607378, train_loss=0.04545795917510986, time_cost=1.9274466037750244
+
Steps: 2%|▏ | 15688/1000000 [6:51:10<2922:35:41, 10.69s/it, lr=1e-5, step_loss=0.00902]
Steps: 2%|▏ | 15689/1000000 [6:51:15<2462:24:24, 9.01s/it, lr=1e-5, step_loss=0.00902][RANK-0]: Step: [15689], local_loss=0.007860848680138588, train_loss=0.03290325775742531, time_cost=1.3623929023742676
+
Steps: 2%|▏ | 15689/1000000 [6:51:15<2462:24:24, 9.01s/it, lr=1e-5, step_loss=0.00786]
Steps: 2%|▏ | 15690/1000000 [6:51:20<2147:45:44, 7.86s/it, lr=1e-5, step_loss=0.00786][RANK-0]: Step: [15690], local_loss=0.011193841695785522, train_loss=0.06488724052906036, time_cost=2.695618152618408
+
Steps: 2%|▏ | 15690/1000000 [6:51:20<2147:45:44, 7.86s/it, lr=1e-5, step_loss=0.0112]
Steps: 2%|▏ | 15691/1000000 [6:51:32<2420:41:53, 8.85s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [15691], local_loss=0.06556244194507599, train_loss=0.16364720463752747, time_cost=2.328796863555908
+
Steps: 2%|▏ | 15691/1000000 [6:51:32<2420:41:53, 8.85s/it, lr=1e-5, step_loss=0.0656]
Steps: 2%|▏ | 15692/1000000 [6:51:50<3220:14:13, 11.78s/it, lr=1e-5, step_loss=0.0656][RANK-0]: Step: [15692], local_loss=0.047453537583351135, train_loss=0.05410287529230118, time_cost=10.814191579818726
+
Steps: 2%|▏ | 15692/1000000 [6:51:50<3220:14:13, 11.78s/it, lr=1e-5, step_loss=0.0475]
Steps: 2%|▏ | 15693/1000000 [6:52:01<3156:11:04, 11.54s/it, lr=1e-5, step_loss=0.0475][RANK-0]: Step: [15693], local_loss=0.012254164554178715, train_loss=0.09084086865186691, time_cost=4.700179100036621
+
Steps: 2%|▏ | 15693/1000000 [6:52:01<3156:11:04, 11.54s/it, lr=1e-5, step_loss=0.0123]
Steps: 2%|▏ | 15694/1000000 [6:52:11<3055:20:44, 11.17s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [15694], local_loss=0.007474861573427916, train_loss=0.051504384726285934, time_cost=5.45550799369812
+
Steps: 2%|▏ | 15694/1000000 [6:52:11<3055:20:44, 11.17s/it, lr=1e-5, step_loss=0.00747]
Steps: 2%|▏ | 15695/1000000 [6:52:16<2507:57:33, 9.17s/it, lr=1e-5, step_loss=0.00747][RANK-0]: Step: [15695], local_loss=0.02174307033419609, train_loss=0.14429692924022675, time_cost=1.894085168838501
+
Steps: 2%|▏ | 15695/1000000 [6:52:16<2507:57:33, 9.17s/it, lr=1e-5, step_loss=0.0217]
Steps: 2%|▏ | 15696/1000000 [6:52:21<2169:58:54, 7.94s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [15696], local_loss=0.016203714534640312, train_loss=0.03468149155378342, time_cost=2.172419309616089
+
Steps: 2%|▏ | 15696/1000000 [6:52:21<2169:58:54, 7.94s/it, lr=1e-5, step_loss=0.0162]
Steps: 2%|▏ | 15697/1000000 [6:52:27<1976:37:00, 7.23s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [15697], local_loss=0.007435902953147888, train_loss=0.02175062894821167, time_cost=1.2151985168457031
+
Steps: 2%|▏ | 15697/1000000 [6:52:27<1976:37:00, 7.23s/it, lr=1e-5, step_loss=0.00744]
Steps: 2%|▏ | 15698/1000000 [6:52:36<2143:26:32, 7.84s/it, lr=1e-5, step_loss=0.00744][RANK-0]: Step: [15698], local_loss=0.07705652713775635, train_loss=0.06302888691425323, time_cost=3.0954196453094482
+
Steps: 2%|▏ | 15698/1000000 [6:52:36<2143:26:32, 7.84s/it, lr=1e-5, step_loss=0.0771]
Steps: 2%|▏ | 15699/1000000 [6:52:47<2424:26:11, 8.87s/it, lr=1e-5, step_loss=0.0771][RANK-0]: Step: [15699], local_loss=0.06327300518751144, train_loss=0.030798643827438354, time_cost=1.2228295803070068
+
Steps: 2%|▏ | 15699/1000000 [6:52:47<2424:26:11, 8.87s/it, lr=1e-5, step_loss=0.0633]
Steps: 2%|▏ | 15700/1000000 [6:52:52<2100:44:13, 7.68s/it, lr=1e-5, step_loss=0.0633][RANK-0]: Step: [15700], local_loss=1.0258028507232666, train_loss=0.13847346603870392, time_cost=1.9627008438110352
+
Steps: 2%|▏ | 15700/1000000 [6:52:52<2100:44:13, 7.68s/it, lr=1e-5, step_loss=1.03]
Steps: 2%|▏ | 15701/1000000 [6:53:01<2244:06:49, 8.21s/it, lr=1e-5, step_loss=1.03][RANK-0]: Step: [15701], local_loss=0.05450994893908501, train_loss=0.08794549852609634, time_cost=3.2088849544525146
+
Steps: 2%|▏ | 15701/1000000 [6:53:01<2244:06:49, 8.21s/it, lr=1e-5, step_loss=0.0545]
Steps: 2%|▏ | 15702/1000000 [6:53:07<1985:20:23, 7.26s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [15702], local_loss=0.01972414366900921, train_loss=0.07298992574214935, time_cost=2.0556437969207764
+
Steps: 2%|▏ | 15702/1000000 [6:53:07<1985:20:23, 7.26s/it, lr=1e-5, step_loss=0.0197]
Steps: 2%|▏ | 15703/1000000 [6:53:21<2562:36:46, 9.37s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [15703], local_loss=0.010767186060547829, train_loss=0.04856887087225914, time_cost=11.259036302566528
+
Steps: 2%|▏ | 15703/1000000 [6:53:21<2562:36:46, 9.37s/it, lr=1e-5, step_loss=0.0108]
Steps: 2%|▏ | 15704/1000000 [6:53:28<2342:04:21, 8.57s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [15704], local_loss=0.08715023100376129, train_loss=0.06400575488805771, time_cost=2.2142810821533203
+
Steps: 2%|▏ | 15704/1000000 [6:53:28<2342:04:21, 8.57s/it, lr=1e-5, step_loss=0.0872]
Steps: 2%|▏ | 15705/1000000 [6:53:42<2816:18:53, 10.30s/it, lr=1e-5, step_loss=0.0872][RANK-0]: Step: [15705], local_loss=0.030687402933835983, train_loss=0.08241702616214752, time_cost=5.060529947280884
+
Steps: 2%|▏ | 15705/1000000 [6:53:42<2816:18:53, 10.30s/it, lr=1e-5, step_loss=0.0307]
Steps: 2%|▏ | 15706/1000000 [6:53:46<2343:52:53, 8.57s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [15706], local_loss=0.03521338105201721, train_loss=0.03016427904367447, time_cost=1.6086819171905518
+
Steps: 2%|▏ | 15706/1000000 [6:53:46<2343:52:53, 8.57s/it, lr=1e-5, step_loss=0.0352]
Steps: 2%|▏ | 15707/1000000 [6:53:52<2066:38:23, 7.56s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [15707], local_loss=0.014639200642704964, train_loss=0.1024550050497055, time_cost=2.4573185443878174
+
Steps: 2%|▏ | 15707/1000000 [6:53:52<2066:38:23, 7.56s/it, lr=1e-5, step_loss=0.0146]
Steps: 2%|▏ | 15708/1000000 [6:53:57<1868:13:37, 6.83s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [15708], local_loss=0.030680237337946892, train_loss=0.22969168424606323, time_cost=1.3139123916625977
+
Steps: 2%|▏ | 15708/1000000 [6:53:57<1868:13:37, 6.83s/it, lr=1e-5, step_loss=0.0307]
Steps: 2%|▏ | 15709/1000000 [6:54:02<1730:41:45, 6.33s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [15709], local_loss=0.00634386483579874, train_loss=0.06818708777427673, time_cost=1.4086649417877197
+
Steps: 2%|▏ | 15709/1000000 [6:54:02<1730:41:45, 6.33s/it, lr=1e-5, step_loss=0.00634]
Steps: 2%|▏ | 15710/1000000 [6:54:11<1928:15:46, 7.05s/it, lr=1e-5, step_loss=0.00634][RANK-0]: Step: [15710], local_loss=0.00634149182587862, train_loss=0.03545437008142471, time_cost=3.2645466327667236
+
Steps: 2%|▏ | 15710/1000000 [6:54:11<1928:15:46, 7.05s/it, lr=1e-5, step_loss=0.00634]
Steps: 2%|▏ | 15711/1000000 [6:54:18<1916:06:23, 7.01s/it, lr=1e-5, step_loss=0.00634][RANK-0]: Step: [15711], local_loss=0.01387537270784378, train_loss=13.638968467712402, time_cost=1.2278661727905273
+
Steps: 2%|▏ | 15711/1000000 [6:54:18<1916:06:23, 7.01s/it, lr=1e-5, step_loss=0.0139]
Steps: 2%|▏ | 15712/1000000 [6:54:23<1798:50:19, 6.58s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [15712], local_loss=0.008037932217121124, train_loss=0.01539603527635336, time_cost=2.9276223182678223
+
Steps: 2%|▏ | 15712/1000000 [6:54:23<1798:50:19, 6.58s/it, lr=1e-5, step_loss=0.00804]
Steps: 2%|▏ | 15713/1000000 [6:54:36<2341:52:29, 8.57s/it, lr=1e-5, step_loss=0.00804][RANK-0]: Step: [15713], local_loss=0.014324210584163666, train_loss=0.09011167287826538, time_cost=3.638016700744629
+
Steps: 2%|▏ | 15713/1000000 [6:54:36<2341:52:29, 8.57s/it, lr=1e-5, step_loss=0.0143]
Steps: 2%|▏ | 15714/1000000 [6:54:42<2072:01:09, 7.58s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [15714], local_loss=0.025358349084854126, train_loss=0.017063923180103302, time_cost=2.3902029991149902
+
Steps: 2%|▏ | 15714/1000000 [6:54:42<2072:01:09, 7.58s/it, lr=1e-5, step_loss=0.0254]
Steps: 2%|▏ | 15715/1000000 [6:54:54<2432:00:35, 8.90s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [15715], local_loss=0.43924447894096375, train_loss=0.08680737018585205, time_cost=4.849127531051636
+
Steps: 2%|▏ | 15715/1000000 [6:54:54<2432:00:35, 8.90s/it, lr=1e-5, step_loss=0.439]
Steps: 2%|▏ | 15716/1000000 [6:55:01<2295:58:45, 8.40s/it, lr=1e-5, step_loss=0.439][RANK-0]: Step: [15716], local_loss=0.03381789103150368, train_loss=0.018719911575317383, time_cost=2.768688678741455
+
Steps: 2%|▏ | 15716/1000000 [6:55:01<2295:58:45, 8.40s/it, lr=1e-5, step_loss=0.0338]
Steps: 2%|▏ | 15717/1000000 [6:55:09<2269:47:16, 8.30s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [15717], local_loss=0.14007069170475006, train_loss=0.0440981462597847, time_cost=4.080791234970093
+
Steps: 2%|▏ | 15717/1000000 [6:55:09<2269:47:16, 8.30s/it, lr=1e-5, step_loss=0.14]
Steps: 2%|▏ | 15718/1000000 [6:55:22<2667:17:45, 9.76s/it, lr=1e-5, step_loss=0.14][RANK-0]: Step: [15718], local_loss=0.0471799299120903, train_loss=0.07355082035064697, time_cost=3.776944160461426
+
Steps: 2%|▏ | 15718/1000000 [6:55:22<2667:17:45, 9.76s/it, lr=1e-5, step_loss=0.0472]
Steps: 2%|▏ | 15719/1000000 [6:55:29<2428:59:31, 8.88s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [15719], local_loss=0.011477821506559849, train_loss=0.07100548595190048, time_cost=2.0589616298675537
+
Steps: 2%|▏ | 15719/1000000 [6:55:29<2428:59:31, 8.88s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 15720/1000000 [6:55:38<2445:15:34, 8.94s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [15720], local_loss=0.02107793465256691, train_loss=0.03160654753446579, time_cost=1.6466567516326904
+
Steps: 2%|▏ | 15720/1000000 [6:55:38<2445:15:34, 8.94s/it, lr=1e-5, step_loss=0.0211]
Steps: 2%|▏ | 15721/1000000 [6:55:51<2759:51:19, 10.09s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [15721], local_loss=0.008236533030867577, train_loss=0.026220154017210007, time_cost=10.599499464035034
+
Steps: 2%|▏ | 15721/1000000 [6:55:51<2759:51:19, 10.09s/it, lr=1e-5, step_loss=0.00824]
Steps: 2%|▏ | 15722/1000000 [6:56:07<3267:48:00, 11.95s/it, lr=1e-5, step_loss=0.00824][RANK-0]: Step: [15722], local_loss=0.037177518010139465, train_loss=0.02953370288014412, time_cost=8.876593589782715
+
Steps: 2%|▏ | 15722/1000000 [6:56:07<3267:48:00, 11.95s/it, lr=1e-5, step_loss=0.0372]
Steps: 2%|▏ | 15723/1000000 [6:56:15<2910:23:08, 10.64s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [15723], local_loss=0.0338037870824337, train_loss=0.03525605797767639, time_cost=2.3045029640197754
+
Steps: 2%|▏ | 15723/1000000 [6:56:15<2910:23:08, 10.64s/it, lr=1e-5, step_loss=0.0338]
Steps: 2%|▏ | 15724/1000000 [6:56:24<2797:21:40, 10.23s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [15724], local_loss=0.05584239214658737, train_loss=0.023046167567372322, time_cost=1.6919736862182617
+
Steps: 2%|▏ | 15724/1000000 [6:56:24<2797:21:40, 10.23s/it, lr=1e-5, step_loss=0.0558]
Steps: 2%|▏ | 15725/1000000 [6:56:30<2439:32:22, 8.92s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [15725], local_loss=0.01747480407357216, train_loss=0.04957225173711777, time_cost=3.454629898071289
+
Steps: 2%|▏ | 15725/1000000 [6:56:30<2439:32:22, 8.92s/it, lr=1e-5, step_loss=0.0175]
Steps: 2%|▏ | 15726/1000000 [6:56:46<3026:13:53, 11.07s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [15726], local_loss=0.015762312337756157, train_loss=0.08306144922971725, time_cost=6.157517433166504
+
Steps: 2%|▏ | 15726/1000000 [6:56:46<3026:13:53, 11.07s/it, lr=1e-5, step_loss=0.0158]
Steps: 2%|▏ | 15727/1000000 [6:56:56<2985:56:23, 10.92s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [15727], local_loss=0.011093721725046635, train_loss=0.020455043762922287, time_cost=2.305406093597412
+
Steps: 2%|▏ | 15727/1000000 [6:56:56<2985:56:23, 10.92s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 15728/1000000 [6:57:03<2657:30:59, 9.72s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [15728], local_loss=0.013761746697127819, train_loss=0.030057109892368317, time_cost=3.1452252864837646
+
Steps: 2%|▏ | 15728/1000000 [6:57:03<2657:30:59, 9.72s/it, lr=1e-5, step_loss=0.0138]
Steps: 2%|▏ | 15729/1000000 [6:57:08<2205:09:19, 8.07s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [15729], local_loss=0.03633928671479225, train_loss=0.02454414963722229, time_cost=1.29557466506958
+
Steps: 2%|▏ | 15729/1000000 [6:57:08<2205:09:19, 8.07s/it, lr=1e-5, step_loss=0.0363]
Steps: 2%|▏ | 15730/1000000 [6:57:13<2014:19:09, 7.37s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [15730], local_loss=0.01289583370089531, train_loss=0.02969961240887642, time_cost=1.2995376586914062
+
Steps: 2%|▏ | 15730/1000000 [6:57:13<2014:19:09, 7.37s/it, lr=1e-5, step_loss=0.0129]
Steps: 2%|▏ | 15731/1000000 [6:57:21<2051:19:05, 7.50s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [15731], local_loss=0.02066813036799431, train_loss=0.0497383214533329, time_cost=3.108592987060547
+
Steps: 2%|▏ | 15731/1000000 [6:57:21<2051:19:05, 7.50s/it, lr=1e-5, step_loss=0.0207]
Steps: 2%|▏ | 15732/1000000 [6:57:33<2453:59:24, 8.98s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [15732], local_loss=0.07960717380046844, train_loss=0.02995670959353447, time_cost=5.187133312225342
+
Steps: 2%|▏ | 15732/1000000 [6:57:33<2453:59:24, 8.98s/it, lr=1e-5, step_loss=0.0796]
Steps: 2%|▏ | 15733/1000000 [6:57:43<2486:15:41, 9.09s/it, lr=1e-5, step_loss=0.0796][RANK-0]: Step: [15733], local_loss=0.019084366038441658, train_loss=0.028750190511345863, time_cost=2.9060826301574707
+
Steps: 2%|▏ | 15733/1000000 [6:57:43<2486:15:41, 9.09s/it, lr=1e-5, step_loss=0.0191]
Steps: 2%|▏ | 15734/1000000 [6:57:50<2345:22:37, 8.58s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [15734], local_loss=0.006724881939589977, train_loss=0.03899195417761803, time_cost=2.882761240005493
+
Steps: 2%|▏ | 15734/1000000 [6:57:50<2345:22:37, 8.58s/it, lr=1e-5, step_loss=0.00672]
Steps: 2%|▏ | 15735/1000000 [6:57:57<2231:50:13, 8.16s/it, lr=1e-5, step_loss=0.00672][RANK-0]: Step: [15735], local_loss=0.3825730085372925, train_loss=0.07764491438865662, time_cost=2.7793025970458984
+
Steps: 2%|▏ | 15735/1000000 [6:57:57<2231:50:13, 8.16s/it, lr=1e-5, step_loss=0.383]
Steps: 2%|▏ | 15736/1000000 [6:58:03<1979:17:17, 7.24s/it, lr=1e-5, step_loss=0.383][RANK-0]: Step: [15736], local_loss=0.014750905334949493, train_loss=0.01880079135298729, time_cost=1.208573341369629
+
Steps: 2%|▏ | 15736/1000000 [6:58:03<1979:17:17, 7.24s/it, lr=1e-5, step_loss=0.0148]
Steps: 2%|▏ | 15737/1000000 [6:58:14<2330:23:19, 8.52s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [15737], local_loss=0.028896262869238853, train_loss=0.06360095739364624, time_cost=4.172831296920776
+
Steps: 2%|▏ | 15737/1000000 [6:58:14<2330:23:19, 8.52s/it, lr=1e-5, step_loss=0.0289]
Steps: 2%|▏ | 15738/1000000 [6:58:23<2344:48:43, 8.58s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [15738], local_loss=0.024439655244350433, train_loss=0.01857253722846508, time_cost=1.271235466003418
+
Steps: 2%|▏ | 15738/1000000 [6:58:23<2344:48:43, 8.58s/it, lr=1e-5, step_loss=0.0244]
Steps: 2%|▏ | 15739/1000000 [6:58:32<2394:22:20, 8.76s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [15739], local_loss=0.021439028903841972, train_loss=0.03049900196492672, time_cost=7.819244384765625
+
Steps: 2%|▏ | 15739/1000000 [6:58:32<2394:22:20, 8.76s/it, lr=1e-5, step_loss=0.0214]
Steps: 2%|▏ | 15740/1000000 [6:58:39<2251:08:11, 8.23s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [15740], local_loss=0.011187387630343437, train_loss=0.05811869353055954, time_cost=2.6332004070281982
+
Steps: 2%|▏ | 15740/1000000 [6:58:39<2251:08:11, 8.23s/it, lr=1e-5, step_loss=0.0112]
Steps: 2%|▏ | 15741/1000000 [6:58:50<2460:10:04, 9.00s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [15741], local_loss=0.9929221868515015, train_loss=0.1600072681903839, time_cost=7.749018669128418
+
Steps: 2%|▏ | 15741/1000000 [6:58:50<2460:10:04, 9.00s/it, lr=1e-5, step_loss=0.993]
Steps: 2%|▏ | 15742/1000000 [6:58:54<2083:28:38, 7.62s/it, lr=1e-5, step_loss=0.993][RANK-0]: Step: [15742], local_loss=0.03407629579305649, train_loss=0.06206098571419716, time_cost=3.3460545539855957
+
Steps: 2%|▏ | 15742/1000000 [6:58:54<2083:28:38, 7.62s/it, lr=1e-5, step_loss=0.0341]
Steps: 2%|▏ | 15743/1000000 [6:59:12<2922:03:53, 10.69s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [15743], local_loss=0.13009586930274963, train_loss=0.08101804554462433, time_cost=7.709146976470947
+
Steps: 2%|▏ | 15743/1000000 [6:59:12<2922:03:53, 10.69s/it, lr=1e-5, step_loss=0.13]
Steps: 2%|▏ | 15744/1000000 [6:59:25<3153:02:18, 11.53s/it, lr=1e-5, step_loss=0.13][RANK-0]: Step: [15744], local_loss=0.13280855119228363, train_loss=0.04500020667910576, time_cost=4.574049472808838
Steps: 2%|▏ | 15745/1000000 [6:59:33<2847:07:17, 10.41s/it, lr=1e-5, step_loss=0.133][RANK-0]: Step: [15745], local_loss=0.013898284174501896, train_loss=0.05725493282079697, time_cost=3.005335807800293
Steps: 2%|▏ | 15746/1000000 [6:59:41<2616:04:14, 9.57s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [15746], local_loss=0.007236042991280556, train_loss=0.06764747947454453, time_cost=5.810682058334351
Steps: 2%|▏ | 15747/1000000 [6:59:46<2278:53:01, 8.34s/it, lr=1e-5, step_loss=0.00724][RANK-0]: Step: [15747], local_loss=0.011150444857776165, train_loss=0.03506769984960556, time_cost=1.4332339763641357
Steps: 2%|▏ | 15748/1000000 [6:59:51<2011:00:57, 7.36s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [15748], local_loss=0.08022190630435944, train_loss=0.0320909321308136, time_cost=2.117055654525757
Steps: 2%|▏ | 15749/1000000 [7:00:04<2460:12:30, 9.00s/it, lr=1e-5, step_loss=0.0802][RANK-0]: Step: [15749], local_loss=0.016459356993436813, train_loss=0.034453265368938446, time_cost=3.89339280128479
Steps: 2%|▏ | 15750/1000000 [7:00:22<3158:07:45, 11.55s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [15750], local_loss=0.11901124566793442, train_loss=0.051780201494693756, time_cost=9.898120403289795
Steps: 2%|▏ | 15751/1000000 [7:00:26<2597:31:09, 9.50s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [15751], local_loss=0.07597015798091888, train_loss=0.023210667073726654, time_cost=1.7751762866973877
Steps: 2%|▏ | 15752/1000000 [7:00:40<2939:12:01, 10.75s/it, lr=1e-5, step_loss=0.076][RANK-0]: Step: [15752], local_loss=0.032106198370456696, train_loss=0.10018117725849152, time_cost=4.415192365646362
Steps: 2%|▏ | 15753/1000000 [7:00:45<2458:35:38, 8.99s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [15753], local_loss=0.05004417151212692, train_loss=0.016274694353342056, time_cost=2.075550079345703
Steps: 2%|▏ | 15754/1000000 [7:00:50<2147:05:43, 7.85s/it, lr=1e-5, step_loss=0.05][RANK-0]: Step: [15754], local_loss=0.12264537066221237, train_loss=0.04141324758529663, time_cost=2.468519449234009
Steps: 2%|▏ | 15755/1000000 [7:00:57<2060:39:09, 7.54s/it, lr=1e-5, step_loss=0.123][RANK-0]: Step: [15755], local_loss=0.008150126785039902, train_loss=0.07214351743459702, time_cost=2.496633291244507
Steps: 2%|▏ | 15756/1000000 [7:01:07<2290:08:06, 8.38s/it, lr=1e-5, step_loss=0.00815][RANK-0]: Step: [15756], local_loss=0.005143036134541035, train_loss=0.022732317447662354, time_cost=1.2213876247406006
Steps: 2%|▏ | 15757/1000000 [7:01:18<2503:03:40, 9.16s/it, lr=1e-5, step_loss=0.00514][RANK-0]: Step: [15757], local_loss=0.059685494750738144, train_loss=0.02535611018538475, time_cost=1.9975297451019287
Steps: 2%|▏ | 15758/1000000 [7:01:23<2161:06:37, 7.90s/it, lr=1e-5, step_loss=0.0597][RANK-0]: Step: [15758], local_loss=0.027892688289284706, train_loss=0.06552006304264069, time_cost=1.2569031715393066
Steps: 2%|▏ | 15759/1000000 [7:01:34<2350:54:26, 8.60s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [15759], local_loss=0.004259743727743626, train_loss=0.02508559264242649, time_cost=2.0454914569854736
Steps: 2%|▏ | 15760/1000000 [7:01:40<2210:11:10, 8.08s/it, lr=1e-5, step_loss=0.00426][RANK-0]: Step: [15760], local_loss=0.01066141203045845, train_loss=0.013375634327530861, time_cost=3.3144373893737793
Steps: 2%|▏ | 15761/1000000 [7:01:47<2094:20:01, 7.66s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [15761], local_loss=0.060351766645908356, train_loss=0.023803669959306717, time_cost=4.9056901931762695
Steps: 2%|▏ | 15762/1000000 [7:01:59<2475:18:54, 9.05s/it, lr=1e-5, step_loss=0.0604][RANK-0]: Step: [15762], local_loss=0.06362701952457428, train_loss=0.04907859116792679, time_cost=1.2089190483093262
Steps: 2%|▏ | 15763/1000000 [7:02:11<2697:34:14, 9.87s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [15763], local_loss=0.009509282186627388, train_loss=0.02300887741148472, time_cost=4.647159099578857
Steps: 2%|▏ | 15764/1000000 [7:02:18<2464:13:40, 9.01s/it, lr=1e-5, step_loss=0.00951][RANK-0]: Step: [15764], local_loss=0.0105820894241333, train_loss=0.021420657634735107, time_cost=2.514322280883789
Steps: 2%|▏ | 15765/1000000 [7:02:26<2366:38:17, 8.66s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [15765], local_loss=0.03425880894064903, train_loss=0.03241150081157684, time_cost=5.67466139793396
Steps: 2%|▏ | 15766/1000000 [7:02:32<2122:46:47, 7.76s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [15766], local_loss=0.022537503391504288, train_loss=0.06548026204109192, time_cost=3.385348320007324
Steps: 2%|▏ | 15767/1000000 [7:02:37<1925:31:12, 7.04s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [15767], local_loss=0.005929108709096909, train_loss=0.046670928597450256, time_cost=3.1331241130828857
Steps: 2%|▏ | 15768/1000000 [7:02:43<1829:09:58, 6.69s/it, lr=1e-5, step_loss=0.00593][RANK-0]: Step: [15768], local_loss=0.007797024678438902, train_loss=0.02792961895465851, time_cost=4.847837209701538
Steps: 2%|▏ | 15769/1000000 [7:02:52<2023:56:37, 7.40s/it, lr=1e-5, step_loss=0.0078][RANK-0]: Step: [15769], local_loss=0.01597461849451065, train_loss=0.10587235540151596, time_cost=3.7067344188690186
Steps: 2%|▏ | 15770/1000000 [7:03:03<2334:45:46, 8.54s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [15770], local_loss=0.01291509810835123, train_loss=0.1562386006116867, time_cost=1.2213687896728516
Steps: 2%|▏ | 15771/1000000 [7:03:11<2306:42:23, 8.44s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [15771], local_loss=0.041119448840618134, train_loss=0.06882257759571075, time_cost=3.62522029876709
Steps: 2%|▏ | 15772/1000000 [7:03:16<1993:48:17, 7.29s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [15772], local_loss=0.010277077555656433, train_loss=0.06979506462812424, time_cost=1.8597893714904785
Steps: 2%|▏ | 15773/1000000 [7:03:21<1797:17:18, 6.57s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [15773], local_loss=0.03152576461434364, train_loss=0.04530121386051178, time_cost=2.2220571041107178
Steps: 2%|▏ | 15774/1000000 [7:03:28<1824:02:44, 6.67s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [15774], local_loss=0.006201776210218668, train_loss=0.04324693977832794, time_cost=1.2921946048736572
Steps: 2%|▏ | 15775/1000000 [7:03:35<1842:46:01, 6.74s/it, lr=1e-5, step_loss=0.0062][RANK-0]: Step: [15775], local_loss=0.09107362478971481, train_loss=0.038294173777103424, time_cost=2.6224875450134277
Steps: 2%|▏ | 15776/1000000 [7:03:46<2241:39:37, 8.20s/it, lr=1e-5, step_loss=0.0911][RANK-0]: Step: [15776], local_loss=0.011269418522715569, train_loss=0.024624187499284744, time_cost=3.304342746734619
Steps: 2%|▏ | 15777/1000000 [7:03:53<2131:11:01, 7.80s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [15777], local_loss=0.05598404258489609, train_loss=0.020100608468055725, time_cost=2.7970809936523438
Steps: 2%|▏ | 15778/1000000 [7:03:58<1898:22:32, 6.94s/it, lr=1e-5, step_loss=0.056][RANK-0]: Step: [15778], local_loss=0.06780195236206055, train_loss=0.15596505999565125, time_cost=2.1284029483795166
Steps: 2%|▏ | 15779/1000000 [7:04:07<2034:51:10, 7.44s/it, lr=1e-5, step_loss=0.0678][RANK-0]: Step: [15779], local_loss=0.06573425233364105, train_loss=0.060274407267570496, time_cost=1.5576860904693604
Steps: 2%|▏ | 15780/1000000 [7:04:14<2059:25:30, 7.53s/it, lr=1e-5, step_loss=0.0657][RANK-0]: Step: [15780], local_loss=0.010566920042037964, train_loss=0.024213220924139023, time_cost=6.043729543685913
Steps: 2%|▏ | 15781/1000000 [7:04:22<2072:24:42, 7.58s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [15781], local_loss=0.013171345926821232, train_loss=0.04367799311876297, time_cost=4.254674196243286
Steps: 2%|▏ | 15782/1000000 [7:04:31<2182:42:13, 7.98s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [15782], local_loss=0.04846709221601486, train_loss=0.031820960342884064, time_cost=2.782135486602783
Steps: 2%|▏ | 15783/1000000 [7:04:40<2267:22:07, 8.29s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [15783], local_loss=0.009441536851227283, train_loss=0.025662817060947418, time_cost=2.5709943771362305
Steps: 2%|▏ | 15784/1000000 [7:04:51<2509:58:41, 9.18s/it, lr=1e-5, step_loss=0.00944][RANK-0]: Step: [15784], local_loss=0.004709979053586721, train_loss=0.03367915749549866, time_cost=4.0348961353302
Steps: 2%|▏ | 15785/1000000 [7:05:07<3011:00:33, 11.01s/it, lr=1e-5, step_loss=0.00471][RANK-0]: Step: [15785], local_loss=0.09548520296812057, train_loss=0.06587672233581543, time_cost=5.877605199813843
Steps: 2%|▏ | 15786/1000000 [7:05:16<2858:45:41, 10.46s/it, lr=1e-5, step_loss=0.0955][RANK-0]: Step: [15786], local_loss=0.012292381376028061, train_loss=0.021505003795027733, time_cost=3.646991491317749
Steps: 2%|▏ | 15787/1000000 [7:05:23<2589:09:47, 9.47s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [15787], local_loss=0.21708944439888, train_loss=0.05236092954874039, time_cost=2.677525043487549
Steps: 2%|▏ | 15788/1000000 [7:05:28<2247:10:27, 8.22s/it, lr=1e-5, step_loss=0.217][RANK-0]: Step: [15788], local_loss=0.016978370025753975, train_loss=0.18323612213134766, time_cost=3.3614089488983154
Steps: 2%|▏ | 15789/1000000 [7:05:36<2238:38:44, 8.19s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [15789], local_loss=0.00964397843927145, train_loss=0.03020237758755684, time_cost=2.2824044227600098
Steps: 2%|▏ | 15790/1000000 [7:05:43<2153:43:42, 7.88s/it, lr=1e-5, step_loss=0.00964][RANK-0]: Step: [15790], local_loss=0.006145659368485212, train_loss=0.03495991230010986, time_cost=1.5033493041992188
Steps: 2%|▏ | 15791/1000000 [7:05:48<1847:56:52, 6.76s/it, lr=1e-5, step_loss=0.00615][RANK-0]: Step: [15791], local_loss=0.06904301047325134, train_loss=0.057228945195674896, time_cost=1.788323163986206
Steps: 2%|▏ | 15792/1000000 [7:05:53<1726:30:58, 6.32s/it, lr=1e-5, step_loss=0.069][RANK-0]: Step: [15792], local_loss=0.010508928447961807, train_loss=0.04266346991062164, time_cost=2.43349027633667
Steps: 2%|▏ | 15793/1000000 [7:05:58<1624:13:11, 5.94s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [15793], local_loss=0.010477732867002487, train_loss=0.058372706174850464, time_cost=2.6086525917053223
Steps: 2%|▏ | 15794/1000000 [7:06:11<2177:28:22, 7.96s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [15794], local_loss=0.05315996706485748, train_loss=0.04143040254712105, time_cost=4.375021934509277
Steps: 2%|▏ | 15795/1000000 [7:06:16<1942:45:14, 7.11s/it, lr=1e-5, step_loss=0.0532][RANK-0]: Step: [15795], local_loss=0.035506926476955414, train_loss=0.0660284161567688, time_cost=2.6246495246887207
Steps: 2%|▏ | 15796/1000000 [7:06:24<2013:37:17, 7.37s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [15796], local_loss=0.004961133934557438, train_loss=0.08630010485649109, time_cost=2.197106122970581
Steps: 2%|▏ | 15797/1000000 [7:06:29<1832:49:22, 6.70s/it, lr=1e-5, step_loss=0.00496][RANK-0]: Step: [15797], local_loss=0.014997045509517193, train_loss=0.03953355550765991, time_cost=2.5344440937042236
Steps: 2%|▏ | 15798/1000000 [7:06:40<2169:44:48, 7.94s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [15798], local_loss=0.02740427479147911, train_loss=0.049597594887018204, time_cost=3.750849962234497
Steps: 2%|▏ | 15799/1000000 [7:06:53<2580:01:12, 9.44s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [15799], local_loss=0.03691492974758148, train_loss=0.14272814989089966, time_cost=3.4951140880584717
Steps: 2%|▏ | 15800/1000000 [7:07:05<2777:51:17, 10.16s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [15800], local_loss=0.030332518741488457, train_loss=0.01819407381117344, time_cost=4.321481227874756
Steps: 2%|▏ | 15801/1000000 [7:07:16<2865:33:01, 10.48s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [15801], local_loss=0.012930883094668388, train_loss=0.147706538438797, time_cost=4.147822618484497
Steps: 2%|▏ | 15802/1000000 [7:07:27<2935:21:16, 10.74s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [15802], local_loss=0.006319642532616854, train_loss=0.024095699191093445, time_cost=1.2230274677276611
Steps: 2%|▏ | 15803/1000000 [7:07:41<3171:03:06, 11.60s/it, lr=1e-5, step_loss=0.00632][RANK-0]: Step: [15803], local_loss=0.026379350572824478, train_loss=0.010427793487906456, time_cost=5.390149831771851
Steps: 2%|▏ | 15804/1000000 [7:07:57<3573:45:10, 13.07s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [15804], local_loss=0.23483695089817047, train_loss=0.0805102214217186, time_cost=3.9425649642944336
Steps: 2%|▏ | 15805/1000000 [7:08:02<2925:52:10, 10.70s/it, lr=1e-5, step_loss=0.235][RANK-0]: Step: [15805], local_loss=0.12006267160177231, train_loss=0.18064475059509277, time_cost=2.2492592334747314
Steps: 2%|▏ | 15806/1000000 [7:08:12<2822:23:51, 10.32s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [15806], local_loss=0.02505560591816902, train_loss=0.01957351341843605, time_cost=3.368546485900879
Steps: 2%|▏ | 15807/1000000 [7:08:19<2532:22:56, 9.26s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [15807], local_loss=0.08879506587982178, train_loss=0.05159233510494232, time_cost=2.398375988006592
Steps: 2%|▏ | 15808/1000000 [7:08:27<2435:17:22, 8.91s/it, lr=1e-5, step_loss=0.0888][RANK-0]: Step: [15808], local_loss=0.007321319077163935, train_loss=0.370235800743103, time_cost=1.2038099765777588
Steps: 2%|▏ | 15809/1000000 [7:08:31<2059:23:59, 7.53s/it, lr=1e-5, step_loss=0.00732][RANK-0]: Step: [15809], local_loss=0.005611302796751261, train_loss=0.040351592004299164, time_cost=1.7856907844543457
Steps: 2%|▏ | 15810/1000000 [7:08:46<2668:29:19, 9.76s/it, lr=1e-5, step_loss=0.00561][RANK-0]: Step: [15810], local_loss=0.0807327851653099, train_loss=0.17645904421806335, time_cost=5.926008701324463
Steps: 2%|▏ | 15811/1000000 [7:08:55<2587:01:04, 9.46s/it, lr=1e-5, step_loss=0.0807][RANK-0]: Step: [15811], local_loss=0.018739497289061546, train_loss=0.01394472923129797, time_cost=1.901534080505371
Steps: 2%|▏ | 15812/1000000 [7:09:02<2396:26:24, 8.77s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [15812], local_loss=0.012757912278175354, train_loss=0.018655190244317055, time_cost=3.0931410789489746
Steps: 2%|▏ | 15813/1000000 [7:09:11<2407:04:10, 8.80s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [15813], local_loss=0.1431559920310974, train_loss=0.041013412177562714, time_cost=1.598313331604004
Steps: 2%|▏ | 15814/1000000 [7:09:27<2979:57:23, 10.90s/it, lr=1e-5, step_loss=0.143][RANK-0]: Step: [15814], local_loss=0.017860781401395798, train_loss=0.049660708755254745, time_cost=8.02898359298706
Steps: 2%|▏ | 15815/1000000 [7:09:36<2896:32:50, 10.60s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [15815], local_loss=0.034565720707178116, train_loss=0.02436082437634468, time_cost=1.2169404029846191
Steps: 2%|▏ | 15816/1000000 [7:09:44<2662:27:26, 9.74s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [15816], local_loss=0.006570631638169289, train_loss=0.054320331662893295, time_cost=1.2000720500946045
Steps: 2%|▏ | 15817/1000000 [7:09:49<2286:59:30, 8.37s/it, lr=1e-5, step_loss=0.00657][RANK-0]: Step: [15817], local_loss=0.018017519265413284, train_loss=0.05453359708189964, time_cost=2.399202346801758
Steps: 2%|▏ | 15818/1000000 [7:09:56<2162:14:04, 7.91s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [15818], local_loss=0.005041285417973995, train_loss=0.028078768402338028, time_cost=5.450855731964111
Steps: 2%|▏ | 15819/1000000 [7:10:03<2038:52:42, 7.46s/it, lr=1e-5, step_loss=0.00504][RANK-0]: Step: [15819], local_loss=0.051996342837810516, train_loss=0.026531212031841278, time_cost=2.744307518005371
Steps: 2%|▏ | 15820/1000000 [7:10:09<1924:59:38, 7.04s/it, lr=1e-5, step_loss=0.052][RANK-0]: Step: [15820], local_loss=0.012271231971681118, train_loss=0.03731778636574745, time_cost=1.6820735931396484
Steps: 2%|▏ | 15821/1000000 [7:10:21<2370:26:56, 8.67s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [15821], local_loss=0.03927244246006012, train_loss=0.04681892693042755, time_cost=9.750208377838135
Steps: 2%|▏ | 15822/1000000 [7:10:31<2431:10:49, 8.89s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [15822], local_loss=0.01468612439930439, train_loss=0.02055039070546627, time_cost=4.464430570602417
Steps: 2%|▏ | 15823/1000000 [7:10:37<2210:07:04, 8.08s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [15823], local_loss=0.008535419590771198, train_loss=0.1475125402212143, time_cost=2.219878911972046
Steps: 2%|▏ | 15824/1000000 [7:10:44<2172:56:53, 7.95s/it, lr=1e-5, step_loss=0.00854][RANK-0]: Step: [15824], local_loss=0.016650814563035965, train_loss=0.03936932981014252, time_cost=3.3890345096588135
Steps: 2%|▏ | 15825/1000000 [7:10:58<2617:28:53, 9.57s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [15825], local_loss=0.009855941869318485, train_loss=0.024183526635169983, time_cost=4.018184185028076
Steps: 2%|▏ | 15826/1000000 [7:11:11<2885:45:02, 10.56s/it, lr=1e-5, step_loss=0.00986][RANK-0]: Step: [15826], local_loss=0.9866058826446533, train_loss=0.1685408055782318, time_cost=4.730274200439453
Steps: 2%|▏ | 15827/1000000 [7:11:19<2672:56:11, 9.78s/it, lr=1e-5, step_loss=0.987][RANK-0]: Step: [15827], local_loss=0.021704038605093956, train_loss=0.13780909776687622, time_cost=2.088503122329712
Steps: 2%|▏ | 15828/1000000 [7:11:28<2616:30:33, 9.57s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [15828], local_loss=0.005238850135356188, train_loss=0.2969629466533661, time_cost=2.949209690093994
Steps: 2%|▏ | 15829/1000000 [7:11:32<2224:03:31, 8.14s/it, lr=1e-5, step_loss=0.00524][RANK-0]: Step: [15829], local_loss=0.018686888739466667, train_loss=0.02475622668862343, time_cost=1.606595516204834
Steps: 2%|▏ | 15830/1000000 [7:11:39<2119:21:20, 7.75s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [15830], local_loss=0.014276907779276371, train_loss=0.08082423359155655, time_cost=2.828489065170288
Steps: 2%|▏ | 15831/1000000 [7:11:53<2570:54:24, 9.40s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [15831], local_loss=0.03921491652727127, train_loss=0.04597380384802818, time_cost=3.515293598175049
Steps: 2%|▏ | 15832/1000000 [7:12:01<2531:16:08, 9.26s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [15832], local_loss=0.022866560146212578, train_loss=0.020350679755210876, time_cost=3.8575215339660645
Steps: 2%|▏ | 15833/1000000 [7:12:10<2505:14:21, 9.16s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [15833], local_loss=0.10355741530656815, train_loss=0.03008255735039711, time_cost=4.020882844924927
Steps: 2%|▏ | 15834/1000000 [7:12:16<2176:44:00, 7.96s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [15834], local_loss=0.008136272430419922, train_loss=0.058433905243873596, time_cost=2.2198257446289062
Steps: 2%|▏ | 15835/1000000 [7:12:29<2664:11:39, 9.75s/it, lr=1e-5, step_loss=0.00814][RANK-0]: Step: [15835], local_loss=0.05131911113858223, train_loss=0.03989911079406738, time_cost=6.187565326690674
Steps: 2%|▏ | 15836/1000000 [7:12:39<2641:49:41, 9.66s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [15836], local_loss=0.015428372658789158, train_loss=0.02300112321972847, time_cost=4.588841199874878
Steps: 2%|▏ | 15837/1000000 [7:12:52<2935:08:20, 10.74s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [15837], local_loss=0.01775353215634823, train_loss=0.05435473844408989, time_cost=3.7620201110839844
Steps: 2%|▏ | 15838/1000000 [7:13:11<3628:44:09, 13.27s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [15838], local_loss=0.30017927289009094, train_loss=0.06160765513777733, time_cost=11.489189624786377
Steps: 2%|▏ | 15839/1000000 [7:13:24<3617:11:40, 13.23s/it, lr=1e-5, step_loss=0.3][RANK-0]: Step: [15839], local_loss=0.02196141704916954, train_loss=0.01910526491701603, time_cost=5.436241149902344
Steps: 2%|▏ | 15840/1000000 [7:13:31<3031:07:57, 11.09s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [15840], local_loss=0.0093853659927845, train_loss=0.01779850199818611, time_cost=1.7147228717803955
Steps: 2%|▏ | 15841/1000000 [7:13:42<3079:29:09, 11.26s/it, lr=1e-5, step_loss=0.00939][RANK-0]: Step: [15841], local_loss=0.005490442737936974, train_loss=0.03006073273718357, time_cost=4.3439929485321045
Steps: 2%|▏ | 15842/1000000 [7:13:46<2491:40:53, 9.11s/it, lr=1e-5, step_loss=0.00549][RANK-0]: Step: [15842], local_loss=0.00808640941977501, train_loss=2.3263490200042725, time_cost=1.304781198501587
Steps: 2%|▏ | 15843/1000000 [7:13:53<2313:24:44, 8.46s/it, lr=1e-5, step_loss=0.00809][RANK-0]: Step: [15843], local_loss=0.00936136394739151, train_loss=0.04709547758102417, time_cost=2.818394899368286
Steps: 2%|▏ | 15844/1000000 [7:14:06<2666:47:32, 9.76s/it, lr=1e-5, step_loss=0.00936][RANK-0]: Step: [15844], local_loss=0.026196269318461418, train_loss=0.04172617197036743, time_cost=3.9446940422058105
Steps: 2%|▏ | 15845/1000000 [7:14:12<2357:42:57, 8.62s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [15845], local_loss=0.006430803798139095, train_loss=0.02656291052699089, time_cost=1.638371229171753
Steps: 2%|▏ | 15846/1000000 [7:14:17<2069:15:41, 7.57s/it, lr=1e-5, step_loss=0.00643][RANK-0]: Step: [15846], local_loss=0.07766292244195938, train_loss=0.06143302470445633, time_cost=2.3519937992095947
Steps: 2%|▏ | 15847/1000000 [7:14:22<1810:39:22, 6.62s/it, lr=1e-5, step_loss=0.0777][RANK-0]: Step: [15847], local_loss=0.010709822177886963, train_loss=0.041762351989746094, time_cost=2.4979586601257324
Steps: 2%|▏ | 15848/1000000 [7:14:27<1681:40:51, 6.15s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [15848], local_loss=0.0626886785030365, train_loss=0.09039010107517242, time_cost=2.0676217079162598
Steps: 2%|▏ | 15849/1000000 [7:14:32<1611:14:40, 5.89s/it, lr=1e-5, step_loss=0.0627][RANK-0]: Step: [15849], local_loss=0.013151310384273529, train_loss=0.028148524463176727, time_cost=2.6778128147125244
Steps: 2%|▏ | 15850/1000000 [7:14:37<1506:16:27, 5.51s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [15850], local_loss=0.010085965506732464, train_loss=15.253700256347656, time_cost=1.7635533809661865
Steps: 2%|▏ | 15851/1000000 [7:14:48<2027:03:21, 7.41s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [15851], local_loss=0.030110865831375122, train_loss=0.05142369866371155, time_cost=3.142441749572754
Steps: 2%|▏ | 15852/1000000 [7:15:00<2369:57:42, 8.67s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [15852], local_loss=0.002944616600871086, train_loss=0.028878718614578247, time_cost=5.579157829284668
Steps: 2%|▏ | 15853/1000000 [7:15:13<2757:59:56, 10.09s/it, lr=1e-5, step_loss=0.00294][RANK-0]: Step: [15853], local_loss=0.017180142924189568, train_loss=0.048701219260692596, time_cost=4.856853723526001
Steps: 2%|▏ | 15854/1000000 [7:15:31<3359:41:14, 12.29s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [15854], local_loss=0.011581959202885628, train_loss=0.059140197932720184, time_cost=9.590225219726562
Steps: 2%|▏ | 15855/1000000 [7:15:38<2915:56:54, 10.67s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [15855], local_loss=0.5339992046356201, train_loss=0.10101793706417084, time_cost=1.2513396739959717
Steps: 2%|▏ | 15856/1000000 [7:15:46<2761:32:53, 10.10s/it, lr=1e-5, step_loss=0.534][RANK-0]: Step: [15856], local_loss=0.029629886150360107, train_loss=0.04731852933764458, time_cost=3.8001549243927
Steps: 2%|▏ | 15857/1000000 [7:15:51<2293:36:57, 8.39s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [15857], local_loss=0.011756474152207375, train_loss=0.03614422306418419, time_cost=1.258237600326538
Steps: 2%|▏ | 15858/1000000 [7:16:01<2473:39:18, 9.05s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [15858], local_loss=0.05937562882900238, train_loss=0.024183904752135277, time_cost=7.892149925231934
Steps: 2%|▏ | 15859/1000000 [7:16:06<2142:43:56, 7.84s/it, lr=1e-5, step_loss=0.0594][RANK-0]: Step: [15859], local_loss=0.04135172814130783, train_loss=0.0716198980808258, time_cost=1.388263463973999
Steps: 2%|▏ | 15860/1000000 [7:16:15<2224:53:16, 8.14s/it, lr=1e-5, step_loss=0.0414][RANK-0]: Step: [15860], local_loss=0.005571606568992138, train_loss=6.568836688995361, time_cost=3.4422779083251953
Steps: 2%|▏ | 15861/1000000 [7:16:24<2240:44:19, 8.20s/it, lr=1e-5, step_loss=0.00557][RANK-0]: Step: [15861], local_loss=0.02088829316198826, train_loss=0.03766125813126564, time_cost=1.9060180187225342
Steps: 2%|▏ | 15862/1000000 [7:16:29<2042:24:54, 7.47s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [15862], local_loss=0.09996034950017929, train_loss=0.053009793162345886, time_cost=3.0564017295837402
Steps: 2%|▏ | 15863/1000000 [7:16:38<2171:42:40, 7.94s/it, lr=1e-5, step_loss=0.1][RANK-0]: Step: [15863], local_loss=0.029849113896489143, train_loss=0.08371643722057343, time_cost=4.092092275619507
Steps: 2%|▏ | 15864/1000000 [7:16:43<1925:50:29, 7.04s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [15864], local_loss=0.026365628466010094, train_loss=0.07446345686912537, time_cost=1.9284982681274414
Steps: 2%|▏ | 15865/1000000 [7:16:53<2096:48:02, 7.67s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [15865], local_loss=0.3455418348312378, train_loss=0.08199647068977356, time_cost=6.901127815246582
Steps: 2%|▏ | 15866/1000000 [7:16:58<1940:13:27, 7.10s/it, lr=1e-5, step_loss=0.346][RANK-0]: Step: [15866], local_loss=0.02677621692419052, train_loss=0.03265274688601494, time_cost=1.4479994773864746
Steps: 2%|▏ | 15867/1000000 [7:17:04<1801:27:59, 6.59s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [15867], local_loss=0.006646594498306513, train_loss=0.022153440862894058, time_cost=2.445481300354004
Steps: 2%|▏ | 15868/1000000 [7:17:18<2412:33:00, 8.83s/it, lr=1e-5, step_loss=0.00665][RANK-0]: Step: [15868], local_loss=0.010460607707500458, train_loss=0.0829227939248085, time_cost=3.8549234867095947
Steps: 2%|▏ | 15869/1000000 [7:17:27<2479:09:40, 9.07s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [15869], local_loss=0.00400576600804925, train_loss=0.019289979711174965, time_cost=3.8419172763824463
Steps: 2%|▏ | 15870/1000000 [7:17:33<2184:46:08, 7.99s/it, lr=1e-5, step_loss=0.00401][RANK-0]: Step: [15870], local_loss=0.150493323802948, train_loss=0.07970689982175827, time_cost=3.0304720401763916
Steps: 2%|▏ | 15871/1000000 [7:17:41<2162:56:25, 7.91s/it, lr=1e-5, step_loss=0.15][RANK-0]: Step: [15871], local_loss=0.004342464730143547, train_loss=0.02291666902601719, time_cost=1.9518327713012695
Steps: 2%|▏ | 15872/1000000 [7:17:49<2175:44:55, 7.96s/it, lr=1e-5, step_loss=0.00434][RANK-0]: Step: [15872], local_loss=0.015740159898996353, train_loss=0.04284774884581566, time_cost=6.9934375286102295
Steps: 2%|▏ | 15873/1000000 [7:18:01<2541:40:42, 9.30s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [15873], local_loss=0.02732919529080391, train_loss=0.08305799961090088, time_cost=5.431213617324829
Steps: 2%|▏ | 15874/1000000 [7:18:14<2826:32:53, 10.34s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [15874], local_loss=0.006416783668100834, train_loss=0.028213173151016235, time_cost=1.279404878616333
Steps: 2%|▏ | 15875/1000000 [7:18:25<2901:37:43, 10.61s/it, lr=1e-5, step_loss=0.00642][RANK-0]: Step: [15875], local_loss=0.05036509037017822, train_loss=0.024755921214818954, time_cost=6.390347003936768
Steps: 2%|▏ | 15876/1000000 [7:18:35<2819:32:52, 10.31s/it, lr=1e-5, step_loss=0.0504][RANK-0]: Step: [15876], local_loss=0.023787949234247208, train_loss=0.03230518475174904, time_cost=3.785306453704834
Steps: 2%|▏ | 15877/1000000 [7:18:39<2338:36:59, 8.55s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [15877], local_loss=0.05762121081352234, train_loss=0.03858920931816101, time_cost=1.5283682346343994
Steps: 2%|▏ | 15878/1000000 [7:18:46<2218:25:17, 8.12s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [15878], local_loss=0.24738310277462006, train_loss=0.0651167556643486, time_cost=2.789628028869629
Steps: 2%|▏ | 15879/1000000 [7:18:52<1982:24:02, 7.25s/it, lr=1e-5, step_loss=0.247][RANK-0]: Step: [15879], local_loss=0.9903803467750549, train_loss=5.5336127281188965, time_cost=2.433084726333618
Steps: 2%|▏ | 15880/1000000 [7:19:02<2245:43:44, 8.22s/it, lr=1e-5, step_loss=0.99][RANK-0]: Step: [15880], local_loss=0.031230861321091652, train_loss=22.725830078125, time_cost=3.945265531539917
Steps: 2%|▏ | 15881/1000000 [7:19:13<2438:31:05, 8.92s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [15881], local_loss=0.004649188369512558, train_loss=0.025721769779920578, time_cost=2.149538040161133
Steps: 2%|▏ | 15882/1000000 [7:19:20<2296:54:52, 8.40s/it, lr=1e-5, step_loss=0.00465][RANK-0]: Step: [15882], local_loss=0.020779002457857132, train_loss=0.019766798242926598, time_cost=2.5577874183654785
Steps: 2%|▏ | 15883/1000000 [7:19:26<2102:00:21, 7.69s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [15883], local_loss=0.11998428404331207, train_loss=0.05261821299791336, time_cost=1.4856846332550049
Steps: 2%|▏ | 15884/1000000 [7:19:38<2445:23:15, 8.95s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [15884], local_loss=0.1321660280227661, train_loss=0.08412449806928635, time_cost=1.9869849681854248
Steps: 2%|▏ | 15885/1000000 [7:19:49<2603:20:26, 9.52s/it, lr=1e-5, step_loss=0.132][RANK-0]: Step: [15885], local_loss=0.05515943840146065, train_loss=0.034808121621608734, time_cost=1.639575719833374
Steps: 2%|▏ | 15886/1000000 [7:20:03<2994:08:36, 10.95s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [15886], local_loss=0.018405944108963013, train_loss=0.02068670466542244, time_cost=5.536935329437256
Steps: 2%|▏ | 15887/1000000 [7:20:10<2657:35:00, 9.72s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [15887], local_loss=0.032154496759176254, train_loss=0.04592323303222656, time_cost=2.2650163173675537
Steps: 2%|▏ | 15888/1000000 [7:20:26<3166:49:14, 11.58s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [15888], local_loss=0.020136933773756027, train_loss=0.01945589855313301, time_cost=1.2169897556304932
Steps: 2%|▏ | 15889/1000000 [7:20:37<3167:13:37, 11.59s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [15889], local_loss=0.007922022603452206, train_loss=0.048144400119781494, time_cost=4.0944178104400635
Steps: 2%|▏ | 15890/1000000 [7:20:50<3254:15:25, 11.90s/it, lr=1e-5, step_loss=0.00792][RANK-0]: Step: [15890], local_loss=0.00384441833011806, train_loss=0.04100130498409271, time_cost=4.459993839263916
Steps: 2%|▏ | 15891/1000000 [7:21:05<3556:40:24, 13.01s/it, lr=1e-5, step_loss=0.00384][RANK-0]: Step: [15891], local_loss=0.0048432182520627975, train_loss=0.01894208788871765, time_cost=7.697935581207275
Steps: 2%|▏ | 15892/1000000 [7:21:21<3791:22:49, 13.87s/it, lr=1e-5, step_loss=0.00484][RANK-0]: Step: [15892], local_loss=0.014268001541495323, train_loss=0.025588318705558777, time_cost=7.934008836746216
Steps: 2%|▏ | 15893/1000000 [7:21:34<3711:29:22, 13.58s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [15893], local_loss=0.06875233352184296, train_loss=0.045803003013134, time_cost=1.194695234298706
Steps: 2%|▏ | 15894/1000000 [7:21:50<3876:13:08, 14.18s/it, lr=1e-5, step_loss=0.0688][RANK-0]: Step: [15894], local_loss=0.018710549920797348, train_loss=0.03204407915472984, time_cost=6.548158645629883
Steps: 2%|▏ | 15895/1000000 [7:21:57<3331:20:27, 12.19s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [15895], local_loss=0.008311169221997261, train_loss=0.08189088106155396, time_cost=5.669911623001099
Steps: 2%|▏ | 15896/1000000 [7:22:07<3101:19:15, 11.35s/it, lr=1e-5, step_loss=0.00831][RANK-0]: Step: [15896], local_loss=0.10263021290302277, train_loss=0.0690465122461319, time_cost=2.696242094039917
Steps: 2%|▏ | 15897/1000000 [7:22:17<3045:48:01, 11.14s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [15897], local_loss=0.0037642705719918013, train_loss=0.04403303563594818, time_cost=1.3104469776153564
Steps: 2%|▏ | 15898/1000000 [7:22:29<3058:22:15, 11.19s/it, lr=1e-5, step_loss=0.00376][RANK-0]: Step: [15898], local_loss=0.025471702218055725, train_loss=0.031433649361133575, time_cost=2.293361186981201
Steps: 2%|▏ | 15899/1000000 [7:22:38<2894:09:27, 10.59s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [15899], local_loss=0.042420752346515656, train_loss=0.05465768277645111, time_cost=3.6178901195526123
Steps: 2%|▏ | 15900/1000000 [7:22:54<3382:04:00, 12.37s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [15900], local_loss=0.003601754317060113, train_loss=0.038615502417087555, time_cost=3.162317991256714
Steps: 2%|▏ | 15901/1000000 [7:23:03<3106:49:14, 11.37s/it, lr=1e-5, step_loss=0.0036][RANK-0]: Step: [15901], local_loss=0.00972465705126524, train_loss=0.03256125748157501, time_cost=1.19773530960083
Steps: 2%|▏ | 15902/1000000 [7:23:14<3061:36:51, 11.20s/it, lr=1e-5, step_loss=0.00972][RANK-0]: Step: [15902], local_loss=0.01428443007171154, train_loss=0.04603435471653938, time_cost=1.9335863590240479
Steps: 2%|▏ | 15903/1000000 [7:23:28<3253:24:15, 11.90s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [15903], local_loss=0.040543440729379654, train_loss=0.02573184296488762, time_cost=5.192200183868408
Steps: 2%|▏ | 15904/1000000 [7:23:42<3472:34:30, 12.70s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [15904], local_loss=0.006291577126830816, train_loss=0.022230859845876694, time_cost=4.729061841964722
Steps: 2%|▏ | 15905/1000000 [7:23:48<2904:17:23, 10.62s/it, lr=1e-5, step_loss=0.00629][RANK-0]: Step: [15905], local_loss=0.013854634016752243, train_loss=0.030532274395227432, time_cost=1.6108317375183105
Steps: 2%|▏ | 15906/1000000 [7:23:59<2899:02:55, 10.61s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [15906], local_loss=0.043832629919052124, train_loss=0.036599863320589066, time_cost=1.4168477058410645
Steps: 2%|▏ | 15907/1000000 [7:24:11<3027:22:04, 11.07s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [15907], local_loss=0.035864658653736115, train_loss=0.15049995481967926, time_cost=10.293687343597412
Steps: 2%|▏ | 15908/1000000 [7:24:22<3031:33:37, 11.09s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [15908], local_loss=0.014805831015110016, train_loss=0.027817795053124428, time_cost=1.5671319961547852
Steps: 2%|▏ | 15909/1000000 [7:24:34<3107:10:23, 11.37s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [15909], local_loss=0.01003951020538807, train_loss=0.01926681213080883, time_cost=4.988138198852539
Steps: 2%|▏ | 15910/1000000 [7:24:48<3347:13:48, 12.24s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [15910], local_loss=0.019601216539740562, train_loss=0.017190415412187576, time_cost=5.1779561042785645
Steps: 2%|▏ | 15911/1000000 [7:25:02<3439:46:01, 12.58s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [15911], local_loss=0.033267270773649216, train_loss=0.017471497878432274, time_cost=4.162729024887085
Steps: 2%|▏ | 15912/1000000 [7:25:16<3571:50:49, 13.07s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [15912], local_loss=0.0039413440972566605, train_loss=0.19115041196346283, time_cost=4.416933298110962
Steps: 2%|▏ | 15913/1000000 [7:25:21<2965:43:04, 10.85s/it, lr=1e-5, step_loss=0.00394][RANK-0]: Step: [15913], local_loss=0.007094584871083498, train_loss=0.07598944008350372, time_cost=3.2862088680267334
Steps: 2%|▏ | 15914/1000000 [7:25:31<2885:27:51, 10.56s/it, lr=1e-5, step_loss=0.00709][RANK-0]: Step: [15914], local_loss=0.025991685688495636, train_loss=0.034265659749507904, time_cost=8.255929231643677
Steps: 2%|▏ | 15915/1000000 [7:25:37<2443:44:52, 8.94s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [15915], local_loss=0.028232436627149582, train_loss=7.531770706176758, time_cost=1.4809350967407227
Steps: 2%|▏ | 15916/1000000 [7:25:42<2130:49:25, 7.80s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [15916], local_loss=0.09683000296354294, train_loss=0.07316228002309799, time_cost=1.217301607131958
Steps: 2%|▏ | 15917/1000000 [7:25:47<1910:43:20, 6.99s/it, lr=1e-5, step_loss=0.0968][RANK-0]: Step: [15917], local_loss=1.008563756942749, train_loss=0.21328353881835938, time_cost=2.3450701236724854
Steps: 2%|▏ | 15918/1000000 [7:25:56<2096:05:00, 7.67s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [15918], local_loss=0.032269030809402466, train_loss=0.1444023847579956, time_cost=2.937246561050415
Steps: 2%|▏ | 15919/1000000 [7:26:05<2206:53:04, 8.07s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [15919], local_loss=0.19270867109298706, train_loss=0.18449822068214417, time_cost=2.5522730350494385
Steps: 2%|▏ | 15920/1000000 [7:26:14<2283:42:03, 8.35s/it, lr=1e-5, step_loss=0.193][RANK-0]: Step: [15920], local_loss=0.05572609230875969, train_loss=0.09702664613723755, time_cost=3.387089967727661
Steps: 2%|▏ | 15921/1000000 [7:26:19<2033:11:35, 7.44s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [15921], local_loss=0.008529057726264, train_loss=12.725628852844238, time_cost=1.3758633136749268
Steps: 2%|▏ | 15922/1000000 [7:26:26<2004:54:01, 7.33s/it, lr=1e-5, step_loss=0.00853][RANK-0]: Step: [15922], local_loss=0.007727963849902153, train_loss=0.029730230569839478, time_cost=2.9654250144958496
Steps: 2%|▏ | 15923/1000000 [7:26:31<1766:32:51, 6.46s/it, lr=1e-5, step_loss=0.00773][RANK-0]: Step: [15923], local_loss=0.01335521787405014, train_loss=0.021777864545583725, time_cost=1.461961030960083
Steps: 2%|▏ | 15924/1000000 [7:26:40<1976:02:25, 7.23s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [15924], local_loss=0.018600720912218094, train_loss=0.03219493851065636, time_cost=1.3335449695587158
Steps: 2%|▏ | 15925/1000000 [7:26:54<2527:44:50, 9.25s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [15925], local_loss=0.08028245717287064, train_loss=0.04349616914987564, time_cost=5.176512241363525
Steps: 2%|▏ | 15926/1000000 [7:27:04<2629:27:47, 9.62s/it, lr=1e-5, step_loss=0.0803][RANK-0]: Step: [15926], local_loss=0.007487480528652668, train_loss=0.037947364151477814, time_cost=3.4430649280548096
Steps: 2%|▏ | 15927/1000000 [7:27:18<2941:11:33, 10.76s/it, lr=1e-5, step_loss=0.00749][RANK-0]: Step: [15927], local_loss=0.0282516460865736, train_loss=0.028140457347035408, time_cost=4.434481143951416
Steps: 2%|▏ | 15928/1000000 [7:27:29<2984:58:53, 10.92s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [15928], local_loss=0.012700676918029785, train_loss=0.14133119583129883, time_cost=3.8868212699890137
Steps: 2%|▏ | 15929/1000000 [7:27:42<3127:05:55, 11.44s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [15929], local_loss=0.09932199120521545, train_loss=0.03128720074892044, time_cost=6.212043285369873
Steps: 2%|▏ | 15930/1000000 [7:27:55<3306:33:42, 12.10s/it, lr=1e-5, step_loss=0.0993][RANK-0]: Step: [15930], local_loss=0.09687598794698715, train_loss=0.05751561373472214, time_cost=4.659416198730469
Steps: 2%|▏ | 15931/1000000 [7:28:04<3039:40:43, 11.12s/it, lr=1e-5, step_loss=0.0969][RANK-0]: Step: [15931], local_loss=0.01293946336954832, train_loss=0.049862951040267944, time_cost=1.578566551208496
Steps: 2%|▏ | 15932/1000000 [7:28:10<2634:06:39, 9.64s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [15932], local_loss=0.00779643002897501, train_loss=0.07094767689704895, time_cost=1.8141181468963623
Steps: 2%|▏ | 15933/1000000 [7:28:16<2283:08:18, 8.35s/it, lr=1e-5, step_loss=0.0078][RANK-0]: Step: [15933], local_loss=0.025887690484523773, train_loss=0.07872529327869415, time_cost=1.2126588821411133
Steps: 2%|▏ | 15934/1000000 [7:28:24<2316:41:36, 8.48s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [15934], local_loss=0.021456124261021614, train_loss=0.059451498091220856, time_cost=1.249288558959961
Steps: 2%|▏ | 15935/1000000 [7:28:38<2746:33:28, 10.05s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [15935], local_loss=0.06076572835445404, train_loss=0.03996002674102783, time_cost=5.101514101028442
Steps: 2%|▏ | 15936/1000000 [7:28:46<2605:10:15, 9.53s/it, lr=1e-5, step_loss=0.0608][RANK-0]: Step: [15936], local_loss=0.015549896284937859, train_loss=0.015102999284863472, time_cost=4.74169921875
Steps: 2%|▏ | 15937/1000000 [7:28:55<2562:17:33, 9.37s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [15937], local_loss=0.0283522829413414, train_loss=0.15385867655277252, time_cost=3.081568479537964
Steps: 2%|▏ | 15938/1000000 [7:29:06<2670:57:34, 9.77s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [15938], local_loss=0.008904663845896721, train_loss=0.08223260194063187, time_cost=8.243173599243164
Steps: 2%|▏ | 15939/1000000 [7:29:16<2668:10:11, 9.76s/it, lr=1e-5, step_loss=0.0089][RANK-0]: Step: [15939], local_loss=0.07851295173168182, train_loss=0.0432593896985054, time_cost=3.277038335800171
Steps: 2%|▏ | 15940/1000000 [7:29:24<2503:42:53, 9.16s/it, lr=1e-5, step_loss=0.0785][RANK-0]: Step: [15940], local_loss=0.03455708175897598, train_loss=0.03369949758052826, time_cost=2.8942790031433105
+
Steps: 2%|▏ | 15940/1000000 [7:29:24<2503:42:53, 9.16s/it, lr=1e-5, step_loss=0.0346]
Steps: 2%|▏ | 15941/1000000 [7:29:30<2275:54:57, 8.33s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [15941], local_loss=0.007284435443580151, train_loss=0.03218381479382515, time_cost=1.9491560459136963
+
Steps: 2%|▏ | 15941/1000000 [7:29:30<2275:54:57, 8.33s/it, lr=1e-5, step_loss=0.00728]
Steps: 2%|▏ | 15942/1000000 [7:29:37<2186:57:17, 8.00s/it, lr=1e-5, step_loss=0.00728][RANK-0]: Step: [15942], local_loss=0.01796986162662506, train_loss=14.34103012084961, time_cost=1.203810453414917
+
Steps: 2%|▏ | 15942/1000000 [7:29:37<2186:57:17, 8.00s/it, lr=1e-5, step_loss=0.018]
Steps: 2%|▏ | 15943/1000000 [7:29:43<1956:45:42, 7.16s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [15943], local_loss=0.3179283142089844, train_loss=0.06364460289478302, time_cost=1.4834342002868652
+
Steps: 2%|▏ | 15943/1000000 [7:29:43<1956:45:42, 7.16s/it, lr=1e-5, step_loss=0.318]
Steps: 2%|▏ | 15944/1000000 [7:29:49<1923:31:27, 7.04s/it, lr=1e-5, step_loss=0.318][RANK-0]: Step: [15944], local_loss=0.14102958142757416, train_loss=0.05740142613649368, time_cost=2.4113309383392334
+
Steps: 2%|▏ | 15944/1000000 [7:29:49<1923:31:27, 7.04s/it, lr=1e-5, step_loss=0.141]
Steps: 2%|▏ | 15945/1000000 [7:29:59<2113:47:58, 7.73s/it, lr=1e-5, step_loss=0.141][RANK-0]: Step: [15945], local_loss=0.007879181765019894, train_loss=0.055260658264160156, time_cost=4.390357255935669
+
Steps: 2%|▏ | 15945/1000000 [7:29:59<2113:47:58, 7.73s/it, lr=1e-5, step_loss=0.00788]
Steps: 2%|▏ | 15946/1000000 [7:30:13<2644:06:55, 9.67s/it, lr=1e-5, step_loss=0.00788][RANK-0]: Step: [15946], local_loss=0.006153943948447704, train_loss=0.051367081701755524, time_cost=5.0326738357543945
+
Steps: 2%|▏ | 15946/1000000 [7:30:13<2644:06:55, 9.67s/it, lr=1e-5, step_loss=0.00615]
Steps: 2%|▏ | 15947/1000000 [7:30:17<2214:36:17, 8.10s/it, lr=1e-5, step_loss=0.00615][RANK-0]: Step: [15947], local_loss=0.018748754635453224, train_loss=0.02698858268558979, time_cost=1.711165428161621
+
Steps: 2%|▏ | 15947/1000000 [7:30:17<2214:36:17, 8.10s/it, lr=1e-5, step_loss=0.0187]
Steps: 2%|▏ | 15948/1000000 [7:30:28<2439:07:54, 8.92s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [15948], local_loss=0.1417464017868042, train_loss=0.04281872510910034, time_cost=1.6602070331573486
+
Steps: 2%|▏ | 15948/1000000 [7:30:28<2439:07:54, 8.92s/it, lr=1e-5, step_loss=0.142]
Steps: 2%|▏ | 15949/1000000 [7:30:33<2147:10:33, 7.86s/it, lr=1e-5, step_loss=0.142][RANK-0]: Step: [15949], local_loss=0.4007973372936249, train_loss=0.06123904138803482, time_cost=2.438530683517456
+
Steps: 2%|▏ | 15949/1000000 [7:30:33<2147:10:33, 7.86s/it, lr=1e-5, step_loss=0.401]
Steps: 2%|▏ | 15950/1000000 [7:30:41<2103:35:40, 7.70s/it, lr=1e-5, step_loss=0.401][RANK-0]: Step: [15950], local_loss=0.028154103085398674, train_loss=0.031612981110811234, time_cost=3.6090667247772217
+
Steps: 2%|▏ | 15950/1000000 [7:30:41<2103:35:40, 7.70s/it, lr=1e-5, step_loss=0.0282]
Steps: 2%|▏ | 15951/1000000 [7:30:56<2703:49:31, 9.89s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [15951], local_loss=0.03333502262830734, train_loss=0.056352078914642334, time_cost=2.6057989597320557
+
Steps: 2%|▏ | 15951/1000000 [7:30:56<2703:49:31, 9.89s/it, lr=1e-5, step_loss=0.0333]
Steps: 2%|▏ | 15952/1000000 [7:31:05<2617:19:22, 9.58s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [15952], local_loss=0.007325534708797932, train_loss=0.03433097153902054, time_cost=3.0089566707611084
+
Steps: 2%|▏ | 15952/1000000 [7:31:05<2617:19:22, 9.58s/it, lr=1e-5, step_loss=0.00733]
Steps: 2%|▏ | 15953/1000000 [7:31:15<2662:47:17, 9.74s/it, lr=1e-5, step_loss=0.00733][RANK-0]: Step: [15953], local_loss=0.05471314489841461, train_loss=0.22733944654464722, time_cost=1.7599906921386719
+
Steps: 2%|▏ | 15953/1000000 [7:31:15<2662:47:17, 9.74s/it, lr=1e-5, step_loss=0.0547]
Steps: 2%|▏ | 15954/1000000 [7:31:22<2431:40:56, 8.90s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [15954], local_loss=0.008889641612768173, train_loss=0.14843881130218506, time_cost=3.172452449798584
+
Steps: 2%|▏ | 15954/1000000 [7:31:22<2431:40:56, 8.90s/it, lr=1e-5, step_loss=0.00889]
Steps: 2%|▏ | 15955/1000000 [7:31:38<3019:35:09, 11.05s/it, lr=1e-5, step_loss=0.00889][RANK-0]: Step: [15955], local_loss=0.06681939214468002, train_loss=0.2193869948387146, time_cost=8.487017631530762
+
Steps: 2%|▏ | 15955/1000000 [7:31:38<3019:35:09, 11.05s/it, lr=1e-5, step_loss=0.0668]
Steps: 2%|▏ | 15956/1000000 [7:31:43<2521:00:22, 9.22s/it, lr=1e-5, step_loss=0.0668][RANK-0]: Step: [15956], local_loss=0.016233740374445915, train_loss=0.031008176505565643, time_cost=1.9701273441314697
+
Steps: 2%|▏ | 15956/1000000 [7:31:43<2521:00:22, 9.22s/it, lr=1e-5, step_loss=0.0162]
Steps: 2%|▏ | 15957/1000000 [7:31:54<2668:54:09, 9.76s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [15957], local_loss=0.01058553159236908, train_loss=0.09280204772949219, time_cost=3.730886697769165
+
Steps: 2%|▏ | 15957/1000000 [7:31:54<2668:54:09, 9.76s/it, lr=1e-5, step_loss=0.0106]
Steps: 2%|▏ | 15958/1000000 [7:31:59<2300:55:58, 8.42s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [15958], local_loss=0.033832259476184845, train_loss=0.10890820622444153, time_cost=4.0377631187438965
+
Steps: 2%|▏ | 15958/1000000 [7:31:59<2300:55:58, 8.42s/it, lr=1e-5, step_loss=0.0338]
Steps: 2%|▏ | 15959/1000000 [7:32:04<2028:19:31, 7.42s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [15959], local_loss=0.06044720113277435, train_loss=0.035204362124204636, time_cost=2.1315925121307373
+
Steps: 2%|▏ | 15959/1000000 [7:32:04<2028:19:31, 7.42s/it, lr=1e-5, step_loss=0.0604]
Steps: 2%|▏ | 15960/1000000 [7:32:18<2557:09:34, 9.36s/it, lr=1e-5, step_loss=0.0604][RANK-0]: Step: [15960], local_loss=0.01928463578224182, train_loss=0.018895065411925316, time_cost=5.986846208572388
+
Steps: 2%|▏ | 15960/1000000 [7:32:18<2557:09:34, 9.36s/it, lr=1e-5, step_loss=0.0193]
Steps: 2%|▏ | 15961/1000000 [7:32:30<2777:13:22, 10.16s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [15961], local_loss=0.05710093677043915, train_loss=0.019078535959124565, time_cost=2.7723710536956787
+
Steps: 2%|▏ | 15961/1000000 [7:32:30<2777:13:22, 10.16s/it, lr=1e-5, step_loss=0.0571]
Steps: 2%|▏ | 15962/1000000 [7:32:39<2682:36:53, 9.81s/it, lr=1e-5, step_loss=0.0571][RANK-0]: Step: [15962], local_loss=0.018721168860793114, train_loss=0.024140525609254837, time_cost=1.4675469398498535
+
Steps: 2%|▏ | 15962/1000000 [7:32:39<2682:36:53, 9.81s/it, lr=1e-5, step_loss=0.0187]
Steps: 2%|▏ | 15963/1000000 [7:32:44<2283:20:45, 8.35s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [15963], local_loss=0.004720789380371571, train_loss=0.020367372781038284, time_cost=3.085689067840576
+
Steps: 2%|▏ | 15963/1000000 [7:32:44<2283:20:45, 8.35s/it, lr=1e-5, step_loss=0.00472]
Steps: 2%|▏ | 15964/1000000 [7:32:53<2371:19:35, 8.68s/it, lr=1e-5, step_loss=0.00472][RANK-0]: Step: [15964], local_loss=0.05400031805038452, train_loss=0.053169477730989456, time_cost=1.294508695602417
+
Steps: 2%|▏ | 15964/1000000 [7:32:53<2371:19:35, 8.68s/it, lr=1e-5, step_loss=0.054]
Steps: 2%|▏ | 15965/1000000 [7:32:59<2087:04:03, 7.64s/it, lr=1e-5, step_loss=0.054][RANK-0]: Step: [15965], local_loss=0.07745195925235748, train_loss=0.07281625270843506, time_cost=2.288083791732788
+
Steps: 2%|▏ | 15965/1000000 [7:32:59<2087:04:03, 7.64s/it, lr=1e-5, step_loss=0.0775]
Steps: 2%|▏ | 15966/1000000 [7:33:08<2195:11:50, 8.03s/it, lr=1e-5, step_loss=0.0775][RANK-0]: Step: [15966], local_loss=0.44411981105804443, train_loss=0.07967762649059296, time_cost=7.240760326385498
+
Steps: 2%|▏ | 15966/1000000 [7:33:08<2195:11:50, 8.03s/it, lr=1e-5, step_loss=0.444]
Steps: 2%|▏ | 15967/1000000 [7:33:16<2267:22:13, 8.29s/it, lr=1e-5, step_loss=0.444][RANK-0]: Step: [15967], local_loss=0.01684529334306717, train_loss=0.02240128628909588, time_cost=2.657632827758789
+
Steps: 2%|▏ | 15967/1000000 [7:33:16<2267:22:13, 8.29s/it, lr=1e-5, step_loss=0.0168]
Steps: 2%|▏ | 15968/1000000 [7:33:29<2576:54:21, 9.43s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [15968], local_loss=0.020131759345531464, train_loss=14.045129776000977, time_cost=1.208590030670166
+
Steps: 2%|▏ | 15968/1000000 [7:33:29<2576:54:21, 9.43s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 15969/1000000 [7:33:34<2227:31:54, 8.15s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [15969], local_loss=0.006845889147371054, train_loss=0.048553187400102615, time_cost=2.1378228664398193
+
Steps: 2%|▏ | 15969/1000000 [7:33:34<2227:31:54, 8.15s/it, lr=1e-5, step_loss=0.00685]
Steps: 2%|▏ | 15970/1000000 [7:33:41<2130:30:29, 7.79s/it, lr=1e-5, step_loss=0.00685][RANK-0]: Step: [15970], local_loss=0.021600253880023956, train_loss=0.009700220078229904, time_cost=1.3217012882232666
+
Steps: 2%|▏ | 15970/1000000 [7:33:41<2130:30:29, 7.79s/it, lr=1e-5, step_loss=0.0216]
Steps: 2%|▏ | 15971/1000000 [7:33:52<2405:05:14, 8.80s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [15971], local_loss=0.005140781402587891, train_loss=0.012061449699103832, time_cost=1.7445895671844482
+
Steps: 2%|▏ | 15971/1000000 [7:33:52<2405:05:14, 8.80s/it, lr=1e-5, step_loss=0.00514]
Steps: 2%|▏ | 15972/1000000 [7:33:57<2096:29:36, 7.67s/it, lr=1e-5, step_loss=0.00514][RANK-0]: Step: [15972], local_loss=0.019670212641358376, train_loss=0.032095279544591904, time_cost=2.154160261154175
+
Steps: 2%|▏ | 15972/1000000 [7:33:57<2096:29:36, 7.67s/it, lr=1e-5, step_loss=0.0197]
Steps: 2%|▏ | 15973/1000000 [7:34:07<2320:04:45, 8.49s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [15973], local_loss=0.0761069506406784, train_loss=0.03269793093204498, time_cost=2.2833001613616943
+
Steps: 2%|▏ | 15973/1000000 [7:34:07<2320:04:45, 8.49s/it, lr=1e-5, step_loss=0.0761]
Steps: 2%|▏ | 15974/1000000 [7:34:13<2127:53:08, 7.78s/it, lr=1e-5, step_loss=0.0761][RANK-0]: Step: [15974], local_loss=0.04398324713110924, train_loss=0.14526091516017914, time_cost=1.2078568935394287
+
Steps: 2%|▏ | 15974/1000000 [7:34:13<2127:53:08, 7.78s/it, lr=1e-5, step_loss=0.044]
Steps: 2%|▏ | 15975/1000000 [7:34:19<1913:34:25, 7.00s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [15975], local_loss=0.04600491747260094, train_loss=0.12285781651735306, time_cost=1.2284531593322754
+
Steps: 2%|▏ | 15975/1000000 [7:34:19<1913:34:25, 7.00s/it, lr=1e-5, step_loss=0.046]
Steps: 2%|▏ | 15976/1000000 [7:34:26<1917:37:52, 7.02s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [15976], local_loss=91.432373046875, train_loss=11.53646183013916, time_cost=5.249598979949951
+
Steps: 2%|▏ | 15976/1000000 [7:34:26<1917:37:52, 7.02s/it, lr=1e-5, step_loss=91.4]
Steps: 2%|▏ | 15977/1000000 [7:34:38<2387:10:38, 8.73s/it, lr=1e-5, step_loss=91.4][RANK-0]: Step: [15977], local_loss=0.029013417661190033, train_loss=0.06451108306646347, time_cost=4.378885746002197
+
Steps: 2%|▏ | 15977/1000000 [7:34:38<2387:10:38, 8.73s/it, lr=1e-5, step_loss=0.029]
Steps: 2%|▏ | 15978/1000000 [7:34:49<2512:29:11, 9.19s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [15978], local_loss=0.006938954349607229, train_loss=0.04815636947751045, time_cost=1.2235972881317139
+
Steps: 2%|▏ | 15978/1000000 [7:34:49<2512:29:11, 9.19s/it, lr=1e-5, step_loss=0.00694]
Steps: 2%|▏ | 15979/1000000 [7:34:55<2247:28:24, 8.22s/it, lr=1e-5, step_loss=0.00694][RANK-0]: Step: [15979], local_loss=0.021919449791312218, train_loss=0.13176022469997406, time_cost=1.784376859664917
+
Steps: 2%|▏ | 15979/1000000 [7:34:55<2247:28:24, 8.22s/it, lr=1e-5, step_loss=0.0219]
Steps: 2%|▏ | 15980/1000000 [7:35:08<2642:27:18, 9.67s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [15980], local_loss=0.0830514207482338, train_loss=0.04827733710408211, time_cost=1.7273151874542236
+
Steps: 2%|▏ | 15980/1000000 [7:35:08<2642:27:18, 9.67s/it, lr=1e-5, step_loss=0.0831]
Steps: 2%|▏ | 15981/1000000 [7:35:22<3024:43:13, 11.07s/it, lr=1e-5, step_loss=0.0831][RANK-0]: Step: [15981], local_loss=0.18716055154800415, train_loss=0.04109743982553482, time_cost=6.45024561882019
+
Steps: 2%|▏ | 15981/1000000 [7:35:22<3024:43:13, 11.07s/it, lr=1e-5, step_loss=0.187]
Steps: 2%|▏ | 15982/1000000 [7:35:27<2495:30:40, 9.13s/it, lr=1e-5, step_loss=0.187][RANK-0]: Step: [15982], local_loss=0.01775985024869442, train_loss=0.06845583766698837, time_cost=1.8817734718322754
+
Steps: 2%|▏ | 15982/1000000 [7:35:27<2495:30:40, 9.13s/it, lr=1e-5, step_loss=0.0178]
Steps: 2%|▏ | 15983/1000000 [7:35:38<2676:20:24, 9.79s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [15983], local_loss=0.013028999790549278, train_loss=0.03714140132069588, time_cost=8.983371257781982
+
Steps: 2%|▏ | 15983/1000000 [7:35:38<2676:20:24, 9.79s/it, lr=1e-5, step_loss=0.013]
Steps: 2%|▏ | 15984/1000000 [7:35:52<3057:44:32, 11.19s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [15984], local_loss=0.023176243528723717, train_loss=0.04696343094110489, time_cost=5.727756023406982
+
Steps: 2%|▏ | 15984/1000000 [7:35:52<3057:44:32, 11.19s/it, lr=1e-5, step_loss=0.0232]
Steps: 2%|▏ | 15985/1000000 [7:36:08<3410:40:56, 12.48s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [15985], local_loss=0.006216452457010746, train_loss=0.16202771663665771, time_cost=5.025741815567017
+
Steps: 2%|▏ | 15985/1000000 [7:36:08<3410:40:56, 12.48s/it, lr=1e-5, step_loss=0.00622]
Steps: 2%|▏ | 15986/1000000 [7:36:13<2797:46:29, 10.24s/it, lr=1e-5, step_loss=0.00622][RANK-0]: Step: [15986], local_loss=0.057696446776390076, train_loss=0.03046361729502678, time_cost=1.3413569927215576
+
Steps: 2%|▏ | 15986/1000000 [7:36:13<2797:46:29, 10.24s/it, lr=1e-5, step_loss=0.0577]
Steps: 2%|▏ | 15987/1000000 [7:36:23<2804:23:12, 10.26s/it, lr=1e-5, step_loss=0.0577][RANK-0]: Step: [15987], local_loss=0.00725588807836175, train_loss=0.03400075435638428, time_cost=2.5694010257720947
+
Steps: 2%|▏ | 15987/1000000 [7:36:23<2804:23:12, 10.26s/it, lr=1e-5, step_loss=0.00726]
Steps: 2%|▏ | 15988/1000000 [7:36:35<2966:49:08, 10.85s/it, lr=1e-5, step_loss=0.00726][RANK-0]: Step: [15988], local_loss=0.010354053229093552, train_loss=0.024030402302742004, time_cost=4.866771697998047
+
Steps: 2%|▏ | 15988/1000000 [7:36:35<2966:49:08, 10.85s/it, lr=1e-5, step_loss=0.0104]
Steps: 2%|▏ | 15989/1000000 [7:36:40<2486:28:35, 9.10s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [15989], local_loss=0.008881867863237858, train_loss=0.02385244332253933, time_cost=2.1142451763153076
+
Steps: 2%|▏ | 15989/1000000 [7:36:40<2486:28:35, 9.10s/it, lr=1e-5, step_loss=0.00888]
Steps: 2%|▏ | 15990/1000000 [7:36:53<2812:46:50, 10.29s/it, lr=1e-5, step_loss=0.00888][RANK-0]: Step: [15990], local_loss=0.03239106759428978, train_loss=0.0329732820391655, time_cost=1.9118685722351074
+
Steps: 2%|▏ | 15990/1000000 [7:36:53<2812:46:50, 10.29s/it, lr=1e-5, step_loss=0.0324]
Steps: 2%|▏ | 15991/1000000 [7:37:01<2583:28:45, 9.45s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [15991], local_loss=0.01764950528740883, train_loss=0.015245451591908932, time_cost=3.0141422748565674
+
Steps: 2%|▏ | 15991/1000000 [7:37:01<2583:28:45, 9.45s/it, lr=1e-5, step_loss=0.0176]
Steps: 2%|▏ | 15992/1000000 [7:37:13<2797:32:12, 10.23s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [15992], local_loss=0.01105912309139967, train_loss=0.020427044481039047, time_cost=3.972733736038208
+
Steps: 2%|▏ | 15992/1000000 [7:37:13<2797:32:12, 10.23s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 15993/1000000 [7:37:26<3056:51:08, 11.18s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [15993], local_loss=0.003689012723043561, train_loss=0.022295944392681122, time_cost=5.088056564331055
+
Steps: 2%|▏ | 15993/1000000 [7:37:26<3056:51:08, 11.18s/it, lr=1e-5, step_loss=0.00369]
Steps: 2%|▏ | 15994/1000000 [7:37:34<2735:52:57, 10.01s/it, lr=1e-5, step_loss=0.00369][RANK-0]: Step: [15994], local_loss=0.005512488540261984, train_loss=0.025096524506807327, time_cost=1.7600514888763428
+
Steps: 2%|▏ | 15994/1000000 [7:37:34<2735:52:57, 10.01s/it, lr=1e-5, step_loss=0.00551]
Steps: 2%|▏ | 15995/1000000 [7:37:45<2842:53:10, 10.40s/it, lr=1e-5, step_loss=0.00551][RANK-0]: Step: [15995], local_loss=0.06205008924007416, train_loss=0.03532508388161659, time_cost=1.6664073467254639
+
Steps: 2%|▏ | 15995/1000000 [7:37:45<2842:53:10, 10.40s/it, lr=1e-5, step_loss=0.0621]
Steps: 2%|▏ | 15996/1000000 [7:37:50<2397:42:21, 8.77s/it, lr=1e-5, step_loss=0.0621][RANK-0]: Step: [15996], local_loss=0.014710979536175728, train_loss=0.024275686591863632, time_cost=1.449434518814087
+
Steps: 2%|▏ | 15996/1000000 [7:37:50<2397:42:21, 8.77s/it, lr=1e-5, step_loss=0.0147]
Steps: 2%|▏ | 15997/1000000 [7:37:58<2322:27:01, 8.50s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [15997], local_loss=0.030392814427614212, train_loss=0.161087304353714, time_cost=1.4183998107910156
+
Steps: 2%|▏ | 15997/1000000 [7:37:58<2322:27:01, 8.50s/it, lr=1e-5, step_loss=0.0304]
Steps: 2%|▏ | 15998/1000000 [7:38:05<2189:36:28, 8.01s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [15998], local_loss=0.011949358507990837, train_loss=0.01747017540037632, time_cost=3.061297655105591
+
Steps: 2%|▏ | 15998/1000000 [7:38:05<2189:36:28, 8.01s/it, lr=1e-5, step_loss=0.0119]
Steps: 2%|▏ | 15999/1000000 [7:38:10<1960:47:19, 7.17s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [15999], local_loss=0.024489017203450203, train_loss=0.01948522962629795, time_cost=1.700974702835083
+
Steps: 2%|▏ | 15999/1000000 [7:38:10<1960:47:19, 7.17s/it, lr=1e-5, step_loss=0.0245]
Steps: 2%|▏ | 16000/1000000 [7:38:16<1857:50:50, 6.80s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [16000], local_loss=0.004341190680861473, train_loss=0.014632808975875378, time_cost=2.2837159633636475
+09/18/2024 17:02:19 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1/checkpoint-16000
+09/18/2024 17:02:19 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-18 17:02:19,222] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-18 17:02:19,252] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-18 17:02:19,253] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 17:02:36,086] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-18 17:02:36,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-18 17:03:10,867] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:10,868] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:10,868] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:11,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:11,734] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:11,735] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:12,235] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:12,266] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:12,267] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:12,267] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:12,297] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:12,298] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:12,356] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:12,356] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:12,356] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:12,393] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:12,393] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:12,393] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:12,427] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:12,427] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:12,427] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 17:03:12,528] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-18 17:03:12,528] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-18 17:03:12,528] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/18/2024 17:03:12 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/pytorch_model
+{'norm_num_groups', 'dropout', 'use_additional_conditions'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/model/diffusion_pytorch_model.safetensors
+09/18/2024 17:04:15 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/scheduler.bin
+09/18/2024 17:04:15 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/sampler.bin
+09/18/2024 17:04:15 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-16000/random_states_0.pkl
+09/18/2024 17:04:15 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1/checkpoint-16000
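The block above is what an Accelerate-driven DeepSpeed ZeRO save looks like: rank 0 writes the model states, every rank writes its own bf16 optimizer shard, and Accelerate then persists the scheduler, sampler and RNG states. A minimal sketch of the calls that produce it (assumptions: HF Accelerate with a DeepSpeed plugin configured via `accelerate launch`; the 2000-step interval and the helper name `maybe_save_state` are illustrative, not taken from the repo's training script):

```python
# Sketch only: illustrates accelerator.save_state()/load_state(), which emit the
# mp_rank_00_model_states.pt, bf16_zero_pp_rank_*_optim_states.pt, scheduler.bin,
# sampler.bin and random_states_0.pkl files named in the log above.
import os

from accelerate import Accelerator

accelerator = Accelerator()  # DeepSpeed/ZeRO config supplied by `accelerate launch`
# model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(...)

output_dir = "/home/save_dir/runs/allinpaint_stage1"  # matches the paths in the log
save_every = 2000  # assumed interval; the log only shows the save at step 16000

def maybe_save_state(global_step: int) -> None:
    # Must run on every rank: each ZeRO rank writes its own optimizer shard,
    # and Accelerate coordinates the rest of the checkpoint internally.
    if global_step % save_every == 0:
        accelerator.save_state(os.path.join(output_dir, f"checkpoint-{global_step}"))

# Resuming restores model weights, ZeRO optimizer shards, scheduler and RNG state:
# accelerator.load_state(os.path.join(output_dir, "checkpoint-16000"))
```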
Steps: 2%|▏ | 16000/1000000 [7:40:12<1857:50:50, 6.80s/it, lr=1e-5, step_loss=0.00434]
[... elided tqdm refreshes and per-step "[RANK-0]" records for steps 16001-16118. The per-step timing briefly read 40.34 s/it at step 16001, absorbing the roughly two-minute checkpoint write, before settling back to ~6-13 s/it; lr stayed at 1e-5, local_loss ranged 0.003-0.34, and train_loss stayed at roughly 0.01-0.22 with outliers of 19.83 (step 16036), 16.39 (step 16100) and 3.16 (step 16103). ...]
Steps: 2%|▏ | 16119/1000000 [7:58:29<2507:15:23, 9.17s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [16119], local_loss=0.017567817121744156, train_loss=0.074811190366745, time_cost=2.943265438079834
+
Steps: 2%|▏ | 16119/1000000 [7:58:29<2507:15:23, 9.17s/it, lr=1e-5, step_loss=0.0176]
Steps: 2%|▏ | 16120/1000000 [7:58:38<2487:06:07, 9.10s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [16120], local_loss=0.029918566346168518, train_loss=0.020308736711740494, time_cost=3.2563302516937256
+
Steps: 2%|▏ | 16120/1000000 [7:58:38<2487:06:07, 9.10s/it, lr=1e-5, step_loss=0.0299]
Steps: 2%|▏ | 16121/1000000 [7:58:54<3040:37:56, 11.13s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [16121], local_loss=0.10841518640518188, train_loss=0.03532605990767479, time_cost=6.3217620849609375
+
Steps: 2%|▏ | 16121/1000000 [7:58:54<3040:37:56, 11.13s/it, lr=1e-5, step_loss=0.108]
Steps: 2%|▏ | 16122/1000000 [7:59:09<3347:31:58, 12.25s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [16122], local_loss=0.006840684916824102, train_loss=0.01684163138270378, time_cost=2.6794137954711914
+
Steps: 2%|▏ | 16122/1000000 [7:59:09<3347:31:58, 12.25s/it, lr=1e-5, step_loss=0.00684]
Steps: 2%|▏ | 16123/1000000 [7:59:20<3251:04:22, 11.90s/it, lr=1e-5, step_loss=0.00684][RANK-0]: Step: [16123], local_loss=0.13131915032863617, train_loss=0.03170172870159149, time_cost=7.917720079421997
+
Steps: 2%|▏ | 16123/1000000 [7:59:20<3251:04:22, 11.90s/it, lr=1e-5, step_loss=0.131]
Steps: 2%|▏ | 16124/1000000 [7:59:32<3258:46:25, 11.92s/it, lr=1e-5, step_loss=0.131][RANK-0]: Step: [16124], local_loss=0.17333120107650757, train_loss=0.0775827094912529, time_cost=3.529278039932251
+
Steps: 2%|▏ | 16124/1000000 [7:59:32<3258:46:25, 11.92s/it, lr=1e-5, step_loss=0.173]
Steps: 2%|▏ | 16125/1000000 [7:59:39<2862:10:52, 10.47s/it, lr=1e-5, step_loss=0.173][RANK-0]: Step: [16125], local_loss=0.017904730513691902, train_loss=0.03750491142272949, time_cost=4.3155677318573
+
Steps: 2%|▏ | 16125/1000000 [7:59:39<2862:10:52, 10.47s/it, lr=1e-5, step_loss=0.0179]
Steps: 2%|▏ | 16126/1000000 [7:59:52<3086:55:38, 11.30s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [16126], local_loss=0.059749651700258255, train_loss=0.11028029024600983, time_cost=4.886650085449219
+
Steps: 2%|▏ | 16126/1000000 [7:59:52<3086:55:38, 11.30s/it, lr=1e-5, step_loss=0.0597]
Steps: 2%|▏ | 16127/1000000 [8:00:04<3076:54:41, 11.26s/it, lr=1e-5, step_loss=0.0597][RANK-0]: Step: [16127], local_loss=0.0038655083626508713, train_loss=0.027164939790964127, time_cost=2.727017879486084
+
Steps: 2%|▏ | 16127/1000000 [8:00:04<3076:54:41, 11.26s/it, lr=1e-5, step_loss=0.00387]
Steps: 2%|▏ | 16128/1000000 [8:00:09<2586:32:44, 9.46s/it, lr=1e-5, step_loss=0.00387][RANK-0]: Step: [16128], local_loss=0.4650763273239136, train_loss=0.09320100396871567, time_cost=2.3435564041137695
+
Steps: 2%|▏ | 16128/1000000 [8:00:09<2586:32:44, 9.46s/it, lr=1e-5, step_loss=0.465]
Steps: 2%|▏ | 16129/1000000 [8:00:20<2749:11:25, 10.06s/it, lr=1e-5, step_loss=0.465][RANK-0]: Step: [16129], local_loss=0.054836682975292206, train_loss=0.04206588864326477, time_cost=4.922401666641235
+
Steps: 2%|▏ | 16129/1000000 [8:00:20<2749:11:25, 10.06s/it, lr=1e-5, step_loss=0.0548]
Steps: 2%|▏ | 16130/1000000 [8:00:29<2656:55:24, 9.72s/it, lr=1e-5, step_loss=0.0548][RANK-0]: Step: [16130], local_loss=0.012919096276164055, train_loss=0.1403844803571701, time_cost=2.0642812252044678
+
Steps: 2%|▏ | 16130/1000000 [8:00:29<2656:55:24, 9.72s/it, lr=1e-5, step_loss=0.0129]
Steps: 2%|▏ | 16131/1000000 [8:00:45<3175:01:05, 11.62s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [16131], local_loss=0.04091417044401169, train_loss=0.16841571033000946, time_cost=7.936307191848755
+
Steps: 2%|▏ | 16131/1000000 [8:00:45<3175:01:05, 11.62s/it, lr=1e-5, step_loss=0.0409]
Steps: 2%|▏ | 16132/1000000 [8:00:50<2578:13:36, 9.43s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [16132], local_loss=0.048042964190244675, train_loss=0.03797776997089386, time_cost=3.5689914226531982
+
Steps: 2%|▏ | 16132/1000000 [8:00:50<2578:13:36, 9.43s/it, lr=1e-5, step_loss=0.048]
Steps: 2%|▏ | 16133/1000000 [8:00:54<2205:13:38, 8.07s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [16133], local_loss=0.06488977372646332, train_loss=0.0696878731250763, time_cost=1.7712440490722656
+
Steps: 2%|▏ | 16133/1000000 [8:00:54<2205:13:38, 8.07s/it, lr=1e-5, step_loss=0.0649]
Steps: 2%|▏ | 16134/1000000 [8:01:04<2322:58:45, 8.50s/it, lr=1e-5, step_loss=0.0649][RANK-0]: Step: [16134], local_loss=0.03971846401691437, train_loss=0.035653479397296906, time_cost=6.174379110336304
+
Steps: 2%|▏ | 16134/1000000 [8:01:04<2322:58:45, 8.50s/it, lr=1e-5, step_loss=0.0397]
Steps: 2%|▏ | 16135/1000000 [8:01:13<2360:06:14, 8.64s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [16135], local_loss=0.022516850382089615, train_loss=0.0331304594874382, time_cost=1.220226526260376
+
Steps: 2%|▏ | 16135/1000000 [8:01:13<2360:06:14, 8.64s/it, lr=1e-5, step_loss=0.0225]
Steps: 2%|▏ | 16136/1000000 [8:01:22<2403:35:30, 8.79s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [16136], local_loss=0.027682466432452202, train_loss=0.051090385764837265, time_cost=1.7365357875823975
+
Steps: 2%|▏ | 16136/1000000 [8:01:22<2403:35:30, 8.79s/it, lr=1e-5, step_loss=0.0277]
Steps: 2%|▏ | 16137/1000000 [8:01:27<2120:08:54, 7.76s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [16137], local_loss=0.009467929601669312, train_loss=0.04681873321533203, time_cost=2.682204484939575
+
Steps: 2%|▏ | 16137/1000000 [8:01:27<2120:08:54, 7.76s/it, lr=1e-5, step_loss=0.00947]
Steps: 2%|▏ | 16138/1000000 [8:01:34<2035:10:08, 7.45s/it, lr=1e-5, step_loss=0.00947][RANK-0]: Step: [16138], local_loss=0.008264326490461826, train_loss=0.028985051438212395, time_cost=2.5263404846191406
+
Steps: 2%|▏ | 16138/1000000 [8:01:34<2035:10:08, 7.45s/it, lr=1e-5, step_loss=0.00826]
Steps: 2%|▏ | 16139/1000000 [8:01:41<1976:09:42, 7.23s/it, lr=1e-5, step_loss=0.00826][RANK-0]: Step: [16139], local_loss=0.025704652070999146, train_loss=0.050506796687841415, time_cost=2.8619019985198975
+
Steps: 2%|▏ | 16139/1000000 [8:01:41<1976:09:42, 7.23s/it, lr=1e-5, step_loss=0.0257]
Steps: 2%|▏ | 16140/1000000 [8:01:50<2118:21:19, 7.75s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [16140], local_loss=0.10417213290929794, train_loss=0.032487716525793076, time_cost=1.3655736446380615
+
Steps: 2%|▏ | 16140/1000000 [8:01:50<2118:21:19, 7.75s/it, lr=1e-5, step_loss=0.104]
Steps: 2%|▏ | 16141/1000000 [8:01:54<1842:21:21, 6.74s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [16141], local_loss=0.06797460466623306, train_loss=0.03355567902326584, time_cost=1.4134533405303955
+
Steps: 2%|▏ | 16141/1000000 [8:01:54<1842:21:21, 6.74s/it, lr=1e-5, step_loss=0.068]
Steps: 2%|▏ | 16142/1000000 [8:02:05<2194:43:26, 8.03s/it, lr=1e-5, step_loss=0.068][RANK-0]: Step: [16142], local_loss=0.023680277168750763, train_loss=0.044835321605205536, time_cost=3.096691608428955
+
Steps: 2%|▏ | 16142/1000000 [8:02:05<2194:43:26, 8.03s/it, lr=1e-5, step_loss=0.0237]
Steps: 2%|▏ | 16143/1000000 [8:02:10<1907:41:12, 6.98s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [16143], local_loss=0.012505041435360909, train_loss=0.08692027628421783, time_cost=1.6206996440887451
+
Steps: 2%|▏ | 16143/1000000 [8:02:10<1907:41:12, 6.98s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 16144/1000000 [8:02:16<1818:21:06, 6.65s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [16144], local_loss=1.0020426511764526, train_loss=0.2722172737121582, time_cost=3.357083320617676
+
Steps: 2%|▏ | 16144/1000000 [8:02:16<1818:21:06, 6.65s/it, lr=1e-5, step_loss=1]
Steps: 2%|▏ | 16145/1000000 [8:02:24<1995:32:14, 7.30s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [16145], local_loss=0.021074730902910233, train_loss=0.027992159128189087, time_cost=2.1304478645324707
+
Steps: 2%|▏ | 16145/1000000 [8:02:24<1995:32:14, 7.30s/it, lr=1e-5, step_loss=0.0211]
Steps: 2%|▏ | 16146/1000000 [8:02:34<2211:09:16, 8.09s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [16146], local_loss=0.010591478087008, train_loss=0.022168850526213646, time_cost=1.2157230377197266
+
Steps: 2%|▏ | 16146/1000000 [8:02:34<2211:09:16, 8.09s/it, lr=1e-5, step_loss=0.0106]
Steps: 2%|▏ | 16147/1000000 [8:02:41<2105:18:47, 7.70s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [16147], local_loss=0.00532566849142313, train_loss=0.012960616499185562, time_cost=2.3478994369506836
+
Steps: 2%|▏ | 16147/1000000 [8:02:41<2105:18:47, 7.70s/it, lr=1e-5, step_loss=0.00533]
Steps: 2%|▏ | 16148/1000000 [8:02:55<2564:43:20, 9.38s/it, lr=1e-5, step_loss=0.00533][RANK-0]: Step: [16148], local_loss=0.006601652130484581, train_loss=0.09852030873298645, time_cost=4.009793758392334
+
Steps: 2%|▏ | 16148/1000000 [8:02:55<2564:43:20, 9.38s/it, lr=1e-5, step_loss=0.0066]
Steps: 2%|▏ | 16149/1000000 [8:03:05<2687:05:53, 9.83s/it, lr=1e-5, step_loss=0.0066][RANK-0]: Step: [16149], local_loss=0.03181883320212364, train_loss=0.023226357996463776, time_cost=1.220656156539917
+
Steps: 2%|▏ | 16149/1000000 [8:03:05<2687:05:53, 9.83s/it, lr=1e-5, step_loss=0.0318]
Steps: 2%|▏ | 16150/1000000 [8:03:11<2344:10:25, 8.58s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [16150], local_loss=0.0839640200138092, train_loss=0.02536782994866371, time_cost=2.9278197288513184
+
Steps: 2%|▏ | 16150/1000000 [8:03:11<2344:10:25, 8.58s/it, lr=1e-5, step_loss=0.084]
Steps: 2%|▏ | 16151/1000000 [8:03:19<2281:56:27, 8.35s/it, lr=1e-5, step_loss=0.084][RANK-0]: Step: [16151], local_loss=0.006492302753031254, train_loss=0.027838805690407753, time_cost=2.3027937412261963
+
Steps: 2%|▏ | 16151/1000000 [8:03:19<2281:56:27, 8.35s/it, lr=1e-5, step_loss=0.00649]
Steps: 2%|▏ | 16152/1000000 [8:03:26<2187:01:43, 8.00s/it, lr=1e-5, step_loss=0.00649][RANK-0]: Step: [16152], local_loss=0.010738967917859554, train_loss=0.01886829361319542, time_cost=3.3877060413360596
+
Steps: 2%|▏ | 16152/1000000 [8:03:26<2187:01:43, 8.00s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 16153/1000000 [8:03:31<1934:06:58, 7.08s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [16153], local_loss=0.33066287636756897, train_loss=0.22727467119693756, time_cost=2.6215720176696777
+
Steps: 2%|▏ | 16153/1000000 [8:03:31<1934:06:58, 7.08s/it, lr=1e-5, step_loss=0.331]
Steps: 2%|▏ | 16154/1000000 [8:03:40<2105:26:43, 7.70s/it, lr=1e-5, step_loss=0.331][RANK-0]: Step: [16154], local_loss=0.020699355751276016, train_loss=0.043256163597106934, time_cost=1.2322063446044922
+
Steps: 2%|▏ | 16154/1000000 [8:03:40<2105:26:43, 7.70s/it, lr=1e-5, step_loss=0.0207]
Steps: 2%|▏ | 16155/1000000 [8:03:50<2266:42:33, 8.29s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [16155], local_loss=0.014568579383194447, train_loss=19.91119384765625, time_cost=4.55719780921936
+
Steps: 2%|▏ | 16155/1000000 [8:03:50<2266:42:33, 8.29s/it, lr=1e-5, step_loss=0.0146]
Steps: 2%|▏ | 16156/1000000 [8:03:56<2117:09:13, 7.75s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [16156], local_loss=0.08264721184968948, train_loss=0.02390730381011963, time_cost=2.4014084339141846
+
Steps: 2%|▏ | 16156/1000000 [8:03:56<2117:09:13, 7.75s/it, lr=1e-5, step_loss=0.0826]
Steps: 2%|▏ | 16157/1000000 [8:04:10<2602:49:14, 9.52s/it, lr=1e-5, step_loss=0.0826][RANK-0]: Step: [16157], local_loss=0.020412607118487358, train_loss=0.03410359099507332, time_cost=4.737490892410278
+
Steps: 2%|▏ | 16157/1000000 [8:04:10<2602:49:14, 9.52s/it, lr=1e-5, step_loss=0.0204]
Steps: 2%|▏ | 16158/1000000 [8:04:15<2271:34:15, 8.31s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [16158], local_loss=0.01292004156857729, train_loss=0.021792951971292496, time_cost=2.8540027141571045
+
Steps: 2%|▏ | 16158/1000000 [8:04:15<2271:34:15, 8.31s/it, lr=1e-5, step_loss=0.0129]
Steps: 2%|▏ | 16159/1000000 [8:04:20<1934:05:34, 7.08s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [16159], local_loss=0.0442286916077137, train_loss=0.08873596042394638, time_cost=1.1955866813659668
+
Steps: 2%|▏ | 16159/1000000 [8:04:20<1934:05:34, 7.08s/it, lr=1e-5, step_loss=0.0442]
Steps: 2%|▏ | 16160/1000000 [8:04:27<1972:02:56, 7.22s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [16160], local_loss=0.03664473816752434, train_loss=0.02171538583934307, time_cost=1.6417603492736816
+
Steps: 2%|▏ | 16160/1000000 [8:04:27<1972:02:56, 7.22s/it, lr=1e-5, step_loss=0.0366]
Steps: 2%|▏ | 16161/1000000 [8:04:36<2143:47:16, 7.84s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [16161], local_loss=0.0376090332865715, train_loss=0.018349217250943184, time_cost=3.071535110473633
+
Steps: 2%|▏ | 16161/1000000 [8:04:37<2143:47:16, 7.84s/it, lr=1e-5, step_loss=0.0376]
Steps: 2%|▏ | 16162/1000000 [8:04:43<2066:28:07, 7.56s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [16162], local_loss=0.018212350085377693, train_loss=0.023510383442044258, time_cost=3.100858449935913
+
Steps: 2%|▏ | 16162/1000000 [8:04:43<2066:28:07, 7.56s/it, lr=1e-5, step_loss=0.0182]
Steps: 2%|▏ | 16163/1000000 [8:04:57<2529:48:10, 9.26s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [16163], local_loss=0.024356799200177193, train_loss=0.05616593733429909, time_cost=4.968812704086304
+
Steps: 2%|▏ | 16163/1000000 [8:04:57<2529:48:10, 9.26s/it, lr=1e-5, step_loss=0.0244]
Steps: 2%|▏ | 16164/1000000 [8:05:10<2891:32:02, 10.58s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [16164], local_loss=0.0074572996236383915, train_loss=0.02237841486930847, time_cost=4.490358829498291
+
Steps: 2%|▏ | 16164/1000000 [8:05:10<2891:32:02, 10.58s/it, lr=1e-5, step_loss=0.00746]
Steps: 2%|▏ | 16165/1000000 [8:05:18<2633:19:26, 9.64s/it, lr=1e-5, step_loss=0.00746][RANK-0]: Step: [16165], local_loss=0.00950108002871275, train_loss=0.025335092097520828, time_cost=1.2563154697418213
+
Steps: 2%|▏ | 16165/1000000 [8:05:18<2633:19:26, 9.64s/it, lr=1e-5, step_loss=0.0095]
Steps: 2%|▏ | 16166/1000000 [8:05:22<2229:20:00, 8.16s/it, lr=1e-5, step_loss=0.0095][RANK-0]: Step: [16166], local_loss=0.0389542356133461, train_loss=0.07283530384302139, time_cost=1.8254008293151855
+
Steps: 2%|▏ | 16166/1000000 [8:05:22<2229:20:00, 8.16s/it, lr=1e-5, step_loss=0.039]
Steps: 2%|▏ | 16167/1000000 [8:05:34<2517:33:40, 9.21s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [16167], local_loss=0.037577494978904724, train_loss=0.042412057518959045, time_cost=7.8499085903167725
+
Steps: 2%|▏ | 16167/1000000 [8:05:34<2517:33:40, 9.21s/it, lr=1e-5, step_loss=0.0376]
Steps: 2%|▏ | 16168/1000000 [8:05:43<2485:13:06, 9.09s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [16168], local_loss=0.06532846391201019, train_loss=0.03958591818809509, time_cost=2.2841591835021973
+
Steps: 2%|▏ | 16168/1000000 [8:05:43<2485:13:06, 9.09s/it, lr=1e-5, step_loss=0.0653]
Steps: 2%|▏ | 16169/1000000 [8:05:48<2168:24:58, 7.93s/it, lr=1e-5, step_loss=0.0653][RANK-0]: Step: [16169], local_loss=0.07299275696277618, train_loss=0.08400388062000275, time_cost=2.5069477558135986
+
Steps: 2%|▏ | 16169/1000000 [8:05:48<2168:24:58, 7.93s/it, lr=1e-5, step_loss=0.073]
Steps: 2%|▏ | 16170/1000000 [8:06:00<2456:08:45, 8.99s/it, lr=1e-5, step_loss=0.073][RANK-0]: Step: [16170], local_loss=0.00800103135406971, train_loss=0.05420343577861786, time_cost=2.838046073913574
+
Steps: 2%|▏ | 16170/1000000 [8:06:00<2456:08:45, 8.99s/it, lr=1e-5, step_loss=0.008]
Steps: 2%|▏ | 16171/1000000 [8:06:07<2296:48:56, 8.40s/it, lr=1e-5, step_loss=0.008][RANK-0]: Step: [16171], local_loss=0.9995906352996826, train_loss=0.29327821731567383, time_cost=2.920093536376953
+
Steps: 2%|▏ | 16171/1000000 [8:06:07<2296:48:56, 8.40s/it, lr=1e-5, step_loss=1]
Steps: 2%|▏ | 16172/1000000 [8:06:12<2027:01:23, 7.42s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [16172], local_loss=0.006018318235874176, train_loss=0.02442755177617073, time_cost=2.5234792232513428
+
Steps: 2%|▏ | 16172/1000000 [8:06:12<2027:01:23, 7.42s/it, lr=1e-5, step_loss=0.00602]
Steps: 2%|▏ | 16173/1000000 [8:06:18<1905:47:36, 6.97s/it, lr=1e-5, step_loss=0.00602][RANK-0]: Step: [16173], local_loss=0.03824108839035034, train_loss=0.038793355226516724, time_cost=3.102525472640991
+
Steps: 2%|▏ | 16173/1000000 [8:06:18<1905:47:36, 6.97s/it, lr=1e-5, step_loss=0.0382]
Steps: 2%|▏ | 16174/1000000 [8:06:29<2230:47:30, 8.16s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [16174], local_loss=0.9877355098724365, train_loss=0.21860072016716003, time_cost=4.1748998165130615
+
Steps: 2%|▏ | 16174/1000000 [8:06:29<2230:47:30, 8.16s/it, lr=1e-5, step_loss=0.988]
Steps: 2%|▏ | 16175/1000000 [8:06:39<2380:48:42, 8.71s/it, lr=1e-5, step_loss=0.988][RANK-0]: Step: [16175], local_loss=0.005217067897319794, train_loss=0.015639042481780052, time_cost=4.664712190628052
+
Steps: 2%|▏ | 16175/1000000 [8:06:39<2380:48:42, 8.71s/it, lr=1e-5, step_loss=0.00522]
Steps: 2%|▏ | 16176/1000000 [8:06:49<2529:20:23, 9.26s/it, lr=1e-5, step_loss=0.00522][RANK-0]: Step: [16176], local_loss=0.06356887519359589, train_loss=0.08420273661613464, time_cost=2.0498087406158447
+
Steps: 2%|▏ | 16176/1000000 [8:06:49<2529:20:23, 9.26s/it, lr=1e-5, step_loss=0.0636]
Steps: 2%|▏ | 16177/1000000 [8:06:57<2406:02:07, 8.80s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [16177], local_loss=0.010694652795791626, train_loss=0.027863822877407074, time_cost=1.528693437576294
+
Steps: 2%|▏ | 16177/1000000 [8:06:57<2406:02:07, 8.80s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 16178/1000000 [8:07:05<2381:52:41, 8.72s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [16178], local_loss=0.11647722870111465, train_loss=0.04564956948161125, time_cost=1.397934913635254
+
Steps: 2%|▏ | 16178/1000000 [8:07:05<2381:52:41, 8.72s/it, lr=1e-5, step_loss=0.116]
Steps: 2%|▏ | 16179/1000000 [8:07:18<2722:13:47, 9.96s/it, lr=1e-5, step_loss=0.116][RANK-0]: Step: [16179], local_loss=0.006787192542105913, train_loss=0.13804717361927032, time_cost=1.2081079483032227
+
Steps: 2%|▏ | 16179/1000000 [8:07:18<2722:13:47, 9.96s/it, lr=1e-5, step_loss=0.00679]
Steps: 2%|▏ | 16180/1000000 [8:07:34<3190:41:22, 11.68s/it, lr=1e-5, step_loss=0.00679][RANK-0]: Step: [16180], local_loss=0.032381318509578705, train_loss=0.025004060938954353, time_cost=3.9508211612701416
+
Steps: 2%|▏ | 16180/1000000 [8:07:34<3190:41:22, 11.68s/it, lr=1e-5, step_loss=0.0324]
Steps: 2%|▏ | 16181/1000000 [8:07:41<2838:22:43, 10.39s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [16181], local_loss=0.009454491548240185, train_loss=0.02891572006046772, time_cost=1.558833122253418
+
Steps: 2%|▏ | 16181/1000000 [8:07:41<2838:22:43, 10.39s/it, lr=1e-5, step_loss=0.00945]
Steps: 2%|▏ | 16182/1000000 [8:07:48<2557:06:26, 9.36s/it, lr=1e-5, step_loss=0.00945][RANK-0]: Step: [16182], local_loss=44.845252990722656, train_loss=5.646780967712402, time_cost=5.875410318374634
+
Steps: 2%|▏ | 16182/1000000 [8:07:48<2557:06:26, 9.36s/it, lr=1e-5, step_loss=44.8]
Steps: 2%|▏ | 16183/1000000 [8:07:53<2142:38:00, 7.84s/it, lr=1e-5, step_loss=44.8][RANK-0]: Step: [16183], local_loss=0.047625310719013214, train_loss=0.06058667600154877, time_cost=1.3514344692230225
+
Steps: 2%|▏ | 16183/1000000 [8:07:53<2142:38:00, 7.84s/it, lr=1e-5, step_loss=0.0476]
Steps: 2%|▏ | 16184/1000000 [8:07:58<1920:34:48, 7.03s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [16184], local_loss=0.024395931512117386, train_loss=0.036835432052612305, time_cost=1.9387891292572021
+
Steps: 2%|▏ | 16184/1000000 [8:07:58<1920:34:48, 7.03s/it, lr=1e-5, step_loss=0.0244]
Steps: 2%|▏ | 16185/1000000 [8:08:10<2390:19:51, 8.75s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [16185], local_loss=0.006470009684562683, train_loss=15.660236358642578, time_cost=3.7654097080230713
+
Steps: 2%|▏ | 16185/1000000 [8:08:10<2390:19:51, 8.75s/it, lr=1e-5, step_loss=0.00647]
Steps: 2%|▏ | 16186/1000000 [8:08:15<2021:48:01, 7.40s/it, lr=1e-5, step_loss=0.00647][RANK-0]: Step: [16186], local_loss=0.050459813326597214, train_loss=0.03546687588095665, time_cost=1.6137843132019043
+
Steps: 2%|▏ | 16186/1000000 [8:08:15<2021:48:01, 7.40s/it, lr=1e-5, step_loss=0.0505]
Steps: 2%|▏ | 16187/1000000 [8:08:23<2106:57:50, 7.71s/it, lr=1e-5, step_loss=0.0505][RANK-0]: Step: [16187], local_loss=0.020106058567762375, train_loss=0.12288352102041245, time_cost=1.2288968563079834
+
Steps: 2%|▏ | 16187/1000000 [8:08:23<2106:57:50, 7.71s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 16188/1000000 [8:08:37<2586:23:54, 9.46s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [16188], local_loss=0.02155289612710476, train_loss=0.054297201335430145, time_cost=1.5404739379882812
+
Steps: 2%|▏ | 16188/1000000 [8:08:37<2586:23:54, 9.46s/it, lr=1e-5, step_loss=0.0216]
Steps: 2%|▏ | 16189/1000000 [8:08:42<2229:03:18, 8.16s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [16189], local_loss=0.02425410971045494, train_loss=0.0194247979670763, time_cost=2.244945764541626
+
Steps: 2%|▏ | 16189/1000000 [8:08:42<2229:03:18, 8.16s/it, lr=1e-5, step_loss=0.0243]
Steps: 2%|▏ | 16190/1000000 [8:08:46<1939:35:58, 7.10s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [16190], local_loss=0.04800815507769585, train_loss=0.044017642736434937, time_cost=3.5531163215637207
+
Steps: 2%|▏ | 16190/1000000 [8:08:46<1939:35:58, 7.10s/it, lr=1e-5, step_loss=0.048]
Steps: 2%|▏ | 16191/1000000 [8:08:56<2118:14:32, 7.75s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [16191], local_loss=0.055254556238651276, train_loss=0.02236148715019226, time_cost=2.4230594635009766
+
Steps: 2%|▏ | 16191/1000000 [8:08:56<2118:14:32, 7.75s/it, lr=1e-5, step_loss=0.0553]
Steps: 2%|▏ | 16192/1000000 [8:09:06<2286:48:05, 8.37s/it, lr=1e-5, step_loss=0.0553][RANK-0]: Step: [16192], local_loss=0.02326870709657669, train_loss=0.2816663384437561, time_cost=3.0253307819366455
+
Steps: 2%|▏ | 16192/1000000 [8:09:06<2286:48:05, 8.37s/it, lr=1e-5, step_loss=0.0233]
Steps: 2%|▏ | 16193/1000000 [8:09:18<2653:24:14, 9.71s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [16193], local_loss=0.007811015471816063, train_loss=58.194976806640625, time_cost=1.3203797340393066
+
Steps: 2%|▏ | 16193/1000000 [8:09:18<2653:24:14, 9.71s/it, lr=1e-5, step_loss=0.00781]
Steps: 2%|▏ | 16194/1000000 [8:09:24<2304:02:52, 8.43s/it, lr=1e-5, step_loss=0.00781][RANK-0]: Step: [16194], local_loss=0.02026369422674179, train_loss=0.03537721559405327, time_cost=2.8080127239227295
+
Steps: 2%|▏ | 16194/1000000 [8:09:24<2304:02:52, 8.43s/it, lr=1e-5, step_loss=0.0203]
Steps: 2%|▏ | 16195/1000000 [8:09:37<2694:18:43, 9.86s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [16195], local_loss=0.027309678494930267, train_loss=0.12085659056901932, time_cost=3.130784273147583
+
Steps: 2%|▏ | 16195/1000000 [8:09:37<2694:18:43, 9.86s/it, lr=1e-5, step_loss=0.0273]
Steps: 2%|▏ | 16196/1000000 [8:09:48<2768:47:12, 10.13s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [16196], local_loss=0.27546536922454834, train_loss=0.053799472749233246, time_cost=3.325195074081421
+
Steps: 2%|▏ | 16196/1000000 [8:09:48<2768:47:12, 10.13s/it, lr=1e-5, step_loss=0.275]
Steps: 2%|▏ | 16197/1000000 [8:09:53<2361:30:13, 8.64s/it, lr=1e-5, step_loss=0.275][RANK-0]: Step: [16197], local_loss=0.008021418005228043, train_loss=0.039235472679138184, time_cost=1.9002113342285156
+
Steps: 2%|▏ | 16197/1000000 [8:09:53<2361:30:13, 8.64s/it, lr=1e-5, step_loss=0.00802]
Steps: 2%|▏ | 16198/1000000 [8:10:03<2441:37:44, 8.93s/it, lr=1e-5, step_loss=0.00802][RANK-0]: Step: [16198], local_loss=0.08647438883781433, train_loss=0.02828322909772396, time_cost=2.627873420715332
+
Steps: 2%|▏ | 16198/1000000 [8:10:03<2441:37:44, 8.93s/it, lr=1e-5, step_loss=0.0865]
Steps: 2%|▏ | 16199/1000000 [8:10:15<2741:05:54, 10.03s/it, lr=1e-5, step_loss=0.0865][RANK-0]: Step: [16199], local_loss=0.010058499872684479, train_loss=0.16849297285079956, time_cost=3.296168804168701
+
Steps: 2%|▏ | 16199/1000000 [8:10:15<2741:05:54, 10.03s/it, lr=1e-5, step_loss=0.0101]
Steps: 2%|▏ | 16200/1000000 [8:10:22<2483:20:27, 9.09s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [16200], local_loss=0.016105985268950462, train_loss=0.01652546040713787, time_cost=1.2506871223449707
+
Steps: 2%|▏ | 16200/1000000 [8:10:22<2483:20:27, 9.09s/it, lr=1e-5, step_loss=0.0161]
Steps: 2%|▏ | 16201/1000000 [8:10:30<2394:39:23, 8.76s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [16201], local_loss=0.3715416193008423, train_loss=0.07560931146144867, time_cost=1.5339066982269287
+
Steps: 2%|▏ | 16201/1000000 [8:10:30<2394:39:23, 8.76s/it, lr=1e-5, step_loss=0.372]
Steps: 2%|▏ | 16202/1000000 [8:10:37<2256:14:34, 8.26s/it, lr=1e-5, step_loss=0.372][RANK-0]: Step: [16202], local_loss=0.008515531197190285, train_loss=0.13251696527004242, time_cost=1.4458091259002686
+
Steps: 2%|▏ | 16202/1000000 [8:10:37<2256:14:34, 8.26s/it, lr=1e-5, step_loss=0.00852]
Steps: 2%|▏ | 16203/1000000 [8:10:50<2647:34:45, 9.69s/it, lr=1e-5, step_loss=0.00852][RANK-0]: Step: [16203], local_loss=0.029307575896382332, train_loss=0.03278888389468193, time_cost=4.952514410018921
+
Steps: 2%|▏ | 16203/1000000 [8:10:50<2647:34:45, 9.69s/it, lr=1e-5, step_loss=0.0293]
Steps: 2%|▏ | 16204/1000000 [8:11:01<2759:59:34, 10.10s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [16204], local_loss=0.031140603125095367, train_loss=0.015577418729662895, time_cost=9.924379825592041
+
Steps: 2%|▏ | 16204/1000000 [8:11:01<2759:59:34, 10.10s/it, lr=1e-5, step_loss=0.0311]
Steps: 2%|▏ | 16205/1000000 [8:11:12<2824:44:50, 10.34s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [16205], local_loss=0.12221424281597137, train_loss=0.03297997638583183, time_cost=7.957953929901123
+
Steps: 2%|▏ | 16205/1000000 [8:11:12<2824:44:50, 10.34s/it, lr=1e-5, step_loss=0.122]
Steps: 2%|▏ | 16206/1000000 [8:11:19<2569:03:10, 9.40s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [16206], local_loss=0.01775151491165161, train_loss=0.01707647368311882, time_cost=2.2133898735046387
+
Steps: 2%|▏ | 16206/1000000 [8:11:19<2569:03:10, 9.40s/it, lr=1e-5, step_loss=0.0178]
Steps: 2%|▏ | 16207/1000000 [8:11:24<2181:14:15, 7.98s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [16207], local_loss=0.012121441774070263, train_loss=0.10810650885105133, time_cost=2.2507119178771973
+
Steps: 2%|▏ | 16207/1000000 [8:11:24<2181:14:15, 7.98s/it, lr=1e-5, step_loss=0.0121]
Steps: 2%|▏ | 16208/1000000 [8:11:31<2127:39:59, 7.79s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [16208], local_loss=0.05770193412899971, train_loss=49.76343536376953, time_cost=1.793123722076416
+
Steps: 2%|▏ | 16208/1000000 [8:11:31<2127:39:59, 7.79s/it, lr=1e-5, step_loss=0.0577]
Steps: 2%|▏ | 16209/1000000 [8:11:43<2471:25:47, 9.04s/it, lr=1e-5, step_loss=0.0577][RANK-0]: Step: [16209], local_loss=0.011065035127103329, train_loss=0.040241725742816925, time_cost=5.027510166168213
+
Steps: 2%|▏ | 16209/1000000 [8:11:43<2471:25:47, 9.04s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 16210/1000000 [8:11:58<2964:37:16, 10.85s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [16210], local_loss=0.10654100775718689, train_loss=0.028462346643209457, time_cost=6.750795125961304
+
Steps: 2%|▏ | 16210/1000000 [8:11:58<2964:37:16, 10.85s/it, lr=1e-5, step_loss=0.107]
Steps: 2%|▏ | 16211/1000000 [8:12:12<3166:32:34, 11.59s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [16211], local_loss=0.00953198503702879, train_loss=0.029348550364375114, time_cost=6.100938081741333
+
Steps: 2%|▏ | 16211/1000000 [8:12:12<3166:32:34, 11.59s/it, lr=1e-5, step_loss=0.00953]
Steps: 2%|▏ | 16212/1000000 [8:12:26<3425:35:18, 12.54s/it, lr=1e-5, step_loss=0.00953][RANK-0]: Step: [16212], local_loss=0.020511524751782417, train_loss=0.1450163871049881, time_cost=1.2263848781585693
+
Steps: 2%|▏ | 16212/1000000 [8:12:26<3425:35:18, 12.54s/it, lr=1e-5, step_loss=0.0205]
Steps: 2%|▏ | 16213/1000000 [8:12:31<2745:26:11, 10.05s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [16213], local_loss=0.006448841188102961, train_loss=0.04058688133955002, time_cost=3.062504529953003
+
Steps: 2%|▏ | 16213/1000000 [8:12:31<2745:26:11, 10.05s/it, lr=1e-5, step_loss=0.00645]
Steps: 2%|▏ | 16214/1000000 [8:12:44<2984:18:09, 10.92s/it, lr=1e-5, step_loss=0.00645][RANK-0]: Step: [16214], local_loss=0.03845268860459328, train_loss=0.0317382737994194, time_cost=5.464059591293335
+
Steps: 2%|▏ | 16214/1000000 [8:12:44<2984:18:09, 10.92s/it, lr=1e-5, step_loss=0.0385]
Steps: 2%|▏ | 16215/1000000 [8:12:54<2911:16:19, 10.65s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [16215], local_loss=0.006495364475995302, train_loss=0.034024275839328766, time_cost=5.009405851364136
+
Steps: 2%|▏ | 16215/1000000 [8:12:54<2911:16:19, 10.65s/it, lr=1e-5, step_loss=0.0065]
Steps: 2%|▏ | 16216/1000000 [8:13:05<2931:59:05, 10.73s/it, lr=1e-5, step_loss=0.0065][RANK-0]: Step: [16216], local_loss=0.02009432204067707, train_loss=0.036338817328214645, time_cost=2.1217153072357178
+
Steps: 2%|▏ | 16216/1000000 [8:13:05<2931:59:05, 10.73s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 16217/1000000 [8:13:16<3009:41:35, 11.01s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [16217], local_loss=0.16642440855503082, train_loss=7.956238269805908, time_cost=2.2310476303100586
+
Steps: 2%|▏ | 16217/1000000 [8:13:16<3009:41:35, 11.01s/it, lr=1e-5, step_loss=0.166]
Steps: 2%|▏ | 16218/1000000 [8:13:21<2513:10:08, 9.20s/it, lr=1e-5, step_loss=0.166][RANK-0]: Step: [16218], local_loss=0.009799142368137836, train_loss=0.024258363991975784, time_cost=2.0779154300689697
+
Steps: 2%|▏ | 16218/1000000 [8:13:21<2513:10:08, 9.20s/it, lr=1e-5, step_loss=0.0098]
Steps: 2%|▏ | 16219/1000000 [8:13:34<2773:38:27, 10.15s/it, lr=1e-5, step_loss=0.0098][RANK-0]: Step: [16219], local_loss=0.040308479219675064, train_loss=0.2029668092727661, time_cost=10.445910453796387
+
Steps: 2%|▏ | 16219/1000000 [8:13:34<2773:38:27, 10.15s/it, lr=1e-5, step_loss=0.0403]
Steps: 2%|▏ | 16220/1000000 [8:13:45<2919:52:26, 10.68s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [16220], local_loss=0.04048338532447815, train_loss=0.025581955909729004, time_cost=1.2003588676452637
+
Steps: 2%|▏ | 16220/1000000 [8:13:45<2919:52:26, 10.68s/it, lr=1e-5, step_loss=0.0405]
Steps: 2%|▏ | 16221/1000000 [8:14:00<3206:06:38, 11.73s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [16221], local_loss=0.01161352638155222, train_loss=0.04301324486732483, time_cost=1.2285428047180176
+
Steps: 2%|▏ | 16221/1000000 [8:14:00<3206:06:38, 11.73s/it, lr=1e-5, step_loss=0.0116]
Steps: 2%|▏ | 16222/1000000 [8:14:06<2799:22:47, 10.24s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [16222], local_loss=0.008980568498373032, train_loss=0.010467829182744026, time_cost=1.2581150531768799
+
Steps: 2%|▏ | 16222/1000000 [8:14:06<2799:22:47, 10.24s/it, lr=1e-5, step_loss=0.00898]
Steps: 2%|▏ | 16223/1000000 [8:14:18<2921:03:58, 10.69s/it, lr=1e-5, step_loss=0.00898][RANK-0]: Step: [16223], local_loss=0.16211125254631042, train_loss=0.0809418335556984, time_cost=5.433661222457886
+
Steps: 2%|▏ | 16223/1000000 [8:14:18<2921:03:58, 10.69s/it, lr=1e-5, step_loss=0.162]
Steps: 2%|▏ | 16224/1000000 [8:14:30<3016:19:00, 11.04s/it, lr=1e-5, step_loss=0.162][RANK-0]: Step: [16224], local_loss=0.007149845361709595, train_loss=0.019405227154493332, time_cost=2.686945676803589
+
Steps: 2%|▏ | 16224/1000000 [8:14:30<3016:19:00, 11.04s/it, lr=1e-5, step_loss=0.00715]
Steps: 2%|▏ | 16225/1000000 [8:14:39<2839:51:28, 10.39s/it, lr=1e-5, step_loss=0.00715][RANK-0]: Step: [16225], local_loss=0.02251666970551014, train_loss=0.029093459248542786, time_cost=2.937347888946533
+
Steps: 2%|▏ | 16225/1000000 [8:14:39<2839:51:28, 10.39s/it, lr=1e-5, step_loss=0.0225]
Steps: 2%|▏ | 16226/1000000 [8:14:45<2453:26:17, 8.98s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [16226], local_loss=0.03797995671629906, train_loss=0.16895590722560883, time_cost=1.2851006984710693
+
Steps: 2%|▏ | 16226/1000000 [8:14:45<2453:26:17, 8.98s/it, lr=1e-5, step_loss=0.038]
Steps: 2%|▏ | 16227/1000000 [8:14:50<2139:54:25, 7.83s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [16227], local_loss=0.0930706113576889, train_loss=0.0831342414021492, time_cost=2.4418303966522217
+
Steps: 2%|▏ | 16227/1000000 [8:14:50<2139:54:25, 7.83s/it, lr=1e-5, step_loss=0.0931]
Steps: 2%|▏ | 16228/1000000 [8:14:59<2248:20:50, 8.23s/it, lr=1e-5, step_loss=0.0931][RANK-0]: Step: [16228], local_loss=0.007623282261192799, train_loss=0.04063991457223892, time_cost=2.3360440731048584
+
Steps: 2%|▏ | 16228/1000000 [8:14:59<2248:20:50, 8.23s/it, lr=1e-5, step_loss=0.00762]
Steps: 2%|▏ | 16229/1000000 [8:15:07<2232:59:09, 8.17s/it, lr=1e-5, step_loss=0.00762][RANK-0]: Step: [16229], local_loss=0.14765900373458862, train_loss=0.0484677255153656, time_cost=6.780579328536987
+
Steps: 2%|▏ | 16229/1000000 [8:15:07<2232:59:09, 8.17s/it, lr=1e-5, step_loss=0.148]
Steps: 2%|▏ | 16230/1000000 [8:15:11<1909:00:29, 6.99s/it, lr=1e-5, step_loss=0.148][RANK-0]: Step: [16230], local_loss=0.02146613970398903, train_loss=0.05414089560508728, time_cost=3.3033034801483154
+
Steps: 2%|▏ | 16230/1000000 [8:15:11<1909:00:29, 6.99s/it, lr=1e-5, step_loss=0.0215]
Steps: 2%|▏ | 16231/1000000 [8:15:20<2040:45:32, 7.47s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [16231], local_loss=0.04866751655936241, train_loss=0.023180903866887093, time_cost=4.6070473194122314
+
Steps: 2%|▏ | 16231/1000000 [8:15:20<2040:45:32, 7.47s/it, lr=1e-5, step_loss=0.0487]
Steps: 2%|▏ | 16232/1000000 [8:15:25<1874:09:52, 6.86s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [16232], local_loss=0.09022568166255951, train_loss=0.19540388882160187, time_cost=2.982661724090576
+
Steps: 2%|▏ | 16232/1000000 [8:15:25<1874:09:52, 6.86s/it, lr=1e-5, step_loss=0.0902]
Steps: 2%|▏ | 16233/1000000 [8:15:34<2048:21:16, 7.50s/it, lr=1e-5, step_loss=0.0902][RANK-0]: Step: [16233], local_loss=0.054306760430336, train_loss=0.026559438556432724, time_cost=2.710963726043701
+
Steps: 2%|▏ | 16233/1000000 [8:15:34<2048:21:16, 7.50s/it, lr=1e-5, step_loss=0.0543]
Steps: 2%|▏ | 16234/1000000 [8:15:41<1986:41:31, 7.27s/it, lr=1e-5, step_loss=0.0543][RANK-0]: Step: [16234], local_loss=0.027211982756853104, train_loss=0.0284663625061512, time_cost=1.2435176372528076
+
Steps: 2%|▏ | 16234/1000000 [8:15:41<1986:41:31, 7.27s/it, lr=1e-5, step_loss=0.0272]
Steps: 2%|▏ | 16235/1000000 [8:15:51<2224:29:04, 8.14s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [16235], local_loss=0.046840593218803406, train_loss=0.04220768064260483, time_cost=7.914327383041382
+
Steps: 2%|▏ | 16235/1000000 [8:15:51<2224:29:04, 8.14s/it, lr=1e-5, step_loss=0.0468]
Steps: 2%|▏ | 16236/1000000 [8:16:02<2422:21:06, 8.86s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [16236], local_loss=0.014866754412651062, train_loss=0.028396304696798325, time_cost=5.027791738510132
+
Steps: 2%|▏ | 16236/1000000 [8:16:02<2422:21:06, 8.86s/it, lr=1e-5, step_loss=0.0149]
Steps: 2%|▏ | 16237/1000000 [8:16:06<2031:36:50, 7.43s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [16237], local_loss=0.006172048393636942, train_loss=0.011951969936490059, time_cost=1.3663063049316406
+
Steps: 2%|▏ | 16237/1000000 [8:16:06<2031:36:50, 7.43s/it, lr=1e-5, step_loss=0.00617]
Steps: 2%|▏ | 16238/1000000 [8:16:13<2018:03:49, 7.38s/it, lr=1e-5, step_loss=0.00617][RANK-0]: Step: [16238], local_loss=0.05557180941104889, train_loss=0.04526548460125923, time_cost=3.132955312728882
+
Steps: 2%|▏ | 16238/1000000 [8:16:13<2018:03:49, 7.38s/it, lr=1e-5, step_loss=0.0556]
Steps: 2%|▏ | 16239/1000000 [8:16:20<2005:14:43, 7.34s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [16239], local_loss=0.07823928445577621, train_loss=0.030377957969903946, time_cost=1.69901442527771
+
Steps: 2%|▏ | 16239/1000000 [8:16:20<2005:14:43, 7.34s/it, lr=1e-5, step_loss=0.0782]
Steps: 2%|▏ | 16240/1000000 [8:16:25<1806:49:34, 6.61s/it, lr=1e-5, step_loss=0.0782][RANK-0]: Step: [16240], local_loss=0.01332998275756836, train_loss=0.05389130860567093, time_cost=1.9352812767028809
+
Steps: 2%|▏ | 16240/1000000 [8:16:25<1806:49:34, 6.61s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 16241/1000000 [8:16:33<1905:55:58, 6.97s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [16241], local_loss=0.05734735354781151, train_loss=0.022821184247732162, time_cost=3.8171374797821045
+
Steps: 2%|▏ | 16241/1000000 [8:16:33<1905:55:58, 6.97s/it, lr=1e-5, step_loss=0.0573]
Steps: 2%|▏ | 16242/1000000 [8:16:47<2448:40:05, 8.96s/it, lr=1e-5, step_loss=0.0573][RANK-0]: Step: [16242], local_loss=0.007490672171115875, train_loss=0.04366576299071312, time_cost=5.48535680770874
+
Steps: 2%|▏ | 16242/1000000 [8:16:47<2448:40:05, 8.96s/it, lr=1e-5, step_loss=0.00749]
Steps: 2%|▏ | 16243/1000000 [8:16:57<2583:06:51, 9.45s/it, lr=1e-5, step_loss=0.00749][RANK-0]: Step: [16243], local_loss=0.008398458361625671, train_loss=0.011572534218430519, time_cost=1.9798383712768555
+
Steps: 2%|▏ | 16243/1000000 [8:16:57<2583:06:51, 9.45s/it, lr=1e-5, step_loss=0.0084]
Steps: 2%|▏ | 16244/1000000 [8:17:08<2695:42:52, 9.86s/it, lr=1e-5, step_loss=0.0084][RANK-0]: Step: [16244], local_loss=0.009695872664451599, train_loss=0.01208257395774126, time_cost=4.121385097503662
+
Steps: 2%|▏ | 16244/1000000 [8:17:08<2695:42:52, 9.86s/it, lr=1e-5, step_loss=0.0097]
Steps: 2%|▏ | 16245/1000000 [8:17:13<2324:09:21, 8.51s/it, lr=1e-5, step_loss=0.0097][RANK-0]: Step: [16245], local_loss=0.029991041868925095, train_loss=0.05296558514237404, time_cost=1.6141431331634521
+
Steps: 2%|▏ | 16245/1000000 [8:17:13<2324:09:21, 8.51s/it, lr=1e-5, step_loss=0.03]
Steps: 2%|▏ | 16246/1000000 [8:17:21<2221:09:06, 8.13s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [16246], local_loss=0.04918090999126434, train_loss=0.0662992000579834, time_cost=3.343940258026123
+
Steps: 2%|▏ | 16246/1000000 [8:17:21<2221:09:06, 8.13s/it, lr=1e-5, step_loss=0.0492]
Steps: 2%|▏ | 16247/1000000 [8:17:34<2630:18:35, 9.63s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [16247], local_loss=0.038103312253952026, train_loss=0.052656177431344986, time_cost=1.2380664348602295
+
Steps: 2%|▏ | 16247/1000000 [8:17:34<2630:18:35, 9.63s/it, lr=1e-5, step_loss=0.0381]
Steps: 2%|▏ | 16248/1000000 [8:17:45<2782:01:51, 10.18s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [16248], local_loss=0.09510349482297897, train_loss=0.03349257633090019, time_cost=1.6046013832092285
+
Steps: 2%|▏ | 16248/1000000 [8:17:45<2782:01:51, 10.18s/it, lr=1e-5, step_loss=0.0951]
Steps: 2%|▏ | 16249/1000000 [8:17:58<3010:32:51, 11.02s/it, lr=1e-5, step_loss=0.0951][RANK-0]: Step: [16249], local_loss=0.028552962467074394, train_loss=0.02230142056941986, time_cost=4.5630552768707275
+
Steps: 2%|▏ | 16249/1000000 [8:17:58<3010:32:51, 11.02s/it, lr=1e-5, step_loss=0.0286]
Steps: 2%|▏ | 16250/1000000 [8:18:06<2776:29:51, 10.16s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [16250], local_loss=0.028299633413553238, train_loss=0.0293828584253788, time_cost=1.211294174194336
+
Steps: 2%|▏ | 16250/1000000 [8:18:06<2776:29:51, 10.16s/it, lr=1e-5, step_loss=0.0283]
Steps: 2%|▏ | 16251/1000000 [8:18:11<2311:45:04, 8.46s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [16251], local_loss=0.059631478041410446, train_loss=0.08618520200252533, time_cost=3.7416751384735107
+
Steps: 2%|▏ | 16251/1000000 [8:18:11<2311:45:04, 8.46s/it, lr=1e-5, step_loss=0.0596]
Steps: 2%|▏ | 16252/1000000 [8:18:16<2051:46:33, 7.51s/it, lr=1e-5, step_loss=0.0596][RANK-0]: Step: [16252], local_loss=0.05101566016674042, train_loss=0.08151736110448837, time_cost=1.1924998760223389
+
Steps: 2%|▏ | 16252/1000000 [8:18:16<2051:46:33, 7.51s/it, lr=1e-5, step_loss=0.051]
Steps: 2%|▏ | 16253/1000000 [8:18:31<2670:27:44, 9.77s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [16253], local_loss=0.059121206402778625, train_loss=0.1003289520740509, time_cost=12.131463766098022
+
Steps: 2%|▏ | 16253/1000000 [8:18:31<2670:27:44, 9.77s/it, lr=1e-5, step_loss=0.0591]
Steps: 2%|▏ | 16254/1000000 [8:18:39<2499:46:14, 9.15s/it, lr=1e-5, step_loss=0.0591][RANK-0]: Step: [16254], local_loss=0.057974085211753845, train_loss=0.0238355603069067, time_cost=1.4930918216705322
+
Steps: 2%|▏ | 16254/1000000 [8:18:39<2499:46:14, 9.15s/it, lr=1e-5, step_loss=0.058]
Steps: 2%|▏ | 16255/1000000 [8:18:50<2664:13:44, 9.75s/it, lr=1e-5, step_loss=0.058][RANK-0]: Step: [16255], local_loss=0.008524262346327305, train_loss=0.05924597010016441, time_cost=1.9863901138305664
+
Steps: 2%|▏ | 16255/1000000 [8:18:50<2664:13:44, 9.75s/it, lr=1e-5, step_loss=0.00852]
Steps: 2%|▏ | 16256/1000000 [8:18:54<2220:28:27, 8.13s/it, lr=1e-5, step_loss=0.00852][RANK-0]: Step: [16256], local_loss=0.0656329095363617, train_loss=0.018398266285657883, time_cost=2.5511085987091064
+
Steps: 2%|▏ | 16256/1000000 [8:18:54<2220:28:27, 8.13s/it, lr=1e-5, step_loss=0.0656]
Steps: 2%|▏ | 16257/1000000 [8:19:00<2029:04:41, 7.43s/it, lr=1e-5, step_loss=0.0656][RANK-0]: Step: [16257], local_loss=0.021821029484272003, train_loss=0.0647263154387474, time_cost=2.9116439819335938
+
Steps: 2%|▏ | 16257/1000000 [8:19:00<2029:04:41, 7.43s/it, lr=1e-5, step_loss=0.0218]
Steps: 2%|▏ | 16258/1000000 [8:19:11<2309:49:44, 8.45s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [16258], local_loss=0.6375991106033325, train_loss=0.15105710923671722, time_cost=2.6182775497436523
+
Steps: 2%|▏ | 16258/1000000 [8:19:11<2309:49:44, 8.45s/it, lr=1e-5, step_loss=0.638]
Steps: 2%|▏ | 16259/1000000 [8:19:27<2915:51:19, 10.67s/it, lr=1e-5, step_loss=0.638][RANK-0]: Step: [16259], local_loss=0.007630772888660431, train_loss=0.0349951907992363, time_cost=7.652756929397583
+
Steps: 2%|▏ | 16259/1000000 [8:19:27<2915:51:19, 10.67s/it, lr=1e-5, step_loss=0.00763]
Steps: 2%|▏ | 16260/1000000 [8:19:36<2832:04:11, 10.36s/it, lr=1e-5, step_loss=0.00763][RANK-0]: Step: [16260], local_loss=0.013547858223319054, train_loss=0.022161277011036873, time_cost=1.2000422477722168
+
Steps: 2%|▏ | 16260/1000000 [8:19:36<2832:04:11, 10.36s/it, lr=1e-5, step_loss=0.0135]
Steps: 2%|▏ | 16261/1000000 [8:19:50<3073:35:16, 11.25s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [16261], local_loss=0.012668976560235023, train_loss=0.10310151427984238, time_cost=8.50753927230835
+
Steps: 2%|▏ | 16261/1000000 [8:19:50<3073:35:16, 11.25s/it, lr=1e-5, step_loss=0.0127]
Steps: 2%|▏ | 16262/1000000 [8:20:03<3265:22:29, 11.95s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [16262], local_loss=0.08082126826047897, train_loss=0.030399808660149574, time_cost=2.16497802734375
+
Steps: 2%|▏ | 16262/1000000 [8:20:03<3265:22:29, 11.95s/it, lr=1e-5, step_loss=0.0808]
Steps: 2%|▏ | 16263/1000000 [8:20:12<3035:39:41, 11.11s/it, lr=1e-5, step_loss=0.0808][RANK-0]: Step: [16263], local_loss=0.015858974307775497, train_loss=0.08453671634197235, time_cost=1.2191410064697266
+
Steps: 2%|▏ | 16263/1000000 [8:20:12<3035:39:41, 11.11s/it, lr=1e-5, step_loss=0.0159]
Steps: 2%|▏ | 16264/1000000 [8:20:18<2558:12:20, 9.36s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [16264], local_loss=0.006694698706269264, train_loss=0.013507019728422165, time_cost=2.588317632675171
+
Steps: 2%|▏ | 16264/1000000 [8:20:18<2558:12:20, 9.36s/it, lr=1e-5, step_loss=0.00669]
Steps: 2%|▏ | 16265/1000000 [8:20:25<2357:09:07, 8.63s/it, lr=1e-5, step_loss=0.00669][RANK-0]: Step: [16265], local_loss=0.0056586842983961105, train_loss=0.019988249987363815, time_cost=2.5250487327575684
+
Steps: 2%|▏ | 16265/1000000 [8:20:25<2357:09:07, 8.63s/it, lr=1e-5, step_loss=0.00566]
Steps: 2%|▏ | 16266/1000000 [8:20:30<2081:54:54, 7.62s/it, lr=1e-5, step_loss=0.00566][RANK-0]: Step: [16266], local_loss=0.04453195258975029, train_loss=0.02569453790783882, time_cost=1.2597661018371582
+
Steps: 2%|▏ | 16266/1000000 [8:20:30<2081:54:54, 7.62s/it, lr=1e-5, step_loss=0.0445]
Steps: 2%|▏ | 16267/1000000 [8:20:43<2561:02:34, 9.37s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [16267], local_loss=0.005891142413020134, train_loss=0.017012381926178932, time_cost=11.241079330444336
+
Steps: 2%|▏ | 16267/1000000 [8:20:43<2561:02:34, 9.37s/it, lr=1e-5, step_loss=0.00589]
Steps: 2%|▏ | 16268/1000000 [8:20:49<2262:23:25, 8.28s/it, lr=1e-5, step_loss=0.00589][RANK-0]: Step: [16268], local_loss=0.007783017121255398, train_loss=0.07857038080692291, time_cost=1.3659305572509766
+
Steps: 2%|▏ | 16268/1000000 [8:20:49<2262:23:25, 8.28s/it, lr=1e-5, step_loss=0.00778]
Steps: 2%|▏ | 16269/1000000 [8:21:04<2801:09:16, 10.25s/it, lr=1e-5, step_loss=0.00778][RANK-0]: Step: [16269], local_loss=0.005264237988740206, train_loss=0.03235989809036255, time_cost=6.763069152832031
+
Steps: 2%|▏ | 16269/1000000 [8:21:04<2801:09:16, 10.25s/it, lr=1e-5, step_loss=0.00526]
Steps: 2%|▏ | 16270/1000000 [8:21:11<2562:29:17, 9.38s/it, lr=1e-5, step_loss=0.00526][RANK-0]: Step: [16270], local_loss=0.010237494483590126, train_loss=0.013171886093914509, time_cost=1.5777411460876465
+
Steps: 2%|▏ | 16270/1000000 [8:21:11<2562:29:17, 9.38s/it, lr=1e-5, step_loss=0.0102]
Steps: 2%|▏ | 16271/1000000 [8:21:16<2206:20:33, 8.07s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [16271], local_loss=0.005711267702281475, train_loss=0.019376087933778763, time_cost=2.2562096118927
+
Steps: 2%|▏ | 16271/1000000 [8:21:16<2206:20:33, 8.07s/it, lr=1e-5, step_loss=0.00571]
Steps: 2%|▏ | 16272/1000000 [8:21:21<1956:58:04, 7.16s/it, lr=1e-5, step_loss=0.00571][RANK-0]: Step: [16272], local_loss=0.009247425943613052, train_loss=0.07088156789541245, time_cost=2.171379327774048
+
Steps: 2%|▏ | 16272/1000000 [8:21:21<1956:58:04, 7.16s/it, lr=1e-5, step_loss=0.00925]
Steps: 2%|▏ | 16273/1000000 [8:21:35<2465:44:47, 9.02s/it, lr=1e-5, step_loss=0.00925][RANK-0]: Step: [16273], local_loss=0.018233468756079674, train_loss=0.0657186210155487, time_cost=5.180114030838013
+
Steps: 2%|▏ | 16273/1000000 [8:21:35<2465:44:47, 9.02s/it, lr=1e-5, step_loss=0.0182]
Steps: 2%|▏ | 16274/1000000 [8:21:40<2154:53:11, 7.89s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [16274], local_loss=0.018614575266838074, train_loss=0.021804653108119965, time_cost=1.5330467224121094
+
Steps: 2%|▏ | 16274/1000000 [8:21:40<2154:53:11, 7.89s/it, lr=1e-5, step_loss=0.0186]
Steps: 2%|▏ | 16275/1000000 [8:21:45<1887:08:25, 6.91s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [16275], local_loss=0.020724110305309296, train_loss=0.029914505779743195, time_cost=2.136601448059082
+
Steps: 2%|▏ | 16275/1000000 [8:21:45<1887:08:25, 6.91s/it, lr=1e-5, step_loss=0.0207]
Steps: 2%|▏ | 16276/1000000 [8:21:53<2048:37:34, 7.50s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [16276], local_loss=0.061850666999816895, train_loss=0.09525786340236664, time_cost=6.537730932235718
+
Steps: 2%|▏ | 16276/1000000 [8:21:53<2048:37:34, 7.50s/it, lr=1e-5, step_loss=0.0619]
Steps: 2%|▏ | 16277/1000000 [8:22:01<2031:30:45, 7.43s/it, lr=1e-5, step_loss=0.0619][RANK-0]: Step: [16277], local_loss=0.0700884684920311, train_loss=0.06110945716500282, time_cost=3.608281135559082
+
Steps: 2%|▏ | 16277/1000000 [8:22:01<2031:30:45, 7.43s/it, lr=1e-5, step_loss=0.0701]
Steps: 2%|▏ | 16278/1000000 [8:22:07<1903:29:57, 6.97s/it, lr=1e-5, step_loss=0.0701][RANK-0]: Step: [16278], local_loss=0.12101110070943832, train_loss=0.10379613935947418, time_cost=2.369347333908081
+
Steps: 2%|▏ | 16278/1000000 [8:22:07<1903:29:57, 6.97s/it, lr=1e-5, step_loss=0.121]
Steps: 2%|▏ | 16279/1000000 [8:22:13<1830:25:10, 6.70s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [16279], local_loss=1.0086370706558228, train_loss=0.18271978199481964, time_cost=1.8949320316314697
+
Steps: 2%|▏ | 16279/1000000 [8:22:13<1830:25:10, 6.70s/it, lr=1e-5, step_loss=1.01]
Steps: 2%|▏ | 16280/1000000 [8:22:20<1913:29:35, 7.00s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [16280], local_loss=0.00917469896376133, train_loss=0.5234686732292175, time_cost=3.1776304244995117
+
Steps: 2%|▏ | 16280/1000000 [8:22:20<1913:29:35, 7.00s/it, lr=1e-5, step_loss=0.00917]
Steps: 2%|▏ | 16281/1000000 [8:22:28<1927:10:11, 7.05s/it, lr=1e-5, step_loss=0.00917][RANK-0]: Step: [16281], local_loss=0.006368874106556177, train_loss=0.031555529683828354, time_cost=2.713548183441162
+
Steps: 2%|▏ | 16281/1000000 [8:22:28<1927:10:11, 7.05s/it, lr=1e-5, step_loss=0.00637]
Steps: 2%|▏ | 16282/1000000 [8:22:39<2316:59:55, 8.48s/it, lr=1e-5, step_loss=0.00637][RANK-0]: Step: [16282], local_loss=0.013697408139705658, train_loss=0.03256148472428322, time_cost=3.0437889099121094
+
Steps: 2%|▏ | 16282/1000000 [8:22:39<2316:59:55, 8.48s/it, lr=1e-5, step_loss=0.0137]
Steps: 2%|▏ | 16283/1000000 [8:22:45<2079:37:56, 7.61s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [16283], local_loss=0.009597459807991982, train_loss=0.03856891393661499, time_cost=1.5129477977752686
+
Steps: 2%|▏ | 16283/1000000 [8:22:45<2079:37:56, 7.61s/it, lr=1e-5, step_loss=0.0096]
Steps: 2%|▏ | 16284/1000000 [8:22:53<2079:32:36, 7.61s/it, lr=1e-5, step_loss=0.0096][RANK-0]: Step: [16284], local_loss=0.008783344179391861, train_loss=0.046107422560453415, time_cost=3.932722330093384
+
Steps: 2%|▏ | 16284/1000000 [8:22:53<2079:32:36, 7.61s/it, lr=1e-5, step_loss=0.00878]
Steps: 2%|▏ | 16285/1000000 [8:22:57<1845:25:41, 6.75s/it, lr=1e-5, step_loss=0.00878][RANK-0]: Step: [16285], local_loss=0.02005656436085701, train_loss=0.024728424847126007, time_cost=2.426783323287964
+
Steps: 2%|▏ | 16285/1000000 [8:22:57<1845:25:41, 6.75s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 16286/1000000 [8:23:06<2028:45:33, 7.42s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [16286], local_loss=0.03536229953169823, train_loss=0.03903356194496155, time_cost=2.539245128631592
+
Steps: 2%|▏ | 16286/1000000 [8:23:06<2028:45:33, 7.42s/it, lr=1e-5, step_loss=0.0354]
Steps: 2%|▏ | 16287/1000000 [8:23:16<2225:00:39, 8.14s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [16287], local_loss=173.03712463378906, train_loss=21.71043586730957, time_cost=1.5119273662567139
+
Steps: 2%|▏ | 16287/1000000 [8:23:16<2225:00:39, 8.14s/it, lr=1e-5, step_loss=173]
Steps: 2%|▏ | 16288/1000000 [8:23:32<2832:34:52, 10.37s/it, lr=1e-5, step_loss=173][RANK-0]: Step: [16288], local_loss=0.008396080695092678, train_loss=0.021953241899609566, time_cost=6.690174341201782
+
Steps: 2%|▏ | 16288/1000000 [8:23:32<2832:34:52, 10.37s/it, lr=1e-5, step_loss=0.0084]
Steps: 2%|▏ | 16289/1000000 [8:23:39<2551:25:19, 9.34s/it, lr=1e-5, step_loss=0.0084][RANK-0]: Step: [16289], local_loss=0.021395741030573845, train_loss=0.0825282633304596, time_cost=2.671022891998291
+
Steps: 2%|▏ | 16289/1000000 [8:23:39<2551:25:19, 9.34s/it, lr=1e-5, step_loss=0.0214]
[Training log, steps 16290–16510 of 1,000,000 (~2%), lr=1e-5, wall clock 8:23:46–8:54:54. Each step emits a tqdm progress line plus a "[RANK-0]: Step: [N], local_loss=…, train_loss=…, time_cost=…" record. local_loss mostly sits in the 0.004–0.5 range, with isolated spikes at step 16348 (≈29.6) and step 16403 (≈126); train_loss likewise spikes at steps 16317 (≈30.3), 16338 (≈23.9), 16403 (≈15.8), 16484 (≈6.6), and 16492 (≈40.3). Iteration time fluctuates between ≈5.5 and ≈12.8 s/it; per-step time_cost between ≈1.2 and ≈14.1 s.]
Steps: 2%|▏ | 16511/1000000 [8:55:08<2657:32:56, 9.73s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [16511], local_loss=0.025956999510526657, train_loss=0.06632622331380844, time_cost=4.722455978393555
+
Steps: 2%|▏ | 16511/1000000 [8:55:08<2657:32:56, 9.73s/it, lr=1e-5, step_loss=0.026]
Steps: 2%|▏ | 16512/1000000 [8:55:23<3087:44:50, 11.30s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [16512], local_loss=0.011420536786317825, train_loss=0.029861390590667725, time_cost=1.4454715251922607
+
Steps: 2%|▏ | 16512/1000000 [8:55:23<3087:44:50, 11.30s/it, lr=1e-5, step_loss=0.0114]
Steps: 2%|▏ | 16513/1000000 [8:55:27<2519:39:59, 9.22s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [16513], local_loss=0.025963420048356056, train_loss=0.042038969695568085, time_cost=2.362035036087036
+
Steps: 2%|▏ | 16513/1000000 [8:55:27<2519:39:59, 9.22s/it, lr=1e-5, step_loss=0.026]
Steps: 2%|▏ | 16514/1000000 [8:55:34<2321:19:44, 8.50s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [16514], local_loss=0.014544999226927757, train_loss=0.04162786900997162, time_cost=2.4445574283599854
+
Steps: 2%|▏ | 16514/1000000 [8:55:34<2321:19:44, 8.50s/it, lr=1e-5, step_loss=0.0145]
Steps: 2%|▏ | 16515/1000000 [8:55:45<2508:01:35, 9.18s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [16515], local_loss=0.008629035204648972, train_loss=0.02596455067396164, time_cost=1.627058506011963
+
Steps: 2%|▏ | 16515/1000000 [8:55:45<2508:01:35, 9.18s/it, lr=1e-5, step_loss=0.00863]
Steps: 2%|▏ | 16516/1000000 [8:55:52<2310:00:57, 8.46s/it, lr=1e-5, step_loss=0.00863][RANK-0]: Step: [16516], local_loss=0.1046292781829834, train_loss=0.02795344963669777, time_cost=1.2396550178527832
+
Steps: 2%|▏ | 16516/1000000 [8:55:52<2310:00:57, 8.46s/it, lr=1e-5, step_loss=0.105]
Steps: 2%|▏ | 16517/1000000 [8:56:01<2366:09:04, 8.66s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [16517], local_loss=0.02925632894039154, train_loss=0.023528052493929863, time_cost=3.746506452560425
+
Steps: 2%|▏ | 16517/1000000 [8:56:01<2366:09:04, 8.66s/it, lr=1e-5, step_loss=0.0293]
Steps: 2%|▏ | 16518/1000000 [8:56:11<2462:46:40, 9.01s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [16518], local_loss=0.016032012179493904, train_loss=0.15373419225215912, time_cost=1.5418879985809326
+
Steps: 2%|▏ | 16518/1000000 [8:56:11<2462:46:40, 9.01s/it, lr=1e-5, step_loss=0.016]
Steps: 2%|▏ | 16519/1000000 [8:56:22<2688:14:15, 9.84s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [16519], local_loss=0.018261101096868515, train_loss=0.032795198261737823, time_cost=8.456051111221313
+
Steps: 2%|▏ | 16519/1000000 [8:56:22<2688:14:15, 9.84s/it, lr=1e-5, step_loss=0.0183]
Steps: 2%|▏ | 16520/1000000 [8:56:34<2803:24:07, 10.26s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [16520], local_loss=0.03307316452264786, train_loss=0.10961587727069855, time_cost=4.285762786865234
+
Steps: 2%|▏ | 16520/1000000 [8:56:34<2803:24:07, 10.26s/it, lr=1e-5, step_loss=0.0331]
Steps: 2%|▏ | 16521/1000000 [8:56:47<3049:42:35, 11.16s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [16521], local_loss=0.017408058047294617, train_loss=0.04787615314126015, time_cost=4.328585147857666
+
Steps: 2%|▏ | 16521/1000000 [8:56:47<3049:42:35, 11.16s/it, lr=1e-5, step_loss=0.0174]
Steps: 2%|▏ | 16522/1000000 [8:57:01<3320:32:55, 12.15s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [16522], local_loss=0.006463341414928436, train_loss=0.029224971309304237, time_cost=5.420140027999878
+
Steps: 2%|▏ | 16522/1000000 [8:57:01<3320:32:55, 12.15s/it, lr=1e-5, step_loss=0.00646]
Steps: 2%|▏ | 16523/1000000 [8:57:06<2719:00:43, 9.95s/it, lr=1e-5, step_loss=0.00646][RANK-0]: Step: [16523], local_loss=0.004456819035112858, train_loss=0.02211599238216877, time_cost=1.2212433815002441
+
Steps: 2%|▏ | 16523/1000000 [8:57:06<2719:00:43, 9.95s/it, lr=1e-5, step_loss=0.00446]
Steps: 2%|▏ | 16524/1000000 [8:57:15<2597:06:34, 9.51s/it, lr=1e-5, step_loss=0.00446][RANK-0]: Step: [16524], local_loss=0.07014691829681396, train_loss=0.05097925662994385, time_cost=2.908005714416504
+
Steps: 2%|▏ | 16524/1000000 [8:57:15<2597:06:34, 9.51s/it, lr=1e-5, step_loss=0.0701]
Steps: 2%|▏ | 16525/1000000 [8:57:24<2557:27:16, 9.36s/it, lr=1e-5, step_loss=0.0701][RANK-0]: Step: [16525], local_loss=0.02765352837741375, train_loss=0.16314075887203217, time_cost=3.447986364364624
+
Steps: 2%|▏ | 16525/1000000 [8:57:24<2557:27:16, 9.36s/it, lr=1e-5, step_loss=0.0277]
Steps: 2%|▏ | 16526/1000000 [8:57:30<2267:55:08, 8.30s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [16526], local_loss=0.00864943116903305, train_loss=0.06787630915641785, time_cost=1.2559187412261963
+
Steps: 2%|▏ | 16526/1000000 [8:57:30<2267:55:08, 8.30s/it, lr=1e-5, step_loss=0.00865]
Steps: 2%|▏ | 16527/1000000 [8:57:41<2520:20:01, 9.23s/it, lr=1e-5, step_loss=0.00865][RANK-0]: Step: [16527], local_loss=0.013282369822263718, train_loss=0.023801488801836967, time_cost=2.7350974082946777
+
Steps: 2%|▏ | 16527/1000000 [8:57:41<2520:20:01, 9.23s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 16528/1000000 [8:57:51<2592:02:31, 9.49s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [16528], local_loss=0.011933885514736176, train_loss=0.050781697034835815, time_cost=4.869368314743042
+
Steps: 2%|▏ | 16528/1000000 [8:57:51<2592:02:31, 9.49s/it, lr=1e-5, step_loss=0.0119]
Steps: 2%|▏ | 16529/1000000 [8:58:01<2608:42:46, 9.55s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [16529], local_loss=0.06960220634937286, train_loss=0.06014690920710564, time_cost=3.497119188308716
+
Steps: 2%|▏ | 16529/1000000 [8:58:01<2608:42:46, 9.55s/it, lr=1e-5, step_loss=0.0696]
Steps: 2%|▏ | 16530/1000000 [8:58:06<2260:24:19, 8.27s/it, lr=1e-5, step_loss=0.0696][RANK-0]: Step: [16530], local_loss=0.005725309252738953, train_loss=0.023978669196367264, time_cost=1.4207043647766113
+
Steps: 2%|▏ | 16530/1000000 [8:58:06<2260:24:19, 8.27s/it, lr=1e-5, step_loss=0.00573]
Steps: 2%|▏ | 16531/1000000 [8:58:19<2677:39:43, 9.80s/it, lr=1e-5, step_loss=0.00573][RANK-0]: Step: [16531], local_loss=0.0072213816456496716, train_loss=0.036650728434324265, time_cost=2.213832378387451
+
Steps: 2%|▏ | 16531/1000000 [8:58:19<2677:39:43, 9.80s/it, lr=1e-5, step_loss=0.00722]
Steps: 2%|▏ | 16532/1000000 [8:58:30<2765:50:20, 10.12s/it, lr=1e-5, step_loss=0.00722][RANK-0]: Step: [16532], local_loss=0.058625396341085434, train_loss=0.02591060847043991, time_cost=4.736733675003052
+
Steps: 2%|▏ | 16532/1000000 [8:58:30<2765:50:20, 10.12s/it, lr=1e-5, step_loss=0.0586]
Steps: 2%|▏ | 16533/1000000 [8:58:36<2382:56:24, 8.72s/it, lr=1e-5, step_loss=0.0586][RANK-0]: Step: [16533], local_loss=0.06426967680454254, train_loss=0.024083111435174942, time_cost=2.7193353176116943
+
Steps: 2%|▏ | 16533/1000000 [8:58:36<2382:56:24, 8.72s/it, lr=1e-5, step_loss=0.0643]
Steps: 2%|▏ | 16534/1000000 [8:58:40<2021:55:51, 7.40s/it, lr=1e-5, step_loss=0.0643][RANK-0]: Step: [16534], local_loss=0.009639411233365536, train_loss=0.07358259707689285, time_cost=1.2093322277069092
+
Steps: 2%|▏ | 16534/1000000 [8:58:40<2021:55:51, 7.40s/it, lr=1e-5, step_loss=0.00964]
Steps: 2%|▏ | 16535/1000000 [8:58:51<2320:53:52, 8.50s/it, lr=1e-5, step_loss=0.00964][RANK-0]: Step: [16535], local_loss=0.023878732696175575, train_loss=0.0235246904194355, time_cost=3.4479072093963623
+
Steps: 2%|▏ | 16535/1000000 [8:58:51<2320:53:52, 8.50s/it, lr=1e-5, step_loss=0.0239]
Steps: 2%|▏ | 16536/1000000 [8:58:57<2069:58:26, 7.58s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [16536], local_loss=0.022280994802713394, train_loss=0.026570376008749008, time_cost=2.9922993183135986
+
Steps: 2%|▏ | 16536/1000000 [8:58:57<2069:58:26, 7.58s/it, lr=1e-5, step_loss=0.0223]
Steps: 2%|▏ | 16537/1000000 [8:59:05<2177:52:16, 7.97s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [16537], local_loss=0.029539624229073524, train_loss=0.16090530157089233, time_cost=1.285654067993164
+
Steps: 2%|▏ | 16537/1000000 [8:59:05<2177:52:16, 7.97s/it, lr=1e-5, step_loss=0.0295]
Steps: 2%|▏ | 16538/1000000 [8:59:15<2342:27:14, 8.57s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [16538], local_loss=0.020757591351866722, train_loss=0.029680486768484116, time_cost=1.3862216472625732
+
Steps: 2%|▏ | 16538/1000000 [8:59:15<2342:27:14, 8.57s/it, lr=1e-5, step_loss=0.0208]
Steps: 2%|▏ | 16539/1000000 [8:59:25<2399:24:25, 8.78s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [16539], local_loss=0.02862659841775894, train_loss=0.05949784815311432, time_cost=1.8604135513305664
+
Steps: 2%|▏ | 16539/1000000 [8:59:25<2399:24:25, 8.78s/it, lr=1e-5, step_loss=0.0286]
Steps: 2%|▏ | 16540/1000000 [8:59:38<2780:15:55, 10.18s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [16540], local_loss=0.029807167127728462, train_loss=0.03198721632361412, time_cost=7.5236616134643555
+
Steps: 2%|▏ | 16540/1000000 [8:59:38<2780:15:55, 10.18s/it, lr=1e-5, step_loss=0.0298]
Steps: 2%|▏ | 16541/1000000 [8:59:50<2931:47:06, 10.73s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [16541], local_loss=0.011953810229897499, train_loss=0.012831094674766064, time_cost=4.637241840362549
+
Steps: 2%|▏ | 16541/1000000 [8:59:50<2931:47:06, 10.73s/it, lr=1e-5, step_loss=0.012]
Steps: 2%|▏ | 16542/1000000 [9:00:00<2866:37:41, 10.49s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [16542], local_loss=0.018142012879252434, train_loss=0.0489528588950634, time_cost=2.2119550704956055
+
Steps: 2%|▏ | 16542/1000000 [9:00:00<2866:37:41, 10.49s/it, lr=1e-5, step_loss=0.0181]
Steps: 2%|▏ | 16543/1000000 [9:00:10<2792:47:13, 10.22s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [16543], local_loss=0.005766153801232576, train_loss=0.027597762644290924, time_cost=2.1034181118011475
+
Steps: 2%|▏ | 16543/1000000 [9:00:10<2792:47:13, 10.22s/it, lr=1e-5, step_loss=0.00577]
Steps: 2%|▏ | 16544/1000000 [9:00:20<2805:32:16, 10.27s/it, lr=1e-5, step_loss=0.00577][RANK-0]: Step: [16544], local_loss=0.007146378979086876, train_loss=0.029526546597480774, time_cost=1.6826138496398926
+
Steps: 2%|▏ | 16544/1000000 [9:00:20<2805:32:16, 10.27s/it, lr=1e-5, step_loss=0.00715]
Steps: 2%|▏ | 16545/1000000 [9:00:31<2861:42:20, 10.48s/it, lr=1e-5, step_loss=0.00715][RANK-0]: Step: [16545], local_loss=0.020864451304078102, train_loss=0.016620784997940063, time_cost=1.234689712524414
+
Steps: 2%|▏ | 16545/1000000 [9:00:31<2861:42:20, 10.48s/it, lr=1e-5, step_loss=0.0209]
Steps: 2%|▏ | 16546/1000000 [9:00:37<2503:15:25, 9.16s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [16546], local_loss=0.10683639347553253, train_loss=0.049053192138671875, time_cost=1.828026533126831
+
Steps: 2%|▏ | 16546/1000000 [9:00:37<2503:15:25, 9.16s/it, lr=1e-5, step_loss=0.107]
Steps: 2%|▏ | 16547/1000000 [9:00:48<2649:33:17, 9.70s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [16547], local_loss=0.034382414072752, train_loss=0.026055753231048584, time_cost=3.5869858264923096
+
Steps: 2%|▏ | 16547/1000000 [9:00:48<2649:33:17, 9.70s/it, lr=1e-5, step_loss=0.0344]
Steps: 2%|▏ | 16548/1000000 [9:00:52<2212:28:32, 8.10s/it, lr=1e-5, step_loss=0.0344][RANK-0]: Step: [16548], local_loss=0.2980528175830841, train_loss=0.08251064270734787, time_cost=1.6131713390350342
+
Steps: 2%|▏ | 16548/1000000 [9:00:52<2212:28:32, 8.10s/it, lr=1e-5, step_loss=0.298]
Steps: 2%|▏ | 16549/1000000 [9:00:57<1959:49:21, 7.17s/it, lr=1e-5, step_loss=0.298][RANK-0]: Step: [16549], local_loss=0.05636567249894142, train_loss=0.040541451424360275, time_cost=1.953810453414917
+
Steps: 2%|▏ | 16549/1000000 [9:00:57<1959:49:21, 7.17s/it, lr=1e-5, step_loss=0.0564]
Steps: 2%|▏ | 16550/1000000 [9:01:09<2288:05:50, 8.38s/it, lr=1e-5, step_loss=0.0564][RANK-0]: Step: [16550], local_loss=0.16472847759723663, train_loss=0.06168035417795181, time_cost=2.8597025871276855
+
Steps: 2%|▏ | 16550/1000000 [9:01:09<2288:05:50, 8.38s/it, lr=1e-5, step_loss=0.165]
Steps: 2%|▏ | 16551/1000000 [9:01:20<2507:26:09, 9.18s/it, lr=1e-5, step_loss=0.165][RANK-0]: Step: [16551], local_loss=0.016262978315353394, train_loss=0.09330245107412338, time_cost=3.0314440727233887
+
Steps: 2%|▏ | 16551/1000000 [9:01:20<2507:26:09, 9.18s/it, lr=1e-5, step_loss=0.0163]
Steps: 2%|▏ | 16552/1000000 [9:01:31<2668:09:10, 9.77s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [16552], local_loss=0.027280349284410477, train_loss=11.285633087158203, time_cost=2.8796026706695557
+
Steps: 2%|▏ | 16552/1000000 [9:01:31<2668:09:10, 9.77s/it, lr=1e-5, step_loss=0.0273]
Steps: 2%|▏ | 16553/1000000 [9:01:42<2810:10:03, 10.29s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [16553], local_loss=0.007810753770172596, train_loss=0.03386532515287399, time_cost=4.0428173542022705
+
Steps: 2%|▏ | 16553/1000000 [9:01:42<2810:10:03, 10.29s/it, lr=1e-5, step_loss=0.00781]
Steps: 2%|▏ | 16554/1000000 [9:01:56<3058:13:45, 11.19s/it, lr=1e-5, step_loss=0.00781][RANK-0]: Step: [16554], local_loss=0.024820946156978607, train_loss=0.09633408486843109, time_cost=4.206796646118164
+
Steps: 2%|▏ | 16554/1000000 [9:01:56<3058:13:45, 11.19s/it, lr=1e-5, step_loss=0.0248]
Steps: 2%|▏ | 16555/1000000 [9:02:06<3015:15:42, 11.04s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [16555], local_loss=0.01592695154249668, train_loss=0.03221855312585831, time_cost=3.440464735031128
+
Steps: 2%|▏ | 16555/1000000 [9:02:06<3015:15:42, 11.04s/it, lr=1e-5, step_loss=0.0159]
Steps: 2%|▏ | 16556/1000000 [9:02:13<2661:49:29, 9.74s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [16556], local_loss=0.00890253484249115, train_loss=0.028194498270750046, time_cost=2.1847968101501465
+
Steps: 2%|▏ | 16556/1000000 [9:02:13<2661:49:29, 9.74s/it, lr=1e-5, step_loss=0.0089]
Steps: 2%|▏ | 16557/1000000 [9:02:19<2366:05:24, 8.66s/it, lr=1e-5, step_loss=0.0089][RANK-0]: Step: [16557], local_loss=0.03506670147180557, train_loss=0.04386023432016373, time_cost=2.1978602409362793
+
Steps: 2%|▏ | 16557/1000000 [9:02:19<2366:05:24, 8.66s/it, lr=1e-5, step_loss=0.0351]
Steps: 2%|▏ | 16558/1000000 [9:02:30<2583:08:20, 9.46s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [16558], local_loss=0.02716142125427723, train_loss=0.15003079175949097, time_cost=2.000690221786499
+
Steps: 2%|▏ | 16558/1000000 [9:02:30<2583:08:20, 9.46s/it, lr=1e-5, step_loss=0.0272]
Steps: 2%|▏ | 16559/1000000 [9:02:35<2173:51:36, 7.96s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [16559], local_loss=0.010104052722454071, train_loss=0.152497336268425, time_cost=1.2472434043884277
+
Steps: 2%|▏ | 16559/1000000 [9:02:35<2173:51:36, 7.96s/it, lr=1e-5, step_loss=0.0101]
Steps: 2%|▏ | 16560/1000000 [9:02:44<2255:02:24, 8.25s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [16560], local_loss=0.15011471509933472, train_loss=0.051215216517448425, time_cost=2.8954765796661377
+
Steps: 2%|▏ | 16560/1000000 [9:02:44<2255:02:24, 8.25s/it, lr=1e-5, step_loss=0.15]
Steps: 2%|▏ | 16561/1000000 [9:02:55<2464:58:42, 9.02s/it, lr=1e-5, step_loss=0.15][RANK-0]: Step: [16561], local_loss=0.010096585378050804, train_loss=0.03330305963754654, time_cost=1.3800530433654785
+
Steps: 2%|▏ | 16561/1000000 [9:02:55<2464:58:42, 9.02s/it, lr=1e-5, step_loss=0.0101]
Steps: 2%|▏ | 16562/1000000 [9:03:01<2227:55:56, 8.16s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [16562], local_loss=0.030331352725625038, train_loss=0.021074680611491203, time_cost=1.9820926189422607
+
Steps: 2%|▏ | 16562/1000000 [9:03:01<2227:55:56, 8.16s/it, lr=1e-5, step_loss=0.0303]
Steps: 2%|▏ | 16563/1000000 [9:03:08<2125:27:31, 7.78s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [16563], local_loss=0.09584031999111176, train_loss=0.03810799494385719, time_cost=1.4542624950408936
+
Steps: 2%|▏ | 16563/1000000 [9:03:08<2125:27:31, 7.78s/it, lr=1e-5, step_loss=0.0958]
Steps: 2%|▏ | 16564/1000000 [9:03:19<2402:26:12, 8.79s/it, lr=1e-5, step_loss=0.0958][RANK-0]: Step: [16564], local_loss=0.01731560006737709, train_loss=0.05784224718809128, time_cost=3.2131330966949463
+
Steps: 2%|▏ | 16564/1000000 [9:03:19<2402:26:12, 8.79s/it, lr=1e-5, step_loss=0.0173]
Steps: 2%|▏ | 16565/1000000 [9:03:25<2179:59:29, 7.98s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [16565], local_loss=0.05288261920213699, train_loss=0.051271431148052216, time_cost=1.3449904918670654
+
Steps: 2%|▏ | 16565/1000000 [9:03:25<2179:59:29, 7.98s/it, lr=1e-5, step_loss=0.0529]
Steps: 2%|▏ | 16566/1000000 [9:03:36<2470:31:16, 9.04s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [16566], local_loss=0.05910228192806244, train_loss=0.025825202465057373, time_cost=2.929262638092041
+
Steps: 2%|▏ | 16566/1000000 [9:03:36<2470:31:16, 9.04s/it, lr=1e-5, step_loss=0.0591]
Steps: 2%|▏ | 16567/1000000 [9:03:47<2609:14:54, 9.55s/it, lr=1e-5, step_loss=0.0591][RANK-0]: Step: [16567], local_loss=0.007132761646062136, train_loss=0.009929566644132137, time_cost=1.520211935043335
+
Steps: 2%|▏ | 16567/1000000 [9:03:47<2609:14:54, 9.55s/it, lr=1e-5, step_loss=0.00713]
Steps: 2%|▏ | 16568/1000000 [9:03:57<2608:23:43, 9.55s/it, lr=1e-5, step_loss=0.00713][RANK-0]: Step: [16568], local_loss=0.048416003584861755, train_loss=0.038419853895902634, time_cost=3.9953863620758057
+
Steps: 2%|▏ | 16568/1000000 [9:03:57<2608:23:43, 9.55s/it, lr=1e-5, step_loss=0.0484]
Steps: 2%|▏ | 16569/1000000 [9:04:07<2633:37:30, 9.64s/it, lr=1e-5, step_loss=0.0484][RANK-0]: Step: [16569], local_loss=0.033714670687913895, train_loss=0.031983185559511185, time_cost=3.5345873832702637
+
Steps: 2%|▏ | 16569/1000000 [9:04:07<2633:37:30, 9.64s/it, lr=1e-5, step_loss=0.0337]
Steps: 2%|▏ | 16570/1000000 [9:04:17<2695:18:26, 9.87s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [16570], local_loss=0.009059589356184006, train_loss=0.018073944374918938, time_cost=5.032722473144531
+
Steps: 2%|▏ | 16570/1000000 [9:04:17<2695:18:26, 9.87s/it, lr=1e-5, step_loss=0.00906]
Steps: 2%|▏ | 16571/1000000 [9:04:22<2295:26:57, 8.40s/it, lr=1e-5, step_loss=0.00906][RANK-0]: Step: [16571], local_loss=0.014067311771214008, train_loss=0.041636236011981964, time_cost=1.9521350860595703
+
Steps: 2%|▏ | 16571/1000000 [9:04:22<2295:26:57, 8.40s/it, lr=1e-5, step_loss=0.0141]
Steps: 2%|▏ | 16572/1000000 [9:04:32<2401:11:08, 8.79s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [16572], local_loss=0.2085777223110199, train_loss=0.059039875864982605, time_cost=2.14787220954895
+
Steps: 2%|▏ | 16572/1000000 [9:04:32<2401:11:08, 8.79s/it, lr=1e-5, step_loss=0.209]
Steps: 2%|▏ | 16573/1000000 [9:04:44<2668:04:51, 9.77s/it, lr=1e-5, step_loss=0.209][RANK-0]: Step: [16573], local_loss=0.0066462317481637, train_loss=0.07010763138532639, time_cost=4.5936667919158936
+
Steps: 2%|▏ | 16573/1000000 [9:04:44<2668:04:51, 9.77s/it, lr=1e-5, step_loss=0.00665]
Steps: 2%|▏ | 16574/1000000 [9:04:55<2779:47:45, 10.18s/it, lr=1e-5, step_loss=0.00665][RANK-0]: Step: [16574], local_loss=0.05511738732457161, train_loss=0.06478306651115417, time_cost=4.236186265945435
+
Steps: 2%|▏ | 16574/1000000 [9:04:55<2779:47:45, 10.18s/it, lr=1e-5, step_loss=0.0551]
Steps: 2%|▏ | 16575/1000000 [9:05:03<2635:15:18, 9.65s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [16575], local_loss=0.13052824139595032, train_loss=0.045800670981407166, time_cost=2.666381359100342
+
Steps: 2%|▏ | 16575/1000000 [9:05:03<2635:15:18, 9.65s/it, lr=1e-5, step_loss=0.131]
Steps: 2%|▏ | 16576/1000000 [9:05:18<3079:02:56, 11.27s/it, lr=1e-5, step_loss=0.131][RANK-0]: Step: [16576], local_loss=0.010658731684088707, train_loss=0.03682821989059448, time_cost=5.384739398956299
+
Steps: 2%|▏ | 16576/1000000 [9:05:18<3079:02:56, 11.27s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 16577/1000000 [9:05:32<3261:49:10, 11.94s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [16577], local_loss=0.1330166608095169, train_loss=0.03987891972064972, time_cost=5.2435462474823
+
Steps: 2%|▏ | 16577/1000000 [9:05:32<3261:49:10, 11.94s/it, lr=1e-5, step_loss=0.133]
Steps: 2%|▏ | 16578/1000000 [9:05:44<3287:31:12, 12.03s/it, lr=1e-5, step_loss=0.133][RANK-0]: Step: [16578], local_loss=0.013139823451638222, train_loss=0.15892378985881805, time_cost=5.539158344268799
+
Steps: 2%|▏ | 16578/1000000 [9:05:44<3287:31:12, 12.03s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 16579/1000000 [9:05:49<2718:01:19, 9.95s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [16579], local_loss=0.006938554346561432, train_loss=0.012277822941541672, time_cost=2.6047985553741455
+
Steps: 2%|▏ | 16579/1000000 [9:05:49<2718:01:19, 9.95s/it, lr=1e-5, step_loss=0.00694]
Steps: 2%|▏ | 16580/1000000 [9:05:54<2333:37:08, 8.54s/it, lr=1e-5, step_loss=0.00694][RANK-0]: Step: [16580], local_loss=0.023409688845276833, train_loss=0.15581536293029785, time_cost=1.4547407627105713
+
Steps: 2%|▏ | 16580/1000000 [9:05:54<2333:37:08, 8.54s/it, lr=1e-5, step_loss=0.0234]
Steps: 2%|▏ | 16581/1000000 [9:06:00<2053:06:57, 7.52s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [16581], local_loss=0.06666603684425354, train_loss=0.0688629299402237, time_cost=2.165989637374878
+
Steps: 2%|▏ | 16581/1000000 [9:06:00<2053:06:57, 7.52s/it, lr=1e-5, step_loss=0.0667]
Steps: 2%|▏ | 16582/1000000 [9:06:05<1869:13:16, 6.84s/it, lr=1e-5, step_loss=0.0667][RANK-0]: Step: [16582], local_loss=0.006667892914265394, train_loss=0.09920036792755127, time_cost=2.289513111114502
+
Steps: 2%|▏ | 16582/1000000 [9:06:05<1869:13:16, 6.84s/it, lr=1e-5, step_loss=0.00667]
Steps: 2%|▏ | 16583/1000000 [9:06:18<2387:38:35, 8.74s/it, lr=1e-5, step_loss=0.00667][RANK-0]: Step: [16583], local_loss=0.007264938671141863, train_loss=0.08146354556083679, time_cost=6.767140626907349
+
Steps: 2%|▏ | 16583/1000000 [9:06:18<2387:38:35, 8.74s/it, lr=1e-5, step_loss=0.00726]
Steps: 2%|▏ | 16584/1000000 [9:06:23<2044:41:30, 7.49s/it, lr=1e-5, step_loss=0.00726][RANK-0]: Step: [16584], local_loss=0.1504921317100525, train_loss=0.03217952698469162, time_cost=1.4722371101379395
+
Steps: 2%|▏ | 16584/1000000 [9:06:23<2044:41:30, 7.49s/it, lr=1e-5, step_loss=0.15]
Steps: 2%|▏ | 16585/1000000 [9:06:32<2201:25:19, 8.06s/it, lr=1e-5, step_loss=0.15][RANK-0]: Step: [16585], local_loss=0.38254591822624207, train_loss=0.09887725114822388, time_cost=2.289123296737671
+
Steps: 2%|▏ | 16585/1000000 [9:06:32<2201:25:19, 8.06s/it, lr=1e-5, step_loss=0.383]
Steps: 2%|▏ | 16586/1000000 [9:06:45<2587:01:21, 9.47s/it, lr=1e-5, step_loss=0.383][RANK-0]: Step: [16586], local_loss=0.0025932725984603167, train_loss=0.7296340465545654, time_cost=3.5906665325164795
+
Steps: 2%|▏ | 16586/1000000 [9:06:45<2587:01:21, 9.47s/it, lr=1e-5, step_loss=0.00259]
Steps: 2%|▏ | 16587/1000000 [9:06:49<2169:52:01, 7.94s/it, lr=1e-5, step_loss=0.00259][RANK-0]: Step: [16587], local_loss=0.017503630369901657, train_loss=0.014440642669796944, time_cost=1.410989761352539
+
Steps: 2%|▏ | 16587/1000000 [9:06:49<2169:52:01, 7.94s/it, lr=1e-5, step_loss=0.0175]
Steps: 2%|▏ | 16588/1000000 [9:07:00<2408:49:32, 8.82s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [16588], local_loss=0.03708484396338463, train_loss=0.03950635343790054, time_cost=7.2039079666137695
+
Steps: 2%|▏ | 16588/1000000 [9:07:00<2408:49:32, 8.82s/it, lr=1e-5, step_loss=0.0371]
Steps: 2%|▏ | 16589/1000000 [9:07:13<2744:15:59, 10.05s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [16589], local_loss=0.006414806004613638, train_loss=0.011643560603260994, time_cost=5.734370470046997
+
Steps: 2%|▏ | 16589/1000000 [9:07:13<2744:15:59, 10.05s/it, lr=1e-5, step_loss=0.00641]
Steps: 2%|▏ | 16590/1000000 [9:07:20<2508:40:19, 9.18s/it, lr=1e-5, step_loss=0.00641][RANK-0]: Step: [16590], local_loss=0.007499582599848509, train_loss=0.018173428252339363, time_cost=1.2241246700286865
+
Steps: 2%|▏ | 16590/1000000 [9:07:20<2508:40:19, 9.18s/it, lr=1e-5, step_loss=0.0075]
Steps: 2%|▏ | 16591/1000000 [9:07:27<2363:13:44, 8.65s/it, lr=1e-5, step_loss=0.0075][RANK-0]: Step: [16591], local_loss=0.07371185719966888, train_loss=0.037418365478515625, time_cost=1.9501557350158691
+
Steps: 2%|▏ | 16591/1000000 [9:07:27<2363:13:44, 8.65s/it, lr=1e-5, step_loss=0.0737]
Steps: 2%|▏ | 16592/1000000 [9:07:36<2335:11:31, 8.55s/it, lr=1e-5, step_loss=0.0737][RANK-0]: Step: [16592], local_loss=0.04337455704808235, train_loss=0.14249807596206665, time_cost=4.303277254104614
+
Steps: 2%|▏ | 16592/1000000 [9:07:36<2335:11:31, 8.55s/it, lr=1e-5, step_loss=0.0434]
Steps: 2%|▏ | 16593/1000000 [9:07:41<2040:18:02, 7.47s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [16593], local_loss=0.03773622587323189, train_loss=0.055390097200870514, time_cost=1.8481316566467285
+
Steps: 2%|▏ | 16593/1000000 [9:07:41<2040:18:02, 7.47s/it, lr=1e-5, step_loss=0.0377]
Steps: 2%|▏ | 16594/1000000 [9:07:48<2000:27:23, 7.32s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [16594], local_loss=0.005405053496360779, train_loss=0.017990652471780777, time_cost=2.6173365116119385
+
Steps: 2%|▏ | 16594/1000000 [9:07:48<2000:27:23, 7.32s/it, lr=1e-5, step_loss=0.00541]
Steps: 2%|▏ | 16595/1000000 [9:07:55<1990:17:43, 7.29s/it, lr=1e-5, step_loss=0.00541][RANK-0]: Step: [16595], local_loss=0.024012580513954163, train_loss=0.1060660108923912, time_cost=1.6897785663604736
+
Steps: 2%|▏ | 16595/1000000 [9:07:55<1990:17:43, 7.29s/it, lr=1e-5, step_loss=0.024]
Steps: 2%|▏ | 16596/1000000 [9:08:08<2469:43:38, 9.04s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [16596], local_loss=0.021727493032813072, train_loss=0.1780611276626587, time_cost=4.4066267013549805
+
Steps: 2%|▏ | 16596/1000000 [9:08:08<2469:43:38, 9.04s/it, lr=1e-5, step_loss=0.0217]
Steps: 2%|▏ | 16597/1000000 [9:08:19<2596:04:43, 9.50s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [16597], local_loss=0.017690829932689667, train_loss=0.012943247333168983, time_cost=1.5800511837005615
+
Steps: 2%|▏ | 16597/1000000 [9:08:19<2596:04:43, 9.50s/it, lr=1e-5, step_loss=0.0177]
Steps: 2%|▏ | 16598/1000000 [9:08:24<2239:28:12, 8.20s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [16598], local_loss=0.030843645334243774, train_loss=0.03838493302464485, time_cost=2.1072323322296143
+
Steps: 2%|▏ | 16598/1000000 [9:08:24<2239:28:12, 8.20s/it, lr=1e-5, step_loss=0.0308]
Steps: 2%|▏ | 16599/1000000 [9:08:31<2121:28:23, 7.77s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [16599], local_loss=0.05692361667752266, train_loss=0.043133124709129333, time_cost=2.9639475345611572
+
Steps: 2%|▏ | 16599/1000000 [9:08:31<2121:28:23, 7.77s/it, lr=1e-5, step_loss=0.0569]
Steps: 2%|▏ | 16600/1000000 [9:08:36<1964:50:29, 7.19s/it, lr=1e-5, step_loss=0.0569][RANK-0]: Step: [16600], local_loss=0.004125652369111776, train_loss=2.596796751022339, time_cost=1.79073166847229
+
Steps: 2%|▏ | 16600/1000000 [9:08:36<1964:50:29, 7.19s/it, lr=1e-5, step_loss=0.00413]
Steps: 2%|▏ | 16601/1000000 [9:08:49<2438:05:33, 8.93s/it, lr=1e-5, step_loss=0.00413][RANK-0]: Step: [16601], local_loss=0.004189382307231426, train_loss=0.09160975366830826, time_cost=6.6856982707977295
+
Steps: 2%|▏ | 16601/1000000 [9:08:49<2438:05:33, 8.93s/it, lr=1e-5, step_loss=0.00419]
Steps: 2%|▏ | 16602/1000000 [9:08:57<2327:04:17, 8.52s/it, lr=1e-5, step_loss=0.00419][RANK-0]: Step: [16602], local_loss=0.0102003775537014, train_loss=0.019350629299879074, time_cost=3.0089573860168457
+
Steps: 2%|▏ | 16602/1000000 [9:08:57<2327:04:17, 8.52s/it, lr=1e-5, step_loss=0.0102]
Steps: 2%|▏ | 16603/1000000 [9:09:09<2588:00:47, 9.47s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [16603], local_loss=0.0058608525432646275, train_loss=29.500852584838867, time_cost=1.2130327224731445
+
Steps: 2%|▏ | 16603/1000000 [9:09:09<2588:00:47, 9.47s/it, lr=1e-5, step_loss=0.00586]
Steps: 2%|▏ | 16604/1000000 [9:09:21<2817:21:14, 10.31s/it, lr=1e-5, step_loss=0.00586][RANK-0]: Step: [16604], local_loss=0.009992925450205803, train_loss=0.043974678963422775, time_cost=6.236603021621704
+
Steps: 2%|▏ | 16604/1000000 [9:09:21<2817:21:14, 10.31s/it, lr=1e-5, step_loss=0.00999]
Steps: 2%|▏ | 16605/1000000 [9:09:30<2741:13:43, 10.04s/it, lr=1e-5, step_loss=0.00999][RANK-0]: Step: [16605], local_loss=0.011517305858433247, train_loss=0.02985193207859993, time_cost=3.2151553630828857
+
Steps: 2%|▏ | 16605/1000000 [9:09:30<2741:13:43, 10.04s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 16606/1000000 [9:09:35<2279:47:04, 8.35s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [16606], local_loss=0.06662202626466751, train_loss=0.07039643079042435, time_cost=1.2535033226013184
+
Steps: 2%|▏ | 16606/1000000 [9:09:35<2279:47:04, 8.35s/it, lr=1e-5, step_loss=0.0666]
Steps: 2%|▏ | 16607/1000000 [9:09:42<2221:23:55, 8.13s/it, lr=1e-5, step_loss=0.0666][RANK-0]: Step: [16607], local_loss=0.30198246240615845, train_loss=0.06182170286774635, time_cost=3.421272039413452
+
Steps: 2%|▏ | 16607/1000000 [9:09:42<2221:23:55, 8.13s/it, lr=1e-5, step_loss=0.302]
Steps: 2%|▏ | 16608/1000000 [9:09:53<2464:19:29, 9.02s/it, lr=1e-5, step_loss=0.302][RANK-0]: Step: [16608], local_loss=0.008782129734754562, train_loss=0.024249214679002762, time_cost=2.3010857105255127
+
Steps: 2%|▏ | 16608/1000000 [9:09:53<2464:19:29, 9.02s/it, lr=1e-5, step_loss=0.00878]
Steps: 2%|▏ | 16609/1000000 [9:10:03<2492:49:21, 9.13s/it, lr=1e-5, step_loss=0.00878][RANK-0]: Step: [16609], local_loss=0.009203793480992317, train_loss=0.1345265805721283, time_cost=8.000870704650879
+
Steps: 2%|▏ | 16609/1000000 [9:10:03<2492:49:21, 9.13s/it, lr=1e-5, step_loss=0.0092]
Steps: 2%|▏ | 16610/1000000 [9:10:07<2106:59:31, 7.71s/it, lr=1e-5, step_loss=0.0092][RANK-0]: Step: [16610], local_loss=0.056258972734212875, train_loss=0.08370032906532288, time_cost=1.4058396816253662
+
Steps: 2%|▏ | 16610/1000000 [9:10:07<2106:59:31, 7.71s/it, lr=1e-5, step_loss=0.0563]
Steps: 2%|▏ | 16611/1000000 [9:10:18<2378:28:55, 8.71s/it, lr=1e-5, step_loss=0.0563][RANK-0]: Step: [16611], local_loss=0.04233166202902794, train_loss=0.02504931017756462, time_cost=8.561306715011597
+
Steps: 2%|▏ | 16611/1000000 [9:10:18<2378:28:55, 8.71s/it, lr=1e-5, step_loss=0.0423]
Steps: 2%|▏ | 16612/1000000 [9:10:27<2384:56:15, 8.73s/it, lr=1e-5, step_loss=0.0423][RANK-0]: Step: [16612], local_loss=0.025263654068112373, train_loss=0.01818104088306427, time_cost=5.296993017196655
+
Steps: 2%|▏ | 16612/1000000 [9:10:27<2384:56:15, 8.73s/it, lr=1e-5, step_loss=0.0253]
Steps: 2%|▏ | 16613/1000000 [9:10:32<2075:37:15, 7.60s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [16613], local_loss=0.02501353807747364, train_loss=0.059804871678352356, time_cost=2.2140800952911377
+
Steps: 2%|▏ | 16613/1000000 [9:10:32<2075:37:15, 7.60s/it, lr=1e-5, step_loss=0.025]
Steps: 2%|▏ | 16614/1000000 [9:10:43<2343:05:56, 8.58s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [16614], local_loss=0.017290713265538216, train_loss=0.09941120445728302, time_cost=1.9396913051605225
+
Steps: 2%|▏ | 16614/1000000 [9:10:43<2343:05:56, 8.58s/it, lr=1e-5, step_loss=0.0173]
Steps: 2%|▏ | 16615/1000000 [9:10:54<2566:59:12, 9.40s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [16615], local_loss=0.007446499075740576, train_loss=0.03691454976797104, time_cost=4.510180234909058
+
Steps: 2%|▏ | 16615/1000000 [9:10:54<2566:59:12, 9.40s/it, lr=1e-5, step_loss=0.00745]
Steps: 2%|▏ | 16616/1000000 [9:10:59<2198:20:28, 8.05s/it, lr=1e-5, step_loss=0.00745][RANK-0]: Step: [16616], local_loss=0.006581632420420647, train_loss=0.026716426014900208, time_cost=2.167482614517212
+
Steps: 2%|▏ | 16616/1000000 [9:10:59<2198:20:28, 8.05s/it, lr=1e-5, step_loss=0.00658]
Steps: 2%|▏ | 16617/1000000 [9:11:08<2243:51:38, 8.21s/it, lr=1e-5, step_loss=0.00658][RANK-0]: Step: [16617], local_loss=0.005956551060080528, train_loss=0.050109390169382095, time_cost=2.7155580520629883
+
Steps: 2%|▏ | 16617/1000000 [9:11:08<2243:51:38, 8.21s/it, lr=1e-5, step_loss=0.00596]
Steps: 2%|▏ | 16618/1000000 [9:11:19<2475:48:39, 9.06s/it, lr=1e-5, step_loss=0.00596][RANK-0]: Step: [16618], local_loss=0.004511602688580751, train_loss=0.08238524198532104, time_cost=3.543886661529541
+
Steps: 2%|▏ | 16618/1000000 [9:11:19<2475:48:39, 9.06s/it, lr=1e-5, step_loss=0.00451]
Steps: 2%|▏ | 16619/1000000 [9:11:26<2331:31:26, 8.54s/it, lr=1e-5, step_loss=0.00451][RANK-0]: Step: [16619], local_loss=0.009398188441991806, train_loss=0.03518550097942352, time_cost=1.500929594039917
+
Steps: 2%|▏ | 16619/1000000 [9:11:26<2331:31:26, 8.54s/it, lr=1e-5, step_loss=0.0094]
Steps: 2%|▏ | 16620/1000000 [9:11:41<2867:46:57, 10.50s/it, lr=1e-5, step_loss=0.0094][RANK-0]: Step: [16620], local_loss=0.017429815605282784, train_loss=0.10092286765575409, time_cost=6.9320056438446045
+
Steps: 2%|▏ | 16620/1000000 [9:11:41<2867:46:57, 10.50s/it, lr=1e-5, step_loss=0.0174]
Steps: 2%|▏ | 16621/1000000 [9:11:52<2867:55:39, 10.50s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [16621], local_loss=0.007026893552392721, train_loss=0.07602399587631226, time_cost=4.927107095718384
+
Steps: 2%|▏ | 16621/1000000 [9:11:52<2867:55:39, 10.50s/it, lr=1e-5, step_loss=0.00703]
Steps: 2%|▏ | 16622/1000000 [9:12:08<3344:28:23, 12.24s/it, lr=1e-5, step_loss=0.00703][RANK-0]: Step: [16622], local_loss=0.052174344658851624, train_loss=0.04964600130915642, time_cost=8.070544242858887
+
Steps: 2%|▏ | 16622/1000000 [9:12:08<3344:28:23, 12.24s/it, lr=1e-5, step_loss=0.0522]
Steps: 2%|▏ | 16623/1000000 [9:12:17<3073:16:51, 11.25s/it, lr=1e-5, step_loss=0.0522][RANK-0]: Step: [16623], local_loss=0.004992371425032616, train_loss=0.024133581668138504, time_cost=4.812073469161987
+
Steps: 2%|▏ | 16623/1000000 [9:12:17<3073:16:51, 11.25s/it, lr=1e-5, step_loss=0.00499]
Steps: 2%|▏ | 16624/1000000 [9:12:24<2754:44:37, 10.08s/it, lr=1e-5, step_loss=0.00499][RANK-0]: Step: [16624], local_loss=0.025469832122325897, train_loss=0.031152775511145592, time_cost=1.2109792232513428
+
Steps: 2%|▏ | 16624/1000000 [9:12:24<2754:44:37, 10.08s/it, lr=1e-5, step_loss=0.0255]
Steps: 2%|▏ | 16625/1000000 [9:12:33<2631:05:26, 9.63s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [16625], local_loss=269.57611083984375, train_loss=33.77745819091797, time_cost=1.4275588989257812
+
Steps: 2%|▏ | 16625/1000000 [9:12:33<2631:05:26, 9.63s/it, lr=1e-5, step_loss=270]
Steps: 2%|▏ | 16626/1000000 [9:12:46<2937:26:59, 10.75s/it, lr=1e-5, step_loss=270][RANK-0]: Step: [16626], local_loss=0.0869237557053566, train_loss=0.033627886325120926, time_cost=4.100157260894775
+
Steps: 2%|▏ | 16626/1000000 [9:12:46<2937:26:59, 10.75s/it, lr=1e-5, step_loss=0.0869]
Steps: 2%|▏ | 16627/1000000 [9:12:54<2665:31:32, 9.76s/it, lr=1e-5, step_loss=0.0869][RANK-0]: Step: [16627], local_loss=0.008792960084974766, train_loss=0.13813090324401855, time_cost=3.1124467849731445
+
Steps: 2%|▏ | 16627/1000000 [9:12:54<2665:31:32, 9.76s/it, lr=1e-5, step_loss=0.00879]
Steps: 2%|▏ | 16628/1000000 [9:13:06<2848:17:42, 10.43s/it, lr=1e-5, step_loss=0.00879][RANK-0]: Step: [16628], local_loss=0.03774338588118553, train_loss=0.043251883238554, time_cost=4.743778705596924
+
Steps: 2%|▏ | 16628/1000000 [9:13:06<2848:17:42, 10.43s/it, lr=1e-5, step_loss=0.0377]
Steps: 2%|▏ | 16629/1000000 [9:13:19<3090:44:33, 11.31s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [16629], local_loss=0.018916480243206024, train_loss=0.015213176608085632, time_cost=4.6518473625183105
+
Steps: 2%|▏ | 16629/1000000 [9:13:19<3090:44:33, 11.31s/it, lr=1e-5, step_loss=0.0189]
Steps: 2%|▏ | 16630/1000000 [9:13:26<2753:19:39, 10.08s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [16630], local_loss=0.007431524805724621, train_loss=0.14454299211502075, time_cost=2.739259719848633
+
Steps: 2%|▏ | 16630/1000000 [9:13:26<2753:19:39, 10.08s/it, lr=1e-5, step_loss=0.00743]
Steps: 2%|▏ | 16631/1000000 [9:13:34<2555:03:36, 9.35s/it, lr=1e-5, step_loss=0.00743][RANK-0]: Step: [16631], local_loss=0.04561173543334007, train_loss=0.08726818859577179, time_cost=2.2019879817962646
+
Steps: 2%|▏ | 16631/1000000 [9:13:34<2555:03:36, 9.35s/it, lr=1e-5, step_loss=0.0456]
Steps: 2%|▏ | 16632/1000000 [9:13:45<2682:26:09, 9.82s/it, lr=1e-5, step_loss=0.0456][RANK-0]: Step: [16632], local_loss=0.0636102631688118, train_loss=0.043417803943157196, time_cost=2.633772134780884
+
Steps: 2%|▏ | 16632/1000000 [9:13:45<2682:26:09, 9.82s/it, lr=1e-5, step_loss=0.0636]
Steps: 2%|▏ | 16633/1000000 [9:13:52<2445:25:33, 8.95s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [16633], local_loss=0.00447761919349432, train_loss=0.050428468734025955, time_cost=2.5201263427734375
+
Steps: 2%|▏ | 16633/1000000 [9:13:52<2445:25:33, 8.95s/it, lr=1e-5, step_loss=0.00448]
Steps: 2%|▏ | 16634/1000000 [9:14:03<2645:21:50, 9.68s/it, lr=1e-5, step_loss=0.00448][RANK-0]: Step: [16634], local_loss=0.016546882688999176, train_loss=0.03337008133530617, time_cost=1.2815876007080078
+
Steps: 2%|▏ | 16634/1000000 [9:14:03<2645:21:50, 9.68s/it, lr=1e-5, step_loss=0.0165]
Steps: 2%|▏ | 16635/1000000 [9:14:15<2866:46:12, 10.49s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [16635], local_loss=0.9924830794334412, train_loss=0.1620829850435257, time_cost=4.777709245681763
+
Steps: 2%|▏ | 16635/1000000 [9:14:15<2866:46:12, 10.49s/it, lr=1e-5, step_loss=0.992]
Steps: 2%|▏ | 16636/1000000 [9:14:23<2606:03:27, 9.54s/it, lr=1e-5, step_loss=0.992][RANK-0]: Step: [16636], local_loss=0.05920318141579628, train_loss=0.02939983829855919, time_cost=2.5814452171325684
+
Steps: 2%|▏ | 16636/1000000 [9:14:23<2606:03:27, 9.54s/it, lr=1e-5, step_loss=0.0592]
Steps: 2%|▏ | 16637/1000000 [9:14:31<2503:30:23, 9.17s/it, lr=1e-5, step_loss=0.0592][RANK-0]: Step: [16637], local_loss=0.014471969567239285, train_loss=0.04496479034423828, time_cost=3.628870964050293
+
Steps: 2%|▏ | 16637/1000000 [9:14:31<2503:30:23, 9.17s/it, lr=1e-5, step_loss=0.0145]
Steps: 2%|▏ | 16638/1000000 [9:14:41<2588:26:30, 9.48s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [16638], local_loss=0.006404878571629524, train_loss=0.024493206292390823, time_cost=1.2559418678283691
+
Steps: 2%|▏ | 16638/1000000 [9:14:41<2588:26:30, 9.48s/it, lr=1e-5, step_loss=0.0064]
Steps: 2%|▏ | 16639/1000000 [9:14:46<2172:05:05, 7.95s/it, lr=1e-5, step_loss=0.0064][RANK-0]: Step: [16639], local_loss=0.06711854785680771, train_loss=0.08147920668125153, time_cost=1.824084758758545
+
Steps: 2%|▏ | 16639/1000000 [9:14:46<2172:05:05, 7.95s/it, lr=1e-5, step_loss=0.0671]
Steps: 2%|▏ | 16640/1000000 [9:14:52<2031:20:20, 7.44s/it, lr=1e-5, step_loss=0.0671][RANK-0]: Step: [16640], local_loss=0.015932483598589897, train_loss=0.028220171108841896, time_cost=2.0155930519104004
+
Steps: 2%|▏ | 16640/1000000 [9:14:52<2031:20:20, 7.44s/it, lr=1e-5, step_loss=0.0159]
Steps: 2%|▏ | 16641/1000000 [9:15:05<2526:37:35, 9.25s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [16641], local_loss=0.037950728088617325, train_loss=0.04332253336906433, time_cost=4.374740362167358
+
Steps: 2%|▏ | 16641/1000000 [9:15:05<2526:37:35, 9.25s/it, lr=1e-5, step_loss=0.038]
Steps: 2%|▏ | 16642/1000000 [9:15:14<2484:35:35, 9.10s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [16642], local_loss=0.014945205301046371, train_loss=0.10291440039873123, time_cost=2.670234441757202
+
Steps: 2%|▏ | 16642/1000000 [9:15:14<2484:35:35, 9.10s/it, lr=1e-5, step_loss=0.0149]
Steps: 2%|▏ | 16643/1000000 [9:15:25<2607:08:03, 9.54s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [16643], local_loss=0.009873218834400177, train_loss=0.1406514197587967, time_cost=1.5607340335845947
+
Steps: 2%|▏ | 16643/1000000 [9:15:25<2607:08:03, 9.54s/it, lr=1e-5, step_loss=0.00987]
Steps: 2%|▏ | 16644/1000000 [9:15:35<2706:19:32, 9.91s/it, lr=1e-5, step_loss=0.00987][RANK-0]: Step: [16644], local_loss=0.027698736637830734, train_loss=0.04115815460681915, time_cost=1.3065495491027832
+
Steps: 2%|▏ | 16644/1000000 [9:15:35<2706:19:32, 9.91s/it, lr=1e-5, step_loss=0.0277]
Steps: 2%|▏ | 16645/1000000 [9:15:42<2422:25:20, 8.87s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [16645], local_loss=0.013688388280570507, train_loss=0.042582787573337555, time_cost=1.2287611961364746
+
Steps: 2%|▏ | 16645/1000000 [9:15:42<2422:25:20, 8.87s/it, lr=1e-5, step_loss=0.0137]
Steps: 2%|▏ | 16646/1000000 [9:15:52<2563:43:03, 9.39s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [16646], local_loss=0.11416351050138474, train_loss=0.2542450726032257, time_cost=1.2722468376159668
+
Steps: 2%|▏ | 16646/1000000 [9:15:52<2563:43:03, 9.39s/it, lr=1e-5, step_loss=0.114]
Steps: 2%|▏ | 16647/1000000 [9:15:57<2204:31:29, 8.07s/it, lr=1e-5, step_loss=0.114][RANK-0]: Step: [16647], local_loss=0.010883299633860588, train_loss=0.08843208849430084, time_cost=2.201861619949341
+
Steps: 2%|▏ | 16647/1000000 [9:15:57<2204:31:29, 8.07s/it, lr=1e-5, step_loss=0.0109]
Steps: 2%|▏ | 16648/1000000 [9:16:08<2400:28:39, 8.79s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [16648], local_loss=0.006347865331918001, train_loss=0.04862990602850914, time_cost=1.827143669128418
+
Steps: 2%|▏ | 16648/1000000 [9:16:08<2400:28:39, 8.79s/it, lr=1e-5, step_loss=0.00635]
Steps: 2%|▏ | 16649/1000000 [9:16:15<2232:38:48, 8.17s/it, lr=1e-5, step_loss=0.00635][RANK-0]: Step: [16649], local_loss=0.008788447827100754, train_loss=0.02766287699341774, time_cost=2.5632801055908203
+
Steps: 2%|▏ | 16649/1000000 [9:16:15<2232:38:48, 8.17s/it, lr=1e-5, step_loss=0.00879]
Steps: 2%|▏ | 16650/1000000 [9:16:19<1922:46:18, 7.04s/it, lr=1e-5, step_loss=0.00879][RANK-0]: Step: [16650], local_loss=0.03760022297501564, train_loss=0.025507314130663872, time_cost=1.325289249420166
+
Steps: 2%|▏ | 16650/1000000 [9:16:19<1922:46:18, 7.04s/it, lr=1e-5, step_loss=0.0376]
Steps: 2%|▏ | 16651/1000000 [9:16:30<2228:01:59, 8.16s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [16651], local_loss=0.5983874797821045, train_loss=0.11655654013156891, time_cost=2.621293783187866
+
Steps: 2%|▏ | 16651/1000000 [9:16:30<2228:01:59, 8.16s/it, lr=1e-5, step_loss=0.598]
Steps: 2%|▏ | 16652/1000000 [9:16:40<2433:16:45, 8.91s/it, lr=1e-5, step_loss=0.598][RANK-0]: Step: [16652], local_loss=0.03644603118300438, train_loss=14.449850082397461, time_cost=3.157801389694214
+
Steps: 2%|▏ | 16652/1000000 [9:16:40<2433:16:45, 8.91s/it, lr=1e-5, step_loss=0.0364]
Steps: 2%|▏ | 16653/1000000 [9:16:46<2140:22:11, 7.84s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [16653], local_loss=0.010497855953872204, train_loss=0.019856229424476624, time_cost=1.593909740447998
+
Steps: 2%|▏ | 16653/1000000 [9:16:46<2140:22:11, 7.84s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 16654/1000000 [9:16:54<2151:48:23, 7.88s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [16654], local_loss=0.02931295521557331, train_loss=0.02712557278573513, time_cost=2.00828218460083
+
Steps: 2%|▏ | 16654/1000000 [9:16:54<2151:48:23, 7.88s/it, lr=1e-5, step_loss=0.0293]
Steps: 2%|▏ | 16655/1000000 [9:17:03<2234:30:08, 8.18s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [16655], local_loss=0.07255551964044571, train_loss=0.033743731677532196, time_cost=7.497137784957886
+
Steps: 2%|▏ | 16655/1000000 [9:17:03<2234:30:08, 8.18s/it, lr=1e-5, step_loss=0.0726]
Steps: 2%|▏ | 16656/1000000 [9:17:12<2317:45:31, 8.49s/it, lr=1e-5, step_loss=0.0726][RANK-0]: Step: [16656], local_loss=0.12203739583492279, train_loss=30.880992889404297, time_cost=3.1219210624694824
+
Steps: 2%|▏ | 16656/1000000 [9:17:12<2317:45:31, 8.49s/it, lr=1e-5, step_loss=0.122]
Steps: 2%|▏ | 16657/1000000 [9:17:16<1999:34:59, 7.32s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [16657], local_loss=0.007181593216955662, train_loss=0.020518828183412552, time_cost=1.957671880722046
+
Steps: 2%|▏ | 16657/1000000 [9:17:16<1999:34:59, 7.32s/it, lr=1e-5, step_loss=0.00718]
Steps: 2%|▏ | 16658/1000000 [9:17:29<2451:38:25, 8.98s/it, lr=1e-5, step_loss=0.00718][RANK-0]: Step: [16658], local_loss=0.058697499334812164, train_loss=0.024990063160657883, time_cost=4.036667346954346
+
Steps: 2%|▏ | 16658/1000000 [9:17:29<2451:38:25, 8.98s/it, lr=1e-5, step_loss=0.0587]
Steps: 2%|▏ | 16659/1000000 [9:17:35<2208:01:30, 8.08s/it, lr=1e-5, step_loss=0.0587][RANK-0]: Step: [16659], local_loss=0.04540519416332245, train_loss=0.050465114414691925, time_cost=1.7049641609191895
+
Steps: 2%|▏ | 16659/1000000 [9:17:35<2208:01:30, 8.08s/it, lr=1e-5, step_loss=0.0454]
Steps: 2%|▏ | 16660/1000000 [9:17:46<2431:20:55, 8.90s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [16660], local_loss=0.016240393742918968, train_loss=0.06632275879383087, time_cost=3.656306743621826
+
Steps: 2%|▏ | 16660/1000000 [9:17:46<2431:20:55, 8.90s/it, lr=1e-5, step_loss=0.0162]
Steps: 2%|▏ | 16661/1000000 [9:17:53<2295:24:01, 8.40s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [16661], local_loss=0.007026189938187599, train_loss=0.020812558010220528, time_cost=2.742567300796509
+
Steps: 2%|▏ | 16661/1000000 [9:17:53<2295:24:01, 8.40s/it, lr=1e-5, step_loss=0.00703]
Steps: 2%|▏ | 16662/1000000 [9:18:07<2709:32:17, 9.92s/it, lr=1e-5, step_loss=0.00703][RANK-0]: Step: [16662], local_loss=0.04687140882015228, train_loss=0.08885682374238968, time_cost=5.176695346832275
+
Steps: 2%|▏ | 16662/1000000 [9:18:07<2709:32:17, 9.92s/it, lr=1e-5, step_loss=0.0469]
Steps: 2%|▏ | 16663/1000000 [9:18:21<3062:12:54, 11.21s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [16663], local_loss=0.012539751827716827, train_loss=0.029941406100988388, time_cost=7.31307053565979
+
Steps: 2%|▏ | 16663/1000000 [9:18:21<3062:12:54, 11.21s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 16664/1000000 [9:18:31<2990:37:02, 10.95s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [16664], local_loss=0.011549477465450764, train_loss=0.13319483399391174, time_cost=1.2189013957977295
+
Steps: 2%|▏ | 16664/1000000 [9:18:31<2990:37:02, 10.95s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 16665/1000000 [9:18:37<2570:32:56, 9.41s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [16665], local_loss=0.006568768061697483, train_loss=0.01709633879363537, time_cost=1.829888105392456
+
Steps: 2%|▏ | 16665/1000000 [9:18:37<2570:32:56, 9.41s/it, lr=1e-5, step_loss=0.00657]
Steps: 2%|▏ | 16666/1000000 [9:18:45<2455:10:01, 8.99s/it, lr=1e-5, step_loss=0.00657][RANK-0]: Step: [16666], local_loss=0.01542256586253643, train_loss=0.04278060793876648, time_cost=3.996594190597534
+
Steps: 2%|▏ | 16666/1000000 [9:18:45<2455:10:01, 8.99s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 16667/1000000 [9:18:57<2700:43:05, 9.89s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [16667], local_loss=0.008214760571718216, train_loss=0.07532842457294464, time_cost=3.768545150756836
+
Steps: 2%|▏ | 16667/1000000 [9:18:57<2700:43:05, 9.89s/it, lr=1e-5, step_loss=0.00821]
Steps: 2%|▏ | 16668/1000000 [9:19:07<2677:15:31, 9.80s/it, lr=1e-5, step_loss=0.00821][RANK-0]: Step: [16668], local_loss=0.0034328377805650234, train_loss=0.03329119458794594, time_cost=3.967700719833374
+
Steps: 2%|▏ | 16668/1000000 [9:19:07<2677:15:31, 9.80s/it, lr=1e-5, step_loss=0.00343]
Steps: 2%|▏ | 16669/1000000 [9:19:14<2449:37:06, 8.97s/it, lr=1e-5, step_loss=0.00343][RANK-0]: Step: [16669], local_loss=0.015563743188977242, train_loss=0.039483293890953064, time_cost=2.696890354156494
+
Steps: 2%|▏ | 16669/1000000 [9:19:14<2449:37:06, 8.97s/it, lr=1e-5, step_loss=0.0156]
Steps: 2%|▏ | 16670/1000000 [9:19:26<2684:48:15, 9.83s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [16670], local_loss=0.005835402756929398, train_loss=0.029932746663689613, time_cost=3.19109845161438
+
Steps: 2%|▏ | 16670/1000000 [9:19:26<2684:48:15, 9.83s/it, lr=1e-5, step_loss=0.00584]
Steps: 2%|▏ | 16671/1000000 [9:19:31<2288:39:21, 8.38s/it, lr=1e-5, step_loss=0.00584][RANK-0]: Step: [16671], local_loss=0.008510916493833065, train_loss=0.050636932253837585, time_cost=2.024364709854126
+
Steps: 2%|▏ | 16671/1000000 [9:19:31<2288:39:21, 8.38s/it, lr=1e-5, step_loss=0.00851]
Steps: 2%|▏ | 16672/1000000 [9:19:41<2456:47:07, 8.99s/it, lr=1e-5, step_loss=0.00851][RANK-0]: Step: [16672], local_loss=0.006012219004333019, train_loss=0.03126172348856926, time_cost=1.2610197067260742
+
Steps: 2%|▏ | 16672/1000000 [9:19:41<2456:47:07, 8.99s/it, lr=1e-5, step_loss=0.00601]
Steps: 2%|▏ | 16673/1000000 [9:19:48<2263:40:10, 8.29s/it, lr=1e-5, step_loss=0.00601][RANK-0]: Step: [16673], local_loss=0.008043224923312664, train_loss=0.023389829322695732, time_cost=2.667156219482422
+
Steps: 2%|▏ | 16673/1000000 [9:19:48<2263:40:10, 8.29s/it, lr=1e-5, step_loss=0.00804]
Steps: 2%|▏ | 16674/1000000 [9:19:58<2451:19:51, 8.97s/it, lr=1e-5, step_loss=0.00804][RANK-0]: Step: [16674], local_loss=0.004723475780338049, train_loss=0.0212232805788517, time_cost=1.2583637237548828
+
Steps: 2%|▏ | 16674/1000000 [9:19:58<2451:19:51, 8.97s/it, lr=1e-5, step_loss=0.00472]
Steps: 2%|▏ | 16675/1000000 [9:20:06<2359:09:48, 8.64s/it, lr=1e-5, step_loss=0.00472][RANK-0]: Step: [16675], local_loss=0.017281372100114822, train_loss=0.0243731327354908, time_cost=3.8327410221099854
+
Steps: 2%|▏ | 16675/1000000 [9:20:06<2359:09:48, 8.64s/it, lr=1e-5, step_loss=0.0173]
Steps: 2%|▏ | 16676/1000000 [9:20:17<2512:30:27, 9.20s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [16676], local_loss=0.023665884509682655, train_loss=0.01610809937119484, time_cost=2.0733802318573
+
Steps: 2%|▏ | 16676/1000000 [9:20:17<2512:30:27, 9.20s/it, lr=1e-5, step_loss=0.0237]
Steps: 2%|▏ | 16677/1000000 [9:20:21<2141:52:16, 7.84s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [16677], local_loss=0.006823933683335781, train_loss=0.14760133624076843, time_cost=2.0769100189208984
+
Steps: 2%|▏ | 16677/1000000 [9:20:21<2141:52:16, 7.84s/it, lr=1e-5, step_loss=0.00682]
Steps: 2%|▏ | 16678/1000000 [9:20:30<2178:04:45, 7.97s/it, lr=1e-5, step_loss=0.00682][RANK-0]: Step: [16678], local_loss=0.13406136631965637, train_loss=0.1072646751999855, time_cost=4.211560010910034
+
Steps: 2%|▏ | 16678/1000000 [9:20:30<2178:04:45, 7.97s/it, lr=1e-5, step_loss=0.134]
Steps: 2%|▏ | 16679/1000000 [9:20:36<2017:31:32, 7.39s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [16679], local_loss=0.0405251607298851, train_loss=0.039150699973106384, time_cost=2.4450392723083496
+
Steps: 2%|▏ | 16679/1000000 [9:20:36<2017:31:32, 7.39s/it, lr=1e-5, step_loss=0.0405]
Steps: 2%|▏ | 16680/1000000 [9:20:43<2042:00:58, 7.48s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [16680], local_loss=0.015659818425774574, train_loss=0.022668596357107162, time_cost=4.119824647903442
+
Steps: 2%|▏ | 16680/1000000 [9:20:43<2042:00:58, 7.48s/it, lr=1e-5, step_loss=0.0157]
Steps: 2%|▏ | 16681/1000000 [9:20:57<2521:27:37, 9.23s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [16681], local_loss=0.03395526856184006, train_loss=0.011100495234131813, time_cost=1.2598485946655273
[... per-step training log, steps 16681–16892 of 1000000 (2%), elapsed 9:20:57–9:53:52, lr=1e-5 throughout, 5.5–12.8 s/it; local_loss mostly in 0.003–0.5 and train_loss mostly in 0.01–0.2, with occasional spikes: step 16688 (train_loss=8.47), 16731 (train_loss=17.50), 16743 (train_loss=9.51), 16765 (train_loss=43.88), 16803 (train_loss=20.42), 16835 (train_loss=4.79), 16839 (local_loss=170.93, train_loss=21.45), 16846 (local_loss=52.89, train_loss=6.63); duplicated tqdm re-renders and empty diff lines elided ...]
Steps: 2%|▏ | 16892/1000000 [9:53:52<2362:30:15, 8.65s/it, lr=1e-5, step_loss=0.0317] /home/image_data/hxy/Open-Sora-Plan/opensora/utils/utils.py:369: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
+ caption = BeautifulSoup(caption, features='html.parser').text
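[Editor's note, not part of the log: the MarkupResemblesLocatorWarning above is raised by bs4 when a short, filename-like caption string is passed to BeautifulSoup at opensora/utils/utils.py:369. A minimal sketch of one way to keep the HTML-stripping behavior while silencing the warning follows; the clean_caption name is illustrative, not the repo's actual function.]

import warnings
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

# Filename-like captions trigger this warning even though stripping markup
# from them is harmless; filtering the warning class keeps the training log clean.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

def clean_caption(caption: str) -> str:
    # Remove any HTML tags; plain-text captions pass through unchanged.
    return BeautifulSoup(caption, features="html.parser").text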
+
Steps: 2%|▏ | 16893/1000000 [9:54:03<2563:56:45, 9.39s/it, lr=1e-5, step_loss=0.0317][RANK-0]: Step: [16893], local_loss=0.012530133128166199, train_loss=0.02922319620847702, time_cost=1.9262630939483643
+
Steps: 2%|▏ | 16893/1000000 [9:54:03<2563:56:45, 9.39s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 16894/1000000 [9:54:18<3054:07:41, 11.18s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [16894], local_loss=0.0967797338962555, train_loss=0.05058060213923454, time_cost=7.799904108047485
+
Steps: 2%|▏ | 16894/1000000 [9:54:18<3054:07:41, 11.18s/it, lr=1e-5, step_loss=0.0968]
Steps: 2%|▏ | 16895/1000000 [9:54:29<3052:58:48, 11.18s/it, lr=1e-5, step_loss=0.0968][RANK-0]: Step: [16895], local_loss=0.04476756975054741, train_loss=0.021468836814165115, time_cost=3.865584373474121
+
Steps: 2%|▏ | 16895/1000000 [9:54:29<3052:58:48, 11.18s/it, lr=1e-5, step_loss=0.0448]
Steps: 2%|▏ | 16896/1000000 [9:54:39<2921:54:42, 10.70s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [16896], local_loss=0.011538650840520859, train_loss=0.038902610540390015, time_cost=4.384485960006714
+
Steps: 2%|▏ | 16896/1000000 [9:54:39<2921:54:42, 10.70s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 16897/1000000 [9:54:50<2933:10:07, 10.74s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [16897], local_loss=0.004901539534330368, train_loss=0.013323131948709488, time_cost=4.6193528175354
+
Steps: 2%|▏ | 16897/1000000 [9:54:50<2933:10:07, 10.74s/it, lr=1e-5, step_loss=0.0049]
Steps: 2%|▏ | 16898/1000000 [9:54:56<2609:59:21, 9.56s/it, lr=1e-5, step_loss=0.0049][RANK-0]: Step: [16898], local_loss=0.022280031815171242, train_loss=0.0535908006131649, time_cost=2.1326711177825928
+
Steps: 2%|▏ | 16898/1000000 [9:54:56<2609:59:21, 9.56s/it, lr=1e-5, step_loss=0.0223]
Steps: 2%|▏ | 16899/1000000 [9:55:02<2290:18:42, 8.39s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [16899], local_loss=0.005996644031256437, train_loss=0.049926720559597015, time_cost=4.469437837600708
+
Steps: 2%|▏ | 16899/1000000 [9:55:02<2290:18:42, 8.39s/it, lr=1e-5, step_loss=0.006]
Steps: 2%|▏ | 16900/1000000 [9:55:11<2371:30:26, 8.68s/it, lr=1e-5, step_loss=0.006][RANK-0]: Step: [16900], local_loss=0.0634877160191536, train_loss=0.02842779830098152, time_cost=3.779961347579956
+
Steps: 2%|▏ | 16900/1000000 [9:55:11<2371:30:26, 8.68s/it, lr=1e-5, step_loss=0.0635]
Steps: 2%|▏ | 16901/1000000 [9:55:19<2273:21:52, 8.32s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [16901], local_loss=0.014493877999484539, train_loss=0.045488204807043076, time_cost=3.926111936569214
+
Steps: 2%|▏ | 16901/1000000 [9:55:19<2273:21:52, 8.32s/it, lr=1e-5, step_loss=0.0145]
Steps: 2%|▏ | 16902/1000000 [9:55:34<2863:56:45, 10.49s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [16902], local_loss=0.030121654272079468, train_loss=0.018798604607582092, time_cost=12.2548828125
+
Steps: 2%|▏ | 16902/1000000 [9:55:34<2863:56:45, 10.49s/it, lr=1e-5, step_loss=0.0301]
Steps: 2%|▏ | 16903/1000000 [9:55:46<2941:09:11, 10.77s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [16903], local_loss=0.01518678106367588, train_loss=0.01811130903661251, time_cost=2.8539693355560303
+
Steps: 2%|▏ | 16903/1000000 [9:55:46<2941:09:11, 10.77s/it, lr=1e-5, step_loss=0.0152]
Steps: 2%|▏ | 16904/1000000 [9:55:50<2433:53:39, 8.91s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [16904], local_loss=0.02434595301747322, train_loss=0.015188589692115784, time_cost=1.8773887157440186
+
Steps: 2%|▏ | 16904/1000000 [9:55:50<2433:53:39, 8.91s/it, lr=1e-5, step_loss=0.0243]
Steps: 2%|▏ | 16905/1000000 [9:55:56<2140:40:19, 7.84s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [16905], local_loss=0.0148770147934556, train_loss=0.02097538858652115, time_cost=1.7404499053955078
+
Steps: 2%|▏ | 16905/1000000 [9:55:56<2140:40:19, 7.84s/it, lr=1e-5, step_loss=0.0149]
Steps: 2%|▏ | 16906/1000000 [9:56:08<2538:48:35, 9.30s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [16906], local_loss=0.039507847279310226, train_loss=0.1965041309595108, time_cost=5.956130027770996
+
Steps: 2%|▏ | 16906/1000000 [9:56:09<2538:48:35, 9.30s/it, lr=1e-5, step_loss=0.0395]
Steps: 2%|▏ | 16907/1000000 [9:56:21<2806:16:28, 10.28s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [16907], local_loss=0.04088054969906807, train_loss=0.024403348565101624, time_cost=1.1884722709655762
+
Steps: 2%|▏ | 16907/1000000 [9:56:21<2806:16:28, 10.28s/it, lr=1e-5, step_loss=0.0409]
Steps: 2%|▏ | 16908/1000000 [9:56:36<3214:58:53, 11.77s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [16908], local_loss=0.02316349744796753, train_loss=0.017910756170749664, time_cost=7.70176887512207
+
Steps: 2%|▏ | 16908/1000000 [9:56:36<3214:58:53, 11.77s/it, lr=1e-5, step_loss=0.0232]
Steps: 2%|▏ | 16909/1000000 [9:56:41<2666:15:28, 9.76s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [16909], local_loss=0.005215676501393318, train_loss=0.04737866297364235, time_cost=2.931572198867798
+
Steps: 2%|▏ | 16909/1000000 [9:56:41<2666:15:28, 9.76s/it, lr=1e-5, step_loss=0.00522]
Steps: 2%|▏ | 16910/1000000 [9:56:47<2290:42:37, 8.39s/it, lr=1e-5, step_loss=0.00522][RANK-0]: Step: [16910], local_loss=0.03831646963953972, train_loss=0.02327214926481247, time_cost=1.7869873046875
+
Steps: 2%|▏ | 16910/1000000 [9:56:47<2290:42:37, 8.39s/it, lr=1e-5, step_loss=0.0383]
Steps: 2%|▏ | 16911/1000000 [9:56:56<2379:07:13, 8.71s/it, lr=1e-5, step_loss=0.0383][RANK-0]: Step: [16911], local_loss=0.005871286615729332, train_loss=26.70825958251953, time_cost=3.6944141387939453
+
Steps: 2%|▏ | 16911/1000000 [9:56:56<2379:07:13, 8.71s/it, lr=1e-5, step_loss=0.00587]
Steps: 2%|▏ | 16912/1000000 [9:57:04<2318:35:41, 8.49s/it, lr=1e-5, step_loss=0.00587][RANK-0]: Step: [16912], local_loss=0.013582433573901653, train_loss=0.0307847261428833, time_cost=1.1962921619415283
+
Steps: 2%|▏ | 16912/1000000 [9:57:04<2318:35:41, 8.49s/it, lr=1e-5, step_loss=0.0136]
Steps: 2%|▏ | 16913/1000000 [9:57:09<2044:46:50, 7.49s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [16913], local_loss=0.05397747829556465, train_loss=0.026350781321525574, time_cost=4.3944103717803955
+
Steps: 2%|▏ | 16913/1000000 [9:57:09<2044:46:50, 7.49s/it, lr=1e-5, step_loss=0.054]
Steps: 2%|▏ | 16914/1000000 [9:57:18<2186:28:32, 8.01s/it, lr=1e-5, step_loss=0.054][RANK-0]: Step: [16914], local_loss=0.14307451248168945, train_loss=0.045683518052101135, time_cost=3.6703741550445557
+
Steps: 2%|▏ | 16914/1000000 [9:57:18<2186:28:32, 8.01s/it, lr=1e-5, step_loss=0.143]
Steps: 2%|▏ | 16915/1000000 [9:57:25<2109:06:47, 7.72s/it, lr=1e-5, step_loss=0.143][RANK-0]: Step: [16915], local_loss=0.04799384996294975, train_loss=0.06251519173383713, time_cost=1.6848962306976318
+
Steps: 2%|▏ | 16915/1000000 [9:57:25<2109:06:47, 7.72s/it, lr=1e-5, step_loss=0.048]
Steps: 2%|▏ | 16916/1000000 [9:57:39<2583:24:13, 9.46s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [16916], local_loss=0.020271368324756622, train_loss=0.05673827975988388, time_cost=3.8025310039520264
+
Steps: 2%|▏ | 16916/1000000 [9:57:39<2583:24:13, 9.46s/it, lr=1e-5, step_loss=0.0203]
Steps: 2%|▏ | 16917/1000000 [9:57:50<2738:16:21, 10.03s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [16917], local_loss=0.00519444840028882, train_loss=0.06046123057603836, time_cost=4.665292739868164
+
Steps: 2%|▏ | 16917/1000000 [9:57:50<2738:16:21, 10.03s/it, lr=1e-5, step_loss=0.00519]
Steps: 2%|▏ | 16918/1000000 [9:57:59<2643:46:37, 9.68s/it, lr=1e-5, step_loss=0.00519][RANK-0]: Step: [16918], local_loss=0.08972971886396408, train_loss=0.16220255196094513, time_cost=2.790803909301758
+
Steps: 2%|▏ | 16918/1000000 [9:57:59<2643:46:37, 9.68s/it, lr=1e-5, step_loss=0.0897]
Steps: 2%|▏ | 16919/1000000 [9:58:05<2350:11:32, 8.61s/it, lr=1e-5, step_loss=0.0897][RANK-0]: Step: [16919], local_loss=0.04714680463075638, train_loss=0.031719870865345, time_cost=2.596656084060669
+
Steps: 2%|▏ | 16919/1000000 [9:58:05<2350:11:32, 8.61s/it, lr=1e-5, step_loss=0.0471]
Steps: 2%|▏ | 16920/1000000 [9:58:20<2816:24:59, 10.31s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [16920], local_loss=0.01969633810222149, train_loss=0.03035275638103485, time_cost=1.988886833190918
+
Steps: 2%|▏ | 16920/1000000 [9:58:20<2816:24:59, 10.31s/it, lr=1e-5, step_loss=0.0197]
Steps: 2%|▏ | 16921/1000000 [9:58:24<2324:27:07, 8.51s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [16921], local_loss=0.05496100336313248, train_loss=0.04265356808900833, time_cost=1.3778448104858398
+
Steps: 2%|▏ | 16921/1000000 [9:58:24<2324:27:07, 8.51s/it, lr=1e-5, step_loss=0.055]
Steps: 2%|▏ | 16922/1000000 [9:58:37<2712:24:22, 9.93s/it, lr=1e-5, step_loss=0.055][RANK-0]: Step: [16922], local_loss=0.1654180884361267, train_loss=0.04376446455717087, time_cost=1.8658685684204102
+
Steps: 2%|▏ | 16922/1000000 [9:58:37<2712:24:22, 9.93s/it, lr=1e-5, step_loss=0.165]
Steps: 2%|▏ | 16923/1000000 [9:58:42<2291:10:50, 8.39s/it, lr=1e-5, step_loss=0.165][RANK-0]: Step: [16923], local_loss=0.004288474563509226, train_loss=0.13036730885505676, time_cost=1.873732566833496
+
Steps: 2%|▏ | 16923/1000000 [9:58:42<2291:10:50, 8.39s/it, lr=1e-5, step_loss=0.00429]
Steps: 2%|▏ | 16924/1000000 [9:58:57<2804:09:36, 10.27s/it, lr=1e-5, step_loss=0.00429][RANK-0]: Step: [16924], local_loss=0.00795456487685442, train_loss=0.019172193482518196, time_cost=5.620039224624634
+
Steps: 2%|▏ | 16924/1000000 [9:58:57<2804:09:36, 10.27s/it, lr=1e-5, step_loss=0.00795]
Steps: 2%|▏ | 16925/1000000 [9:59:10<3053:26:38, 11.18s/it, lr=1e-5, step_loss=0.00795][RANK-0]: Step: [16925], local_loss=0.010545270517468452, train_loss=0.023267341777682304, time_cost=10.051693677902222
+
Steps: 2%|▏ | 16925/1000000 [9:59:10<3053:26:38, 11.18s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 16926/1000000 [9:59:21<3066:28:11, 11.23s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [16926], local_loss=0.2940763235092163, train_loss=0.05556964874267578, time_cost=2.0753166675567627
+
Steps: 2%|▏ | 16926/1000000 [9:59:21<3066:28:11, 11.23s/it, lr=1e-5, step_loss=0.294]
Steps: 2%|▏ | 16927/1000000 [9:59:34<3173:34:44, 11.62s/it, lr=1e-5, step_loss=0.294][RANK-0]: Step: [16927], local_loss=0.09342624247074127, train_loss=0.03793967142701149, time_cost=6.175010681152344
+
Steps: 2%|▏ | 16927/1000000 [9:59:34<3173:34:44, 11.62s/it, lr=1e-5, step_loss=0.0934]
Steps: 2%|▏ | 16928/1000000 [9:59:47<3309:23:00, 12.12s/it, lr=1e-5, step_loss=0.0934][RANK-0]: Step: [16928], local_loss=0.01797153428196907, train_loss=0.10692138969898224, time_cost=6.546281814575195
+
Steps: 2%|▏ | 16928/1000000 [9:59:47<3309:23:00, 12.12s/it, lr=1e-5, step_loss=0.018]
Steps: 2%|▏ | 16929/1000000 [9:59:55<2966:34:34, 10.86s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [16929], local_loss=0.022424669936299324, train_loss=0.011408008635044098, time_cost=4.548857688903809
+
Steps: 2%|▏ | 16929/1000000 [9:59:55<2966:34:34, 10.86s/it, lr=1e-5, step_loss=0.0224]
Steps: 2%|▏ | 16930/1000000 [10:00:04<2816:21:20, 10.31s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [16930], local_loss=0.9853650331497192, train_loss=0.13626058399677277, time_cost=2.9011828899383545
+
Steps: 2%|▏ | 16930/1000000 [10:00:04<2816:21:20, 10.31s/it, lr=1e-5, step_loss=0.985]
Steps: 2%|▏ | 16931/1000000 [10:00:08<2326:45:50, 8.52s/it, lr=1e-5, step_loss=0.985][RANK-0]: Step: [16931], local_loss=0.22833919525146484, train_loss=0.17915299534797668, time_cost=1.732717514038086
+
Steps: 2%|▏ | 16931/1000000 [10:00:08<2326:45:50, 8.52s/it, lr=1e-5, step_loss=0.228]
Steps: 2%|▏ | 16932/1000000 [10:00:20<2573:18:58, 9.42s/it, lr=1e-5, step_loss=0.228][RANK-0]: Step: [16932], local_loss=0.08339173346757889, train_loss=0.025294940918684006, time_cost=8.315983295440674
+
Steps: 2%|▏ | 16932/1000000 [10:00:20<2573:18:58, 9.42s/it, lr=1e-5, step_loss=0.0834]
Steps: 2%|▏ | 16933/1000000 [10:00:32<2828:50:41, 10.36s/it, lr=1e-5, step_loss=0.0834][RANK-0]: Step: [16933], local_loss=0.006290892604738474, train_loss=0.03325963020324707, time_cost=3.8125252723693848
+
Steps: 2%|▏ | 16933/1000000 [10:00:32<2828:50:41, 10.36s/it, lr=1e-5, step_loss=0.00629]
Steps: 2%|▏ | 16934/1000000 [10:00:39<2496:03:34, 9.14s/it, lr=1e-5, step_loss=0.00629][RANK-0]: Step: [16934], local_loss=0.008451610803604126, train_loss=0.029879257082939148, time_cost=2.6175732612609863
+
Steps: 2%|▏ | 16934/1000000 [10:00:39<2496:03:34, 9.14s/it, lr=1e-5, step_loss=0.00845]
Steps: 2%|▏ | 16935/1000000 [10:00:46<2319:28:34, 8.49s/it, lr=1e-5, step_loss=0.00845][RANK-0]: Step: [16935], local_loss=0.022581921890378, train_loss=0.024741891771554947, time_cost=2.371865749359131
+
Steps: 2%|▏ | 16935/1000000 [10:00:46<2319:28:34, 8.49s/it, lr=1e-5, step_loss=0.0226]
Steps: 2%|▏ | 16936/1000000 [10:00:52<2111:16:21, 7.73s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [16936], local_loss=0.006044288165867329, train_loss=0.02204989641904831, time_cost=2.4032294750213623
+
Steps: 2%|▏ | 16936/1000000 [10:00:52<2111:16:21, 7.73s/it, lr=1e-5, step_loss=0.00604]
Steps: 2%|▏ | 16937/1000000 [10:01:06<2626:36:19, 9.62s/it, lr=1e-5, step_loss=0.00604][RANK-0]: Step: [16937], local_loss=0.9903552532196045, train_loss=0.1439492404460907, time_cost=2.622979164123535
+
Steps: 2%|▏ | 16937/1000000 [10:01:06<2626:36:19, 9.62s/it, lr=1e-5, step_loss=0.99]
Steps: 2%|▏ | 16938/1000000 [10:01:15<2611:54:56, 9.56s/it, lr=1e-5, step_loss=0.99][RANK-0]: Step: [16938], local_loss=0.9999889135360718, train_loss=0.1486106961965561, time_cost=3.9156813621520996
+
Steps: 2%|▏ | 16938/1000000 [10:01:15<2611:54:56, 9.56s/it, lr=1e-5, step_loss=1]
Steps: 2%|▏ | 16939/1000000 [10:01:26<2747:06:17, 10.06s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [16939], local_loss=0.02408078871667385, train_loss=0.030920952558517456, time_cost=2.407726526260376
+
Steps: 2%|▏ | 16939/1000000 [10:01:26<2747:06:17, 10.06s/it, lr=1e-5, step_loss=0.0241]
Steps: 2%|▏ | 16940/1000000 [10:01:36<2694:54:52, 9.87s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [16940], local_loss=0.00331237749196589, train_loss=0.01186673529446125, time_cost=2.221776247024536
+
Steps: 2%|▏ | 16940/1000000 [10:01:36<2694:54:52, 9.87s/it, lr=1e-5, step_loss=0.00331]
Steps: 2%|▏ | 16941/1000000 [10:01:46<2743:12:35, 10.05s/it, lr=1e-5, step_loss=0.00331][RANK-0]: Step: [16941], local_loss=0.009857213124632835, train_loss=7.1513590812683105, time_cost=2.1735236644744873
+
Steps: 2%|▏ | 16941/1000000 [10:01:46<2743:12:35, 10.05s/it, lr=1e-5, step_loss=0.00986]
Steps: 2%|▏ | 16942/1000000 [10:01:57<2807:25:35, 10.28s/it, lr=1e-5, step_loss=0.00986][RANK-0]: Step: [16942], local_loss=0.07924185693264008, train_loss=0.03576327860355377, time_cost=1.9430019855499268
+
Steps: 2%|▏ | 16942/1000000 [10:01:57<2807:25:35, 10.28s/it, lr=1e-5, step_loss=0.0792]
Steps: 2%|▏ | 16943/1000000 [10:02:03<2427:20:47, 8.89s/it, lr=1e-5, step_loss=0.0792][RANK-0]: Step: [16943], local_loss=0.011565967462956905, train_loss=0.030280787497758865, time_cost=1.2880828380584717
+
Steps: 2%|▏ | 16943/1000000 [10:02:03<2427:20:47, 8.89s/it, lr=1e-5, step_loss=0.0116]
Steps: 2%|▏ | 16944/1000000 [10:02:07<2078:17:12, 7.61s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [16944], local_loss=0.01822158880531788, train_loss=0.042174819856882095, time_cost=1.9504146575927734
+
Steps: 2%|▏ | 16944/1000000 [10:02:07<2078:17:12, 7.61s/it, lr=1e-5, step_loss=0.0182]
Steps: 2%|▏ | 16945/1000000 [10:02:15<2065:53:33, 7.57s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [16945], local_loss=0.033981770277023315, train_loss=0.03954920545220375, time_cost=5.218738794326782
+
Steps: 2%|▏ | 16945/1000000 [10:02:15<2065:53:33, 7.57s/it, lr=1e-5, step_loss=0.034]
Steps: 2%|▏ | 16946/1000000 [10:02:30<2709:50:50, 9.92s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [16946], local_loss=0.005565114319324493, train_loss=0.1577294021844864, time_cost=7.951881408691406
+
Steps: 2%|▏ | 16946/1000000 [10:02:30<2709:50:50, 9.92s/it, lr=1e-5, step_loss=0.00557]
Steps: 2%|▏ | 16947/1000000 [10:02:40<2718:40:38, 9.96s/it, lr=1e-5, step_loss=0.00557][RANK-0]: Step: [16947], local_loss=0.00796005129814148, train_loss=0.06924330443143845, time_cost=1.2089097499847412
+
Steps: 2%|▏ | 16947/1000000 [10:02:40<2718:40:38, 9.96s/it, lr=1e-5, step_loss=0.00796]
Steps: 2%|▏ | 16948/1000000 [10:02:52<2835:39:43, 10.38s/it, lr=1e-5, step_loss=0.00796][RANK-0]: Step: [16948], local_loss=0.07762607932090759, train_loss=0.08128741383552551, time_cost=2.216914415359497
+
Steps: 2%|▏ | 16948/1000000 [10:02:52<2835:39:43, 10.38s/it, lr=1e-5, step_loss=0.0776]
Steps: 2%|▏ | 16949/1000000 [10:02:59<2613:36:20, 9.57s/it, lr=1e-5, step_loss=0.0776][RANK-0]: Step: [16949], local_loss=0.02373199164867401, train_loss=0.02209293283522129, time_cost=2.4772257804870605
+
Steps: 2%|▏ | 16949/1000000 [10:02:59<2613:36:20, 9.57s/it, lr=1e-5, step_loss=0.0237]
Steps: 2%|▏ | 16950/1000000 [10:03:11<2760:10:24, 10.11s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [16950], local_loss=0.1513897031545639, train_loss=0.052216216921806335, time_cost=3.616481065750122
+
Steps: 2%|▏ | 16950/1000000 [10:03:11<2760:10:24, 10.11s/it, lr=1e-5, step_loss=0.151]
Steps: 2%|▏ | 16951/1000000 [10:03:16<2354:04:31, 8.62s/it, lr=1e-5, step_loss=0.151][RANK-0]: Step: [16951], local_loss=0.08068841695785522, train_loss=0.030960317701101303, time_cost=1.2713587284088135
+
Steps: 2%|▏ | 16951/1000000 [10:03:16<2354:04:31, 8.62s/it, lr=1e-5, step_loss=0.0807]
Steps: 2%|▏ | 16952/1000000 [10:03:30<2817:26:35, 10.32s/it, lr=1e-5, step_loss=0.0807][RANK-0]: Step: [16952], local_loss=0.023022310808300972, train_loss=0.027003740891814232, time_cost=5.21203351020813
+
Steps: 2%|▏ | 16952/1000000 [10:03:30<2817:26:35, 10.32s/it, lr=1e-5, step_loss=0.023]
Steps: 2%|▏ | 16953/1000000 [10:03:42<2951:16:36, 10.81s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [16953], local_loss=0.00661213556304574, train_loss=0.060466937720775604, time_cost=3.1174168586730957
+
Steps: 2%|▏ | 16953/1000000 [10:03:42<2951:16:36, 10.81s/it, lr=1e-5, step_loss=0.00661]
Steps: 2%|▏ | 16954/1000000 [10:03:47<2459:48:35, 9.01s/it, lr=1e-5, step_loss=0.00661][RANK-0]: Step: [16954], local_loss=0.022185832262039185, train_loss=0.042826030403375626, time_cost=2.528249979019165
+
Steps: 2%|▏ | 16954/1000000 [10:03:47<2459:48:35, 9.01s/it, lr=1e-5, step_loss=0.0222]
Steps: 2%|▏ | 16955/1000000 [10:03:52<2137:21:42, 7.83s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [16955], local_loss=0.02417066879570484, train_loss=0.04861386865377426, time_cost=3.8310515880584717
+
Steps: 2%|▏ | 16955/1000000 [10:03:52<2137:21:42, 7.83s/it, lr=1e-5, step_loss=0.0242]
Steps: 2%|▏ | 16956/1000000 [10:03:59<2062:56:05, 7.55s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [16956], local_loss=0.015432419255375862, train_loss=0.0346885547041893, time_cost=1.5753741264343262
+
Steps: 2%|▏ | 16956/1000000 [10:03:59<2062:56:05, 7.55s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 16957/1000000 [10:04:08<2156:36:55, 7.90s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [16957], local_loss=0.022749798372387886, train_loss=0.07206814736127853, time_cost=1.1988158226013184
+
Steps: 2%|▏ | 16957/1000000 [10:04:08<2156:36:55, 7.90s/it, lr=1e-5, step_loss=0.0227]
Steps: 2%|▏ | 16958/1000000 [10:04:12<1878:00:56, 6.88s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [16958], local_loss=0.05522514134645462, train_loss=0.06934119760990143, time_cost=1.6372706890106201
+
Steps: 2%|▏ | 16958/1000000 [10:04:12<1878:00:56, 6.88s/it, lr=1e-5, step_loss=0.0552]
Steps: 2%|▏ | 16959/1000000 [10:04:16<1662:07:52, 6.09s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [16959], local_loss=0.018012627959251404, train_loss=0.045883215963840485, time_cost=1.4389123916625977
+
Steps: 2%|▏ | 16959/1000000 [10:04:16<1662:07:52, 6.09s/it, lr=1e-5, step_loss=0.018]
Steps: 2%|▏ | 16960/1000000 [10:04:29<2203:33:14, 8.07s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [16960], local_loss=1.019602656364441, train_loss=0.17567838728427887, time_cost=1.193474531173706
+
Steps: 2%|▏ | 16960/1000000 [10:04:29<2203:33:14, 8.07s/it, lr=1e-5, step_loss=1.02]
Steps: 2%|▏ | 16961/1000000 [10:04:37<2171:52:38, 7.95s/it, lr=1e-5, step_loss=1.02][RANK-0]: Step: [16961], local_loss=0.005667855031788349, train_loss=0.021630477160215378, time_cost=3.9845967292785645
+
Steps: 2%|▏ | 16961/1000000 [10:04:37<2171:52:38, 7.95s/it, lr=1e-5, step_loss=0.00567]
Steps: 2%|▏ | 16962/1000000 [10:04:44<2130:43:07, 7.80s/it, lr=1e-5, step_loss=0.00567][RANK-0]: Step: [16962], local_loss=0.025258611887693405, train_loss=0.058094605803489685, time_cost=3.0305328369140625
+
Steps: 2%|▏ | 16962/1000000 [10:04:44<2130:43:07, 7.80s/it, lr=1e-5, step_loss=0.0253]
Steps: 2%|▏ | 16963/1000000 [10:04:48<1823:39:32, 6.68s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [16963], local_loss=0.04873114451766014, train_loss=0.09168240427970886, time_cost=1.3624906539916992
+
Steps: 2%|▏ | 16963/1000000 [10:04:48<1823:39:32, 6.68s/it, lr=1e-5, step_loss=0.0487]
Steps: 2%|▏ | 16964/1000000 [10:05:02<2402:23:31, 8.80s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [16964], local_loss=0.03478807210922241, train_loss=0.03314162790775299, time_cost=9.752309322357178
+
Steps: 2%|▏ | 16964/1000000 [10:05:02<2402:23:31, 8.80s/it, lr=1e-5, step_loss=0.0348]
Steps: 2%|▏ | 16965/1000000 [10:05:08<2146:39:10, 7.86s/it, lr=1e-5, step_loss=0.0348][RANK-0]: Step: [16965], local_loss=0.04365193843841553, train_loss=0.07912140339612961, time_cost=1.2876386642456055
+
Steps: 2%|▏ | 16965/1000000 [10:05:08<2146:39:10, 7.86s/it, lr=1e-5, step_loss=0.0437]
Steps: 2%|▏ | 16966/1000000 [10:05:22<2671:46:49, 9.78s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [16966], local_loss=0.0651305764913559, train_loss=0.06883548945188522, time_cost=5.819165945053101
+
Steps: 2%|▏ | 16966/1000000 [10:05:22<2671:46:49, 9.78s/it, lr=1e-5, step_loss=0.0651]
Steps: 2%|▏ | 16967/1000000 [10:05:36<3046:55:07, 11.16s/it, lr=1e-5, step_loss=0.0651][RANK-0]: Step: [16967], local_loss=0.023353077471256256, train_loss=0.025528857484459877, time_cost=8.927243709564209
+
Steps: 2%|▏ | 16967/1000000 [10:05:36<3046:55:07, 11.16s/it, lr=1e-5, step_loss=0.0234]
Steps: 2%|▏ | 16968/1000000 [10:05:50<3232:31:02, 11.84s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [16968], local_loss=0.013552075251936913, train_loss=0.019338458776474, time_cost=2.4516923427581787
+
Steps: 2%|▏ | 16968/1000000 [10:05:50<3232:31:02, 11.84s/it, lr=1e-5, step_loss=0.0136]
Steps: 2%|▏ | 16969/1000000 [10:06:01<3182:32:53, 11.65s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [16969], local_loss=0.005732052493840456, train_loss=0.008740484714508057, time_cost=3.459278106689453
+
Steps: 2%|▏ | 16969/1000000 [10:06:01<3182:32:53, 11.65s/it, lr=1e-5, step_loss=0.00573]
Steps: 2%|▏ | 16970/1000000 [10:06:14<3306:41:51, 12.11s/it, lr=1e-5, step_loss=0.00573][RANK-0]: Step: [16970], local_loss=0.012480362318456173, train_loss=0.02934947982430458, time_cost=1.2816269397735596
+
Steps: 2%|▏ | 16970/1000000 [10:06:14<3306:41:51, 12.11s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 16971/1000000 [10:06:23<3047:11:24, 11.16s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [16971], local_loss=0.06877107918262482, train_loss=5.525625705718994, time_cost=3.2532382011413574
+
Steps: 2%|▏ | 16971/1000000 [10:06:23<3047:11:24, 11.16s/it, lr=1e-5, step_loss=0.0688]
Steps: 2%|▏ | 16972/1000000 [10:06:34<3004:40:43, 11.00s/it, lr=1e-5, step_loss=0.0688][RANK-0]: Step: [16972], local_loss=0.043528392910957336, train_loss=0.03267799690365791, time_cost=1.3365068435668945
+
Steps: 2%|▏ | 16972/1000000 [10:06:34<3004:40:43, 11.00s/it, lr=1e-5, step_loss=0.0435]
Steps: 2%|▏ | 16973/1000000 [10:06:45<3051:12:27, 11.17s/it, lr=1e-5, step_loss=0.0435][RANK-0]: Step: [16973], local_loss=0.01609361357986927, train_loss=0.04351959377527237, time_cost=5.31736421585083
+
Steps: 2%|▏ | 16973/1000000 [10:06:45<3051:12:27, 11.17s/it, lr=1e-5, step_loss=0.0161]
Steps: 2%|▏ | 16974/1000000 [10:06:51<2594:02:53, 9.50s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [16974], local_loss=0.08690910786390305, train_loss=0.028812184929847717, time_cost=2.0309932231903076
+
Steps: 2%|▏ | 16974/1000000 [10:06:51<2594:02:53, 9.50s/it, lr=1e-5, step_loss=0.0869]
Steps: 2%|▏ | 16975/1000000 [10:07:04<2907:40:25, 10.65s/it, lr=1e-5, step_loss=0.0869][RANK-0]: Step: [16975], local_loss=0.0387265719473362, train_loss=0.07040584087371826, time_cost=11.276785612106323
+
Steps: 2%|▏ | 16975/1000000 [10:07:04<2907:40:25, 10.65s/it, lr=1e-5, step_loss=0.0387]
Steps: 2%|▏ | 16976/1000000 [10:07:09<2424:40:14, 8.88s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [16976], local_loss=0.33022540807724, train_loss=0.08049185574054718, time_cost=2.492464542388916
+
Steps: 2%|▏ | 16976/1000000 [10:07:09<2424:40:14, 8.88s/it, lr=1e-5, step_loss=0.33]
Steps: 2%|▏ | 16977/1000000 [10:07:14<2116:41:49, 7.75s/it, lr=1e-5, step_loss=0.33][RANK-0]: Step: [16977], local_loss=0.9766656756401062, train_loss=0.21946586668491364, time_cost=2.467515468597412
+
Steps: 2%|▏ | 16977/1000000 [10:07:14<2116:41:49, 7.75s/it, lr=1e-5, step_loss=0.977]
Steps: 2%|▏ | 16978/1000000 [10:07:19<1891:07:40, 6.93s/it, lr=1e-5, step_loss=0.977][RANK-0]: Step: [16978], local_loss=0.04031319543719292, train_loss=0.15366405248641968, time_cost=4.151828289031982
+
Steps: 2%|▏ | 16978/1000000 [10:07:19<1891:07:40, 6.93s/it, lr=1e-5, step_loss=0.0403]
Steps: 2%|▏ | 16979/1000000 [10:07:33<2510:38:40, 9.19s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [16979], local_loss=0.03634532541036606, train_loss=0.022054173052310944, time_cost=6.629252910614014
+
Steps: 2%|▏ | 16979/1000000 [10:07:33<2510:38:40, 9.19s/it, lr=1e-5, step_loss=0.0363]
Steps: 2%|▏ | 16980/1000000 [10:07:39<2244:40:08, 8.22s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [16980], local_loss=0.006145508028566837, train_loss=0.01978078857064247, time_cost=4.252540588378906
+
Steps: 2%|▏ | 16980/1000000 [10:07:39<2244:40:08, 8.22s/it, lr=1e-5, step_loss=0.00615]
Steps: 2%|▏ | 16981/1000000 [10:07:54<2747:44:49, 10.06s/it, lr=1e-5, step_loss=0.00615][RANK-0]: Step: [16981], local_loss=0.027203481644392014, train_loss=0.02567155286669731, time_cost=6.201105356216431
+
Steps: 2%|▏ | 16981/1000000 [10:07:54<2747:44:49, 10.06s/it, lr=1e-5, step_loss=0.0272]
Steps: 2%|▏ | 16982/1000000 [10:08:04<2760:33:30, 10.11s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [16982], local_loss=0.044068437069654465, train_loss=0.05490998923778534, time_cost=1.2907941341400146
+
Steps: 2%|▏ | 16982/1000000 [10:08:04<2760:33:30, 10.11s/it, lr=1e-5, step_loss=0.0441]
Steps: 2%|▏ | 16983/1000000 [10:08:15<2863:28:37, 10.49s/it, lr=1e-5, step_loss=0.0441][RANK-0]: Step: [16983], local_loss=0.06268516927957535, train_loss=0.030176108703017235, time_cost=1.9355123043060303
+
Steps: 2%|▏ | 16983/1000000 [10:08:15<2863:28:37, 10.49s/it, lr=1e-5, step_loss=0.0627]
Steps: 2%|▏ | 16984/1000000 [10:08:22<2587:00:11, 9.47s/it, lr=1e-5, step_loss=0.0627][RANK-0]: Step: [16984], local_loss=0.004665303509682417, train_loss=0.04780847579240799, time_cost=3.6116721630096436
+
Steps: 2%|▏ | 16984/1000000 [10:08:22<2587:00:11, 9.47s/it, lr=1e-5, step_loss=0.00467]
Steps: 2%|▏ | 16985/1000000 [10:08:33<2656:09:24, 9.73s/it, lr=1e-5, step_loss=0.00467][RANK-0]: Step: [16985], local_loss=0.007129145786166191, train_loss=0.15815570950508118, time_cost=3.704897403717041
+
Steps: 2%|▏ | 16985/1000000 [10:08:33<2656:09:24, 9.73s/it, lr=1e-5, step_loss=0.00713]
Steps: 2%|▏ | 16986/1000000 [10:08:43<2699:09:18, 9.88s/it, lr=1e-5, step_loss=0.00713][RANK-0]: Step: [16986], local_loss=0.009582838043570518, train_loss=0.026912033557891846, time_cost=1.5762689113616943
+
Steps: 2%|▏ | 16986/1000000 [10:08:43<2699:09:18, 9.88s/it, lr=1e-5, step_loss=0.00958]
Steps: 2%|▏ | 16987/1000000 [10:08:54<2759:40:46, 10.11s/it, lr=1e-5, step_loss=0.00958][RANK-0]: Step: [16987], local_loss=0.009001071564853191, train_loss=0.1354983150959015, time_cost=1.5422286987304688
+
Steps: 2%|▏ | 16987/1000000 [10:08:54<2759:40:46, 10.11s/it, lr=1e-5, step_loss=0.009]
Steps: 2%|▏ | 16988/1000000 [10:09:05<2871:36:39, 10.52s/it, lr=1e-5, step_loss=0.009][RANK-0]: Step: [16988], local_loss=0.005725904367864132, train_loss=0.03588515520095825, time_cost=8.303779363632202
+
Steps: 2%|▏ | 16988/1000000 [10:09:05<2871:36:39, 10.52s/it, lr=1e-5, step_loss=0.00573]
Steps: 2%|▏ | 16989/1000000 [10:09:11<2475:00:44, 9.06s/it, lr=1e-5, step_loss=0.00573][RANK-0]: Step: [16989], local_loss=0.06213373318314552, train_loss=0.024782374501228333, time_cost=2.888181209564209
+
Steps: 2%|▏ | 16989/1000000 [10:09:11<2475:00:44, 9.06s/it, lr=1e-5, step_loss=0.0621]
Steps: 2%|▏ | 16990/1000000 [10:09:17<2234:18:28, 8.18s/it, lr=1e-5, step_loss=0.0621][RANK-0]: Step: [16990], local_loss=0.06452228128910065, train_loss=0.046877019107341766, time_cost=1.7363412380218506
+
Steps: 2%|▏ | 16990/1000000 [10:09:17<2234:18:28, 8.18s/it, lr=1e-5, step_loss=0.0645]
Steps: 2%|▏ | 16991/1000000 [10:09:27<2426:01:43, 8.88s/it, lr=1e-5, step_loss=0.0645][RANK-0]: Step: [16991], local_loss=0.0050713177770376205, train_loss=0.06623139977455139, time_cost=2.190692663192749
+
Steps: 2%|▏ | 16991/1000000 [10:09:27<2426:01:43, 8.88s/it, lr=1e-5, step_loss=0.00507]
Steps: 2%|▏ | 16992/1000000 [10:09:36<2375:27:52, 8.70s/it, lr=1e-5, step_loss=0.00507][RANK-0]: Step: [16992], local_loss=0.013284281827509403, train_loss=0.02337595820426941, time_cost=3.2945239543914795
+
Steps: 2%|▏ | 16992/1000000 [10:09:36<2375:27:52, 8.70s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 16993/1000000 [10:09:41<2099:48:30, 7.69s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [16993], local_loss=0.03885376453399658, train_loss=0.029813066124916077, time_cost=1.1939818859100342
+
Steps: 2%|▏ | 16993/1000000 [10:09:41<2099:48:30, 7.69s/it, lr=1e-5, step_loss=0.0389]
Steps: 2%|▏ | 16994/1000000 [10:09:54<2535:37:39, 9.29s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [16994], local_loss=0.013058827258646488, train_loss=0.021664034575223923, time_cost=5.589549541473389
+
Steps: 2%|▏ | 16994/1000000 [10:09:54<2535:37:39, 9.29s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 16995/1000000 [10:10:02<2406:08:55, 8.81s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [16995], local_loss=0.0395885705947876, train_loss=0.05782051756978035, time_cost=1.8647572994232178
+
Steps: 2%|▏ | 16995/1000000 [10:10:02<2406:08:55, 8.81s/it, lr=1e-5, step_loss=0.0396]
Steps: 2%|▏ | 16996/1000000 [10:10:06<2057:12:04, 7.53s/it, lr=1e-5, step_loss=0.0396][RANK-0]: Step: [16996], local_loss=0.03815441578626633, train_loss=0.1322508454322815, time_cost=1.7731428146362305
+
Steps: 2%|▏ | 16996/1000000 [10:10:06<2057:12:04, 7.53s/it, lr=1e-5, step_loss=0.0382]
Steps: 2%|▏ | 16997/1000000 [10:10:12<1878:23:24, 6.88s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [16997], local_loss=0.037505120038986206, train_loss=0.07899881899356842, time_cost=2.9516499042510986
+
Steps: 2%|▏ | 16997/1000000 [10:10:12<1878:23:24, 6.88s/it, lr=1e-5, step_loss=0.0375]
Steps: 2%|▏ | 16998/1000000 [10:10:22<2132:51:59, 7.81s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [16998], local_loss=0.028437847271561623, train_loss=0.057584136724472046, time_cost=1.5583269596099854
+
Steps: 2%|▏ | 16998/1000000 [10:10:22<2132:51:59, 7.81s/it, lr=1e-5, step_loss=0.0284]
Steps: 2%|▏ | 16999/1000000 [10:10:35<2562:31:36, 9.38s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [16999], local_loss=0.017332209274172783, train_loss=0.01810310035943985, time_cost=1.2047786712646484
+
Steps: 2%|▏ | 16999/1000000 [10:10:35<2562:31:36, 9.38s/it, lr=1e-5, step_loss=0.0173]
Steps: 2%|▏ | 17000/1000000 [10:10:39<2164:13:51, 7.93s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [17000], local_loss=0.010467816144227982, train_loss=0.025796040892601013, time_cost=2.458712577819824
+09/18/2024 19:34:42 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1/checkpoint-17000
+09/18/2024 19:34:42 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-18 19:34:42,630] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-18 19:34:42,659] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-18 19:34:42,660] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 19:35:00,395] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:00,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-18 19:35:35,415] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:35,415] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:35,415] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:35,591] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:35,591] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:35,591] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:35,686] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:35,686] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:35,687] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:36,151] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:36,210] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:36,211] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:36,280] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:36,280] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:36,281] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:36,462] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:36,463] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:36,463] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:36,608] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:36,608] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:36,608] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-18 19:35:36,620] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-18 19:35:36,620] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-18 19:35:36,620] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/18/2024 19:35:36 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/pytorch_model
+{'norm_num_groups', 'dropout', 'use_additional_conditions'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/model/diffusion_pytorch_model.safetensors
+09/18/2024 19:36:39 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/scheduler.bin
+09/18/2024 19:36:39 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/sampler.bin
+09/18/2024 19:36:39 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-17000/random_states_0.pkl
+09/18/2024 19:36:39 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1/checkpoint-17000
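[Editor's note, not part of the log: the block above is the output of accelerate's Accelerator.save_state(), which writes the DeepSpeed model and ZeRO optimizer shards, scheduler, dataloader sampler, and RNG states in sequence. A minimal sketch of the calling pattern, assuming illustrative names (save_checkpoint, output_dir, global_step) rather than the actual training-script code:]

import os
from accelerate import Accelerator

def save_checkpoint(accelerator: Accelerator, output_dir: str, global_step: int) -> str:
    # Produces a directory like .../runs/allinpaint_stage1/checkpoint-17000
    save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
    # Writes model/optimizer shards, scheduler.bin, sampler.bin, random_states_*.pkl
    accelerator.save_state(save_path)
    return save_path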
+
Steps: 2%|▏ | 17000/1000000 [10:12:36<2164:13:51, 7.93s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 17001/1000000 [10:12:43<11624:03:22, 42.57s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [17001], local_loss=0.16729380190372467, train_loss=0.040026433765888214, time_cost=1.2185585498809814
+
Steps: 2%|▏ | 17001/1000000 [10:12:43<11624:03:22, 42.57s/it, lr=1e-5, step_loss=0.167]
Steps: 2%|▏ | 17002/1000000 [10:12:50<8776:02:22, 32.14s/it, lr=1e-5, step_loss=0.167] [RANK-0]: Step: [17002], local_loss=0.014310137368738651, train_loss=0.03296928480267525, time_cost=3.7338662147521973
+
Steps: 2%|▏ | 17002/1000000 [10:12:50<8776:02:22, 32.14s/it, lr=1e-5, step_loss=0.0143]
Steps: 2%|▏ | 17003/1000000 [10:13:02<7094:58:08, 25.98s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [17003], local_loss=0.3650026321411133, train_loss=0.07945461571216583, time_cost=2.9234795570373535
+
Steps: 2%|▏ | 17003/1000000 [10:13:02<7094:58:08, 25.98s/it, lr=1e-5, step_loss=0.365]
Steps: 2%|▏ | 17004/1000000 [10:13:16<6132:17:52, 22.46s/it, lr=1e-5, step_loss=0.365][RANK-0]: Step: [17004], local_loss=0.024287329986691475, train_loss=0.02627602592110634, time_cost=6.571466445922852
+
Steps: 2%|▏ | 17004/1000000 [10:13:16<6132:17:52, 22.46s/it, lr=1e-5, step_loss=0.0243]
Steps: 2%|▏ | 17005/1000000 [10:13:29<5295:28:43, 19.39s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [17005], local_loss=0.0056488998234272, train_loss=0.008736910298466682, time_cost=5.432394981384277
+
Steps: 2%|▏ | 17005/1000000 [10:13:29<5295:28:43, 19.39s/it, lr=1e-5, step_loss=0.00565]
Steps: 2%|▏ | 17006/1000000 [10:13:43<4891:30:46, 17.91s/it, lr=1e-5, step_loss=0.00565][RANK-0]: Step: [17006], local_loss=0.24337923526763916, train_loss=5.432071685791016, time_cost=7.699734449386597
+
Steps: 2%|▏ | 17006/1000000 [10:13:43<4891:30:46, 17.91s/it, lr=1e-5, step_loss=0.243]
Steps: 2%|▏ | 17007/1000000 [10:13:58<4628:31:06, 16.95s/it, lr=1e-5, step_loss=0.243][RANK-0]: Step: [17007], local_loss=0.007492171134799719, train_loss=0.021486900746822357, time_cost=2.821817398071289
+
Steps: 2%|▏ | 17007/1000000 [10:13:58<4628:31:06, 16.95s/it, lr=1e-5, step_loss=0.00749]
Steps: 2%|▏ | 17008/1000000 [10:14:05<3803:43:57, 13.93s/it, lr=1e-5, step_loss=0.00749][RANK-0]: Step: [17008], local_loss=0.01994553580880165, train_loss=0.019243977963924408, time_cost=2.849259853363037
+
Steps: 2%|▏ | 17008/1000000 [10:14:05<3803:43:57, 13.93s/it, lr=1e-5, step_loss=0.0199]
Steps: 2%|▏ | 17009/1000000 [10:14:18<3774:33:53, 13.82s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [17009], local_loss=0.05582321435213089, train_loss=0.021249083802103996, time_cost=4.934828519821167
+
Steps: 2%|▏ | 17009/1000000 [10:14:18<3774:33:53, 13.82s/it, lr=1e-5, step_loss=0.0558]
Steps: 2%|▏ | 17010/1000000 [10:14:23<3050:47:55, 11.17s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [17010], local_loss=0.009395998902618885, train_loss=0.026475384831428528, time_cost=2.6838924884796143
+
Steps: 2%|▏ | 17010/1000000 [10:14:23<3050:47:55, 11.17s/it, lr=1e-5, step_loss=0.0094]
Steps: 2%|▏ | 17011/1000000 [10:14:32<2843:05:36, 10.41s/it, lr=1e-5, step_loss=0.0094][RANK-0]: Step: [17011], local_loss=0.07386559247970581, train_loss=0.04007510468363762, time_cost=1.9882135391235352
+
Steps: 2%|▏ | 17011/1000000 [10:14:32<2843:05:36, 10.41s/it, lr=1e-5, step_loss=0.0739]
Steps: 2%|▏ | 17012/1000000 [10:14:39<2553:04:56, 9.35s/it, lr=1e-5, step_loss=0.0739][RANK-0]: Step: [17012], local_loss=0.10174945741891861, train_loss=0.072031170129776, time_cost=2.6873836517333984
+
Steps: 2%|▏ | 17012/1000000 [10:14:39<2553:04:56, 9.35s/it, lr=1e-5, step_loss=0.102]
Steps: 2%|▏ | 17013/1000000 [10:14:50<2693:51:41, 9.87s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [17013], local_loss=0.009217701852321625, train_loss=0.04372020810842514, time_cost=1.8832461833953857
+
Steps: 2%|▏ | 17013/1000000 [10:14:50<2693:51:41, 9.87s/it, lr=1e-5, step_loss=0.00922]
Steps: 2%|▏ | 17014/1000000 [10:14:56<2365:40:40, 8.66s/it, lr=1e-5, step_loss=0.00922][RANK-0]: Step: [17014], local_loss=0.008174596354365349, train_loss=0.03054795227944851, time_cost=3.1648924350738525
+
Steps: 2%|▏ | 17014/1000000 [10:14:56<2365:40:40, 8.66s/it, lr=1e-5, step_loss=0.00817]
Steps: 2%|▏ | 17015/1000000 [10:15:00<2009:28:37, 7.36s/it, lr=1e-5, step_loss=0.00817][RANK-0]: Step: [17015], local_loss=0.04274656996130943, train_loss=0.06954493373632431, time_cost=3.2361862659454346
+
Steps: 2%|▏ | 17015/1000000 [10:15:00<2009:28:37, 7.36s/it, lr=1e-5, step_loss=0.0427]
Steps: 2%|▏ | 17016/1000000 [10:15:05<1784:37:55, 6.54s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [17016], local_loss=0.08971323072910309, train_loss=0.04827580228447914, time_cost=1.9536457061767578
+
Steps: 2%|▏ | 17016/1000000 [10:15:05<1784:37:55, 6.54s/it, lr=1e-5, step_loss=0.0897]
Steps: 2%|▏ | 17017/1000000 [10:15:13<1956:08:41, 7.16s/it, lr=1e-5, step_loss=0.0897][RANK-0]: Step: [17017], local_loss=0.01414811797440052, train_loss=0.040951184928417206, time_cost=2.0652544498443604
+
Steps: 2%|▏ | 17017/1000000 [10:15:13<1956:08:41, 7.16s/it, lr=1e-5, step_loss=0.0141]
Steps: 2%|▏ | 17018/1000000 [10:15:23<2161:38:17, 7.92s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [17018], local_loss=0.020506415516138077, train_loss=0.03863615542650223, time_cost=4.838881254196167
+
Steps: 2%|▏ | 17018/1000000 [10:15:23<2161:38:17, 7.92s/it, lr=1e-5, step_loss=0.0205]
Steps: 2%|▏ | 17019/1000000 [10:15:36<2609:44:36, 9.56s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [17019], local_loss=0.004572460427880287, train_loss=0.01665356755256653, time_cost=1.839611530303955
+
Steps: 2%|▏ | 17019/1000000 [10:15:36<2609:44:36, 9.56s/it, lr=1e-5, step_loss=0.00457]
Steps: 2%|▏ | 17020/1000000 [10:15:43<2345:40:41, 8.59s/it, lr=1e-5, step_loss=0.00457][RANK-0]: Step: [17020], local_loss=0.03745537996292114, train_loss=0.023091644048690796, time_cost=1.9580888748168945
+
Steps: 2%|▏ | 17020/1000000 [10:15:43<2345:40:41, 8.59s/it, lr=1e-5, step_loss=0.0375]
Steps: 2%|▏ | 17021/1000000 [10:15:49<2207:24:03, 8.08s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [17021], local_loss=0.0065335496328771114, train_loss=0.07693533599376678, time_cost=1.572727918624878
+
Steps: 2%|▏ | 17021/1000000 [10:15:49<2207:24:03, 8.08s/it, lr=1e-5, step_loss=0.00653]
Steps: 2%|▏ | 17022/1000000 [10:15:55<1960:43:31, 7.18s/it, lr=1e-5, step_loss=0.00653][RANK-0]: Step: [17022], local_loss=0.004518147557973862, train_loss=0.025952113792300224, time_cost=2.3593649864196777
+
Steps: 2%|▏ | 17022/1000000 [10:15:55<1960:43:31, 7.18s/it, lr=1e-5, step_loss=0.00452]
Steps: 2%|▏ | 17023/1000000 [10:16:04<2110:36:02, 7.73s/it, lr=1e-5, step_loss=0.00452][RANK-0]: Step: [17023], local_loss=0.02569216676056385, train_loss=0.14195960760116577, time_cost=3.052978992462158
+
Steps: 2%|▏ | 17023/1000000 [10:16:04<2110:36:02, 7.73s/it, lr=1e-5, step_loss=0.0257]
Steps: 2%|▏ | 17024/1000000 [10:16:08<1868:30:54, 6.84s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [17024], local_loss=0.01020748633891344, train_loss=0.038323014974594116, time_cost=2.0481576919555664
+
Steps: 2%|▏ | 17024/1000000 [10:16:08<1868:30:54, 6.84s/it, lr=1e-5, step_loss=0.0102]
Steps: 2%|▏ | 17025/1000000 [10:16:21<2369:14:39, 8.68s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [17025], local_loss=0.05221980810165405, train_loss=0.044524431228637695, time_cost=2.909327507019043
+
Steps: 2%|▏ | 17025/1000000 [10:16:21<2369:14:39, 8.68s/it, lr=1e-5, step_loss=0.0522]
Steps: 2%|▏ | 17026/1000000 [10:16:27<2137:37:37, 7.83s/it, lr=1e-5, step_loss=0.0522][RANK-0]: Step: [17026], local_loss=0.010027221404016018, train_loss=0.02155633457005024, time_cost=2.048107147216797
+
Steps: 2%|▏ | 17026/1000000 [10:16:27<2137:37:37, 7.83s/it, lr=1e-5, step_loss=0.01]
Steps: 2%|▏ | 17027/1000000 [10:16:37<2326:46:28, 8.52s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [17027], local_loss=0.01682356372475624, train_loss=0.049988530576229095, time_cost=4.816103458404541
+
Steps: 2%|▏ | 17027/1000000 [10:16:37<2326:46:28, 8.52s/it, lr=1e-5, step_loss=0.0168]
Steps: 2%|▏ | 17028/1000000 [10:16:42<2043:21:30, 7.48s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [17028], local_loss=0.13665904104709625, train_loss=0.04363662004470825, time_cost=2.0197064876556396
+
Steps: 2%|▏ | 17028/1000000 [10:16:42<2043:21:30, 7.48s/it, lr=1e-5, step_loss=0.137]
Steps: 2%|▏ | 17029/1000000 [10:16:49<1988:24:49, 7.28s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [17029], local_loss=0.03544684126973152, train_loss=0.08063122630119324, time_cost=2.223586320877075
+
Steps: 2%|▏ | 17029/1000000 [10:16:49<1988:24:49, 7.28s/it, lr=1e-5, step_loss=0.0354]
Steps: 2%|▏ | 17030/1000000 [10:16:54<1811:28:15, 6.63s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [17030], local_loss=0.03812282159924507, train_loss=0.0251470897346735, time_cost=2.120861530303955
+
Steps: 2%|▏ | 17030/1000000 [10:16:54<1811:28:15, 6.63s/it, lr=1e-5, step_loss=0.0381]
Steps: 2%|▏ | 17031/1000000 [10:16:59<1649:56:44, 6.04s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [17031], local_loss=0.008001661859452724, train_loss=0.09258247911930084, time_cost=1.8013203144073486
+
Steps: 2%|▏ | 17031/1000000 [10:16:59<1649:56:44, 6.04s/it, lr=1e-5, step_loss=0.008]
Steps: 2%|▏ | 17032/1000000 [10:17:13<2321:50:49, 8.50s/it, lr=1e-5, step_loss=0.008][RANK-0]: Step: [17032], local_loss=0.021294081583619118, train_loss=0.07609344273805618, time_cost=5.155463457107544
+
Steps: 2%|▏ | 17032/1000000 [10:17:13<2321:50:49, 8.50s/it, lr=1e-5, step_loss=0.0213]
Steps: 2%|▏ | 17033/1000000 [10:17:19<2107:24:37, 7.72s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [17033], local_loss=0.008264526724815369, train_loss=0.0216110497713089, time_cost=1.667039155960083
+
Steps: 2%|▏ | 17033/1000000 [10:17:19<2107:24:37, 7.72s/it, lr=1e-5, step_loss=0.00826]
Steps: 2%|▏ | 17034/1000000 [10:17:30<2400:02:51, 8.79s/it, lr=1e-5, step_loss=0.00826][RANK-0]: Step: [17034], local_loss=0.02181442454457283, train_loss=0.0950622707605362, time_cost=3.1636316776275635
+
Steps: 2%|▏ | 17034/1000000 [10:17:30<2400:02:51, 8.79s/it, lr=1e-5, step_loss=0.0218]
Steps: 2%|▏ | 17035/1000000 [10:17:40<2443:45:03, 8.95s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [17035], local_loss=0.06964204460382462, train_loss=0.01811233162879944, time_cost=2.1837499141693115
+
Steps: 2%|▏ | 17035/1000000 [10:17:40<2443:45:03, 8.95s/it, lr=1e-5, step_loss=0.0696]
Steps: 2%|▏ | 17036/1000000 [10:17:50<2580:50:57, 9.45s/it, lr=1e-5, step_loss=0.0696][RANK-0]: Step: [17036], local_loss=0.008123046718537807, train_loss=0.03549238294363022, time_cost=1.5178306102752686
+
Steps: 2%|▏ | 17036/1000000 [10:17:50<2580:50:57, 9.45s/it, lr=1e-5, step_loss=0.00812]
Steps: 2%|▏ | 17037/1000000 [10:17:56<2282:11:33, 8.36s/it, lr=1e-5, step_loss=0.00812][RANK-0]: Step: [17037], local_loss=0.011182689107954502, train_loss=0.023621022701263428, time_cost=1.7580585479736328
+
Steps: 2%|▏ | 17037/1000000 [10:17:56<2282:11:33, 8.36s/it, lr=1e-5, step_loss=0.0112]
Steps: 2%|▏ | 17038/1000000 [10:18:01<1966:50:01, 7.20s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [17038], local_loss=0.023985425010323524, train_loss=0.04145105928182602, time_cost=1.8924241065979004
+
Steps: 2%|▏ | 17038/1000000 [10:18:01<1966:50:01, 7.20s/it, lr=1e-5, step_loss=0.024]
Steps: 2%|▏ | 17039/1000000 [10:18:13<2430:08:41, 8.90s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [17039], local_loss=0.00828513503074646, train_loss=0.03431126847863197, time_cost=4.703451633453369
+
Steps: 2%|▏ | 17039/1000000 [10:18:13<2430:08:41, 8.90s/it, lr=1e-5, step_loss=0.00829]
Steps: 2%|▏ | 17040/1000000 [10:18:18<2065:09:22, 7.56s/it, lr=1e-5, step_loss=0.00829][RANK-0]: Step: [17040], local_loss=217.28854370117188, train_loss=27.167818069458008, time_cost=1.4279425144195557
+
Steps: 2%|▏ | 17040/1000000 [10:18:18<2065:09:22, 7.56s/it, lr=1e-5, step_loss=217]
Steps: 2%|▏ | 17041/1000000 [10:18:23<1829:02:12, 6.70s/it, lr=1e-5, step_loss=217][RANK-0]: Step: [17041], local_loss=0.23120467364788055, train_loss=0.056637972593307495, time_cost=4.049082517623901
+
Steps: 2%|▏ | 17041/1000000 [10:18:23<1829:02:12, 6.70s/it, lr=1e-5, step_loss=0.231]
Steps: 2%|▏ | 17042/1000000 [10:18:32<2083:48:17, 7.63s/it, lr=1e-5, step_loss=0.231][RANK-0]: Step: [17042], local_loss=0.018355675041675568, train_loss=0.06239573284983635, time_cost=1.2126598358154297
+
Steps: 2%|▏ | 17042/1000000 [10:18:32<2083:48:17, 7.63s/it, lr=1e-5, step_loss=0.0184]
Steps: 2%|▏ | 17043/1000000 [10:18:38<1931:11:40, 7.07s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [17043], local_loss=0.012962582521140575, train_loss=0.035104572772979736, time_cost=2.861264944076538
+
Steps: 2%|▏ | 17043/1000000 [10:18:38<1931:11:40, 7.07s/it, lr=1e-5, step_loss=0.013]
Steps: 2%|▏ | 17044/1000000 [10:18:45<1904:14:09, 6.97s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [17044], local_loss=0.01475436333566904, train_loss=0.05589974299073219, time_cost=1.1971018314361572
+
Steps: 2%|▏ | 17044/1000000 [10:18:45<1904:14:09, 6.97s/it, lr=1e-5, step_loss=0.0148]
Steps: 2%|▏ | 17045/1000000 [10:18:55<2180:27:52, 7.99s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [17045], local_loss=0.0037623175885528326, train_loss=0.02047348953783512, time_cost=8.155667066574097
+
Steps: 2%|▏ | 17045/1000000 [10:18:55<2180:27:52, 7.99s/it, lr=1e-5, step_loss=0.00376]
Steps: 2%|▏ | 17046/1000000 [10:19:06<2425:17:59, 8.88s/it, lr=1e-5, step_loss=0.00376][RANK-0]: Step: [17046], local_loss=0.0913219004869461, train_loss=0.05411882326006889, time_cost=9.211600303649902
+
Steps: 2%|▏ | 17046/1000000 [10:19:06<2425:17:59, 8.88s/it, lr=1e-5, step_loss=0.0913]
Steps: 2%|▏ | 17047/1000000 [10:19:12<2187:16:17, 8.01s/it, lr=1e-5, step_loss=0.0913][RANK-0]: Step: [17047], local_loss=0.010656215250492096, train_loss=0.05836912989616394, time_cost=2.320749521255493
+
Steps: 2%|▏ | 17047/1000000 [10:19:12<2187:16:17, 8.01s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 17048/1000000 [10:19:27<2750:54:44, 10.08s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [17048], local_loss=0.027633780613541603, train_loss=0.034164972603321075, time_cost=5.011379241943359
+
Steps: 2%|▏ | 17048/1000000 [10:19:27<2750:54:44, 10.08s/it, lr=1e-5, step_loss=0.0276]
Steps: 2%|▏ | 17049/1000000 [10:19:38<2806:29:35, 10.28s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [17049], local_loss=0.013317900709807873, train_loss=0.018373243510723114, time_cost=3.491133689880371
+
Steps: 2%|▏ | 17049/1000000 [10:19:38<2806:29:35, 10.28s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 17050/1000000 [10:19:44<2458:51:44, 9.01s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [17050], local_loss=0.010539736598730087, train_loss=0.026859130710363388, time_cost=1.706376314163208
+
Steps: 2%|▏ | 17050/1000000 [10:19:44<2458:51:44, 9.01s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 17051/1000000 [10:19:55<2612:11:01, 9.57s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [17051], local_loss=0.013653445057570934, train_loss=0.04327967390418053, time_cost=3.1823229789733887
+
Steps: 2%|▏ | 17051/1000000 [10:19:55<2612:11:01, 9.57s/it, lr=1e-5, step_loss=0.0137]
Steps: 2%|▏ | 17052/1000000 [10:20:04<2575:51:56, 9.43s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [17052], local_loss=0.02377752587199211, train_loss=0.016080014407634735, time_cost=2.3989760875701904
+
Steps: 2%|▏ | 17052/1000000 [10:20:04<2575:51:56, 9.43s/it, lr=1e-5, step_loss=0.0238]
Steps: 2%|▏ | 17053/1000000 [10:20:09<2247:29:25, 8.23s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [17053], local_loss=0.007768424227833748, train_loss=0.03281796723604202, time_cost=1.1913602352142334
+
Steps: 2%|▏ | 17053/1000000 [10:20:09<2247:29:25, 8.23s/it, lr=1e-5, step_loss=0.00777]
Steps: 2%|▏ | 17054/1000000 [10:20:15<2050:50:04, 7.51s/it, lr=1e-5, step_loss=0.00777][RANK-0]: Step: [17054], local_loss=0.056104376912117004, train_loss=0.12552843987941742, time_cost=1.2372794151306152
+
Steps: 2%|▏ | 17054/1000000 [10:20:15<2050:50:04, 7.51s/it, lr=1e-5, step_loss=0.0561]
Steps: 2%|▏ | 17055/1000000 [10:20:19<1791:02:20, 6.56s/it, lr=1e-5, step_loss=0.0561][RANK-0]: Step: [17055], local_loss=0.004002873320132494, train_loss=0.03742249682545662, time_cost=1.3046696186065674
+
Steps: 2%|▏ | 17055/1000000 [10:20:19<1791:02:20, 6.56s/it, lr=1e-5, step_loss=0.004]
Steps: 2%|▏ | 17056/1000000 [10:20:25<1744:55:59, 6.39s/it, lr=1e-5, step_loss=0.004][RANK-0]: Step: [17056], local_loss=0.006754414178431034, train_loss=0.030748941004276276, time_cost=1.805302381515503
+
Steps: 2%|▏ | 17056/1000000 [10:20:25<1744:55:59, 6.39s/it, lr=1e-5, step_loss=0.00675]
Steps: 2%|▏ | 17057/1000000 [10:20:32<1787:01:50, 6.54s/it, lr=1e-5, step_loss=0.00675][RANK-0]: Step: [17057], local_loss=0.0045179040171206, train_loss=0.027827220037579536, time_cost=1.6811225414276123
+
Steps: 2%|▏ | 17057/1000000 [10:20:32<1787:01:50, 6.54s/it, lr=1e-5, step_loss=0.00452]
Steps: 2%|▏ | 17058/1000000 [10:20:46<2400:53:43, 8.79s/it, lr=1e-5, step_loss=0.00452][RANK-0]: Step: [17058], local_loss=0.027017006650567055, train_loss=0.0343392938375473, time_cost=1.2097713947296143
+
Steps: 2%|▏ | 17058/1000000 [10:20:46<2400:53:43, 8.79s/it, lr=1e-5, step_loss=0.027]
Steps: 2%|▏ | 17059/1000000 [10:20:56<2435:58:33, 8.92s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [17059], local_loss=0.02262757159769535, train_loss=0.04767153784632683, time_cost=1.820549726486206
+
Steps: 2%|▏ | 17059/1000000 [10:20:56<2435:58:33, 8.92s/it, lr=1e-5, step_loss=0.0226]
Steps: 2%|▏ | 17060/1000000 [10:21:09<2779:51:13, 10.18s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [17060], local_loss=0.026640478521585464, train_loss=0.04087529331445694, time_cost=4.051220417022705
+
Steps: 2%|▏ | 17060/1000000 [10:21:09<2779:51:13, 10.18s/it, lr=1e-5, step_loss=0.0266]
Steps: 2%|▏ | 17061/1000000 [10:21:18<2678:31:10, 9.81s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [17061], local_loss=0.01843699999153614, train_loss=0.012620381079614162, time_cost=3.2662792205810547
+
Steps: 2%|▏ | 17061/1000000 [10:21:18<2678:31:10, 9.81s/it, lr=1e-5, step_loss=0.0184]
Steps: 2%|▏ | 17062/1000000 [10:21:23<2288:10:12, 8.38s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [17062], local_loss=0.011787229217588902, train_loss=0.01749703288078308, time_cost=2.136343240737915
+
Steps: 2%|▏ | 17062/1000000 [10:21:23<2288:10:12, 8.38s/it, lr=1e-5, step_loss=0.0118]
Steps: 2%|▏ | 17063/1000000 [10:21:33<2402:52:52, 8.80s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [17063], local_loss=0.006451945751905441, train_loss=0.06308436393737793, time_cost=1.2171967029571533
+
Steps: 2%|▏ | 17063/1000000 [10:21:33<2402:52:52, 8.80s/it, lr=1e-5, step_loss=0.00645]
Steps: 2%|▏ | 17064/1000000 [10:21:38<2106:03:38, 7.71s/it, lr=1e-5, step_loss=0.00645][RANK-0]: Step: [17064], local_loss=0.05452749505639076, train_loss=0.06825768947601318, time_cost=2.837646961212158
+
Steps: 2%|▏ | 17064/1000000 [10:21:38<2106:03:38, 7.71s/it, lr=1e-5, step_loss=0.0545]
Steps: 2%|▏ | 17065/1000000 [10:21:51<2579:52:04, 9.45s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [17065], local_loss=0.05535423010587692, train_loss=0.045638810843229294, time_cost=5.069146156311035
+
Steps: 2%|▏ | 17065/1000000 [10:21:51<2579:52:04, 9.45s/it, lr=1e-5, step_loss=0.0554]
Steps: 2%|▏ | 17066/1000000 [10:21:57<2243:24:03, 8.22s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [17066], local_loss=0.027827154844999313, train_loss=0.024214791133999825, time_cost=3.4502508640289307
+
Steps: 2%|▏ | 17066/1000000 [10:21:57<2243:24:03, 8.22s/it, lr=1e-5, step_loss=0.0278]
Steps: 2%|▏ | 17067/1000000 [10:22:10<2683:47:02, 9.83s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [17067], local_loss=0.061876922845840454, train_loss=0.028765130788087845, time_cost=4.003878831863403
+
Steps: 2%|▏ | 17067/1000000 [10:22:10<2683:47:02, 9.83s/it, lr=1e-5, step_loss=0.0619]
Steps: 2%|▏ | 17068/1000000 [10:22:17<2465:41:49, 9.03s/it, lr=1e-5, step_loss=0.0619][RANK-0]: Step: [17068], local_loss=0.9953207969665527, train_loss=0.1452087014913559, time_cost=3.1157546043395996
+
Steps: 2%|▏ | 17068/1000000 [10:22:17<2465:41:49, 9.03s/it, lr=1e-5, step_loss=0.995]
Steps: 2%|▏ | 17069/1000000 [10:22:30<2798:11:04, 10.25s/it, lr=1e-5, step_loss=0.995][RANK-0]: Step: [17069], local_loss=0.007730227895081043, train_loss=0.03830719739198685, time_cost=5.172710180282593
+
Steps: 2%|▏ | 17069/1000000 [10:22:30<2798:11:04, 10.25s/it, lr=1e-5, step_loss=0.00773]
Steps: 2%|▏ | 17070/1000000 [10:22:38<2562:19:14, 9.38s/it, lr=1e-5, step_loss=0.00773][RANK-0]: Step: [17070], local_loss=0.006284655537456274, train_loss=0.048874299973249435, time_cost=2.9931273460388184
+
Steps: 2%|▏ | 17070/1000000 [10:22:38<2562:19:14, 9.38s/it, lr=1e-5, step_loss=0.00628]
Steps: 2%|▏ | 17071/1000000 [10:22:54<3130:49:55, 11.47s/it, lr=1e-5, step_loss=0.00628][RANK-0]: Step: [17071], local_loss=0.02051355130970478, train_loss=0.03242572396993637, time_cost=6.959258079528809
+
Steps: 2%|▏ | 17071/1000000 [10:22:54<3130:49:55, 11.47s/it, lr=1e-5, step_loss=0.0205]
Steps: 2%|▏ | 17072/1000000 [10:22:59<2599:12:58, 9.52s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [17072], local_loss=0.05499086529016495, train_loss=0.017716161906719208, time_cost=1.2321975231170654
+
Steps: 2%|▏ | 17072/1000000 [10:22:59<2599:12:58, 9.52s/it, lr=1e-5, step_loss=0.055]
Steps: 2%|▏ | 17073/1000000 [10:23:08<2516:59:29, 9.22s/it, lr=1e-5, step_loss=0.055][RANK-0]: Step: [17073], local_loss=0.004170609172433615, train_loss=0.048622552305459976, time_cost=2.2054989337921143
+
Steps: 2%|▏ | 17073/1000000 [10:23:08<2516:59:29, 9.22s/it, lr=1e-5, step_loss=0.00417]
Steps: 2%|▏ | 17074/1000000 [10:23:14<2273:28:11, 8.33s/it, lr=1e-5, step_loss=0.00417][RANK-0]: Step: [17074], local_loss=0.018853534013032913, train_loss=0.0272950641810894, time_cost=2.6289665699005127
+
Steps: 2%|▏ | 17074/1000000 [10:23:14<2273:28:11, 8.33s/it, lr=1e-5, step_loss=0.0189]
Steps: 2%|▏ | 17075/1000000 [10:23:29<2807:51:35, 10.28s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [17075], local_loss=0.011946111917495728, train_loss=0.029367543756961823, time_cost=5.694402456283569
+
Steps: 2%|▏ | 17075/1000000 [10:23:29<2807:51:35, 10.28s/it, lr=1e-5, step_loss=0.0119]
Steps: 2%|▏ | 17076/1000000 [10:23:36<2553:22:52, 9.35s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [17076], local_loss=0.030453503131866455, train_loss=0.013341376557946205, time_cost=3.3968088626861572
+
Steps: 2%|▏ | 17076/1000000 [10:23:36<2553:22:52, 9.35s/it, lr=1e-5, step_loss=0.0305]
Steps: 2%|▏ | 17077/1000000 [10:23:47<2667:06:37, 9.77s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [17077], local_loss=0.025900553911924362, train_loss=0.02443017065525055, time_cost=2.7009503841400146
+
Steps: 2%|▏ | 17077/1000000 [10:23:47<2667:06:37, 9.77s/it, lr=1e-5, step_loss=0.0259]
Steps: 2%|▏ | 17078/1000000 [10:24:00<2989:13:57, 10.95s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [17078], local_loss=0.016340207308530807, train_loss=0.05744945630431175, time_cost=4.242282152175903
+
Steps: 2%|▏ | 17078/1000000 [10:24:00<2989:13:57, 10.95s/it, lr=1e-5, step_loss=0.0163]
Steps: 2%|▏ | 17079/1000000 [10:24:14<3180:02:39, 11.65s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [17079], local_loss=0.034918349236249924, train_loss=0.018303124234080315, time_cost=5.919290065765381
+
Steps: 2%|▏ | 17079/1000000 [10:24:14<3180:02:39, 11.65s/it, lr=1e-5, step_loss=0.0349]
Steps: 2%|▏ | 17080/1000000 [10:24:21<2799:25:27, 10.25s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [17080], local_loss=0.022627482190728188, train_loss=0.02942259982228279, time_cost=1.7373194694519043
+
Steps: 2%|▏ | 17080/1000000 [10:24:21<2799:25:27, 10.25s/it, lr=1e-5, step_loss=0.0226]
Steps: 2%|▏ | 17081/1000000 [10:24:31<2849:56:31, 10.44s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [17081], local_loss=0.009917911142110825, train_loss=0.06406739354133606, time_cost=3.4599058628082275
+
Steps: 2%|▏ | 17081/1000000 [10:24:31<2849:56:31, 10.44s/it, lr=1e-5, step_loss=0.00992]
Steps: 2%|▏ | 17082/1000000 [10:24:36<2354:25:02, 8.62s/it, lr=1e-5, step_loss=0.00992][RANK-0]: Step: [17082], local_loss=0.062414880841970444, train_loss=0.026574524119496346, time_cost=1.9491181373596191
+
Steps: 2%|▏ | 17082/1000000 [10:24:36<2354:25:02, 8.62s/it, lr=1e-5, step_loss=0.0624]
Steps: 2%|▏ | 17083/1000000 [10:24:47<2565:50:58, 9.40s/it, lr=1e-5, step_loss=0.0624][RANK-0]: Step: [17083], local_loss=0.010323893278837204, train_loss=0.05153757333755493, time_cost=1.2712445259094238
+
Steps: 2%|▏ | 17083/1000000 [10:24:47<2565:50:58, 9.40s/it, lr=1e-5, step_loss=0.0103]
Steps: 2%|▏ | 17084/1000000 [10:24:53<2269:17:58, 8.31s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [17084], local_loss=0.025615325197577477, train_loss=0.019817005842924118, time_cost=3.1970255374908447
+
Steps: 2%|▏ | 17084/1000000 [10:24:53<2269:17:58, 8.31s/it, lr=1e-5, step_loss=0.0256]
Steps: 2%|▏ | 17085/1000000 [10:25:06<2627:56:53, 9.63s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [17085], local_loss=0.03696546331048012, train_loss=0.029480069875717163, time_cost=3.481982946395874
+
Steps: 2%|▏ | 17085/1000000 [10:25:06<2627:56:53, 9.63s/it, lr=1e-5, step_loss=0.037]
Steps: 2%|▏ | 17086/1000000 [10:25:13<2437:06:14, 8.93s/it, lr=1e-5, step_loss=0.037][RANK-0]: Step: [17086], local_loss=0.009769631549715996, train_loss=0.016736947000026703, time_cost=1.4460463523864746
+
Steps: 2%|▏ | 17086/1000000 [10:25:13<2437:06:14, 8.93s/it, lr=1e-5, step_loss=0.00977]
Steps: 2%|▏ | 17087/1000000 [10:25:18<2114:02:12, 7.74s/it, lr=1e-5, step_loss=0.00977][RANK-0]: Step: [17087], local_loss=0.016131140291690826, train_loss=0.02335335500538349, time_cost=1.243837833404541
+
Steps: 2%|▏ | 17087/1000000 [10:25:18<2114:02:12, 7.74s/it, lr=1e-5, step_loss=0.0161]
Steps: 2%|▏ | 17088/1000000 [10:25:29<2393:04:37, 8.76s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [17088], local_loss=0.06097079813480377, train_loss=0.07244427502155304, time_cost=2.4268717765808105
+
Steps: 2%|▏ | 17088/1000000 [10:25:29<2393:04:37, 8.76s/it, lr=1e-5, step_loss=0.061]
Steps: 2%|▏ | 17089/1000000 [10:25:34<2111:53:05, 7.73s/it, lr=1e-5, step_loss=0.061][RANK-0]: Step: [17089], local_loss=0.012160531245172024, train_loss=0.028521068394184113, time_cost=1.767665147781372
+
Steps: 2%|▏ | 17089/1000000 [10:25:34<2111:53:05, 7.73s/it, lr=1e-5, step_loss=0.0122]
Steps: 2%|▏ | 17090/1000000 [10:25:42<2085:32:12, 7.64s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [17090], local_loss=0.013313534669578075, train_loss=0.16352711617946625, time_cost=2.1236393451690674
+
Steps: 2%|▏ | 17090/1000000 [10:25:42<2085:32:12, 7.64s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 17091/1000000 [10:25:47<1928:54:18, 7.06s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [17091], local_loss=0.005564291961491108, train_loss=0.04193637892603874, time_cost=3.018935203552246
+
Steps: 2%|▏ | 17091/1000000 [10:25:47<1928:54:18, 7.06s/it, lr=1e-5, step_loss=0.00556]
Steps: 2%|▏ | 17092/1000000 [10:25:58<2180:12:20, 7.99s/it, lr=1e-5, step_loss=0.00556][RANK-0]: Step: [17092], local_loss=0.018645048141479492, train_loss=0.15143685042858124, time_cost=2.2827582359313965
+
Steps: 2%|▏ | 17092/1000000 [10:25:58<2180:12:20, 7.99s/it, lr=1e-5, step_loss=0.0186]
Steps: 2%|▏ | 17093/1000000 [10:26:07<2313:45:27, 8.47s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [17093], local_loss=0.010770758613944054, train_loss=0.014527013525366783, time_cost=7.342333078384399
+
Steps: 2%|▏ | 17093/1000000 [10:26:07<2313:45:27, 8.47s/it, lr=1e-5, step_loss=0.0108]
Steps: 2%|▏ | 17094/1000000 [10:26:14<2198:12:02, 8.05s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [17094], local_loss=0.030701909214258194, train_loss=0.05898623913526535, time_cost=1.479090929031372
+
Steps: 2%|▏ | 17094/1000000 [10:26:14<2198:12:02, 8.05s/it, lr=1e-5, step_loss=0.0307]
Steps: 2%|▏ | 17095/1000000 [10:26:19<1905:31:47, 6.98s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [17095], local_loss=0.007715380750596523, train_loss=0.07811996340751648, time_cost=1.5310094356536865
+
Steps: 2%|▏ | 17095/1000000 [10:26:19<1905:31:47, 6.98s/it, lr=1e-5, step_loss=0.00772]
Steps: 2%|▏ | 17096/1000000 [10:26:32<2388:57:01, 8.75s/it, lr=1e-5, step_loss=0.00772][RANK-0]: Step: [17096], local_loss=0.023798100650310516, train_loss=0.06823917478322983, time_cost=3.7151966094970703
+
Steps: 2%|▏ | 17096/1000000 [10:26:32<2388:57:01, 8.75s/it, lr=1e-5, step_loss=0.0238]
Steps: 2%|▏ | 17097/1000000 [10:26:39<2319:07:37, 8.49s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [17097], local_loss=0.013394953683018684, train_loss=0.04315163567662239, time_cost=1.4909603595733643
+
Steps: 2%|▏ | 17097/1000000 [10:26:39<2319:07:37, 8.49s/it, lr=1e-5, step_loss=0.0134]
Steps: 2%|▏ | 17098/1000000 [10:26:53<2722:44:52, 9.97s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [17098], local_loss=0.0168361384421587, train_loss=0.09085961431264877, time_cost=6.528135061264038
+
Steps: 2%|▏ | 17098/1000000 [10:26:53<2722:44:52, 9.97s/it, lr=1e-5, step_loss=0.0168]
Steps: 2%|▏ | 17099/1000000 [10:26:59<2395:13:34, 8.77s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [17099], local_loss=0.02224777452647686, train_loss=0.02013300731778145, time_cost=3.204885482788086
+
Steps: 2%|▏ | 17099/1000000 [10:26:59<2395:13:34, 8.77s/it, lr=1e-5, step_loss=0.0222]
Steps: 2%|▏ | 17100/1000000 [10:27:09<2524:20:28, 9.25s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [17100], local_loss=0.047431811690330505, train_loss=0.0389510914683342, time_cost=2.422595739364624
+
Steps: 2%|▏ | 17100/1000000 [10:27:09<2524:20:28, 9.25s/it, lr=1e-5, step_loss=0.0474]
Steps: 2%|▏ | 17101/1000000 [10:27:15<2227:45:47, 8.16s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [17101], local_loss=0.008371138013899326, train_loss=0.04870928078889847, time_cost=2.4199299812316895
+
Steps: 2%|▏ | 17101/1000000 [10:27:15<2227:45:47, 8.16s/it, lr=1e-5, step_loss=0.00837]
Steps: 2%|▏ | 17102/1000000 [10:27:25<2352:45:44, 8.62s/it, lr=1e-5, step_loss=0.00837][RANK-0]: Step: [17102], local_loss=0.008475162088871002, train_loss=0.0380842499434948, time_cost=3.5780279636383057
+
Steps: 2%|▏ | 17102/1000000 [10:27:25<2352:45:44, 8.62s/it, lr=1e-5, step_loss=0.00848]
Steps: 2%|▏ | 17103/1000000 [10:27:30<2075:29:42, 7.60s/it, lr=1e-5, step_loss=0.00848][RANK-0]: Step: [17103], local_loss=0.0033923794981092215, train_loss=0.011446196585893631, time_cost=1.684288501739502
+
Steps: 2%|▏ | 17103/1000000 [10:27:30<2075:29:42, 7.60s/it, lr=1e-5, step_loss=0.00339]
Steps: 2%|▏ | 17104/1000000 [10:27:41<2346:54:56, 8.60s/it, lr=1e-5, step_loss=0.00339][RANK-0]: Step: [17104], local_loss=0.015494677238166332, train_loss=0.06258359551429749, time_cost=1.8795270919799805
+
Steps: 2%|▏ | 17104/1000000 [10:27:41<2346:54:56, 8.60s/it, lr=1e-5, step_loss=0.0155]
Steps: 2%|▏ | 17105/1000000 [10:27:49<2320:24:05, 8.50s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [17105], local_loss=1.0119497776031494, train_loss=0.19561299681663513, time_cost=3.5167367458343506
+
Steps: 2%|▏ | 17105/1000000 [10:27:49<2320:24:05, 8.50s/it, lr=1e-5, step_loss=1.01]
Steps: 2%|▏ | 17106/1000000 [10:27:56<2195:23:03, 8.04s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [17106], local_loss=0.08052906394004822, train_loss=0.026947928592562675, time_cost=2.8517966270446777
+
Steps: 2%|▏ | 17106/1000000 [10:27:56<2195:23:03, 8.04s/it, lr=1e-5, step_loss=0.0805]
Steps: 2%|▏ | 17107/1000000 [10:28:04<2197:09:54, 8.05s/it, lr=1e-5, step_loss=0.0805][RANK-0]: Step: [17107], local_loss=0.029280779883265495, train_loss=0.02352266013622284, time_cost=1.7102937698364258
+
Steps: 2%|▏ | 17107/1000000 [10:28:04<2197:09:54, 8.05s/it, lr=1e-5, step_loss=0.0293]
Steps: 2%|▏ | 17108/1000000 [10:28:13<2300:31:27, 8.43s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [17108], local_loss=0.17559760808944702, train_loss=0.1052403599023819, time_cost=1.745103120803833
+
Steps: 2%|▏ | 17108/1000000 [10:28:13<2300:31:27, 8.43s/it, lr=1e-5, step_loss=0.176]
Steps: 2%|▏ | 17109/1000000 [10:28:20<2145:08:03, 7.86s/it, lr=1e-5, step_loss=0.176][RANK-0]: Step: [17109], local_loss=0.006674581207334995, train_loss=0.03349587693810463, time_cost=1.7437434196472168
+
Steps: 2%|▏ | 17109/1000000 [10:28:20<2145:08:03, 7.86s/it, lr=1e-5, step_loss=0.00667]
Steps: 2%|▏ | 17110/1000000 [10:28:26<1978:05:06, 7.25s/it, lr=1e-5, step_loss=0.00667][RANK-0]: Step: [17110], local_loss=0.03741637244820595, train_loss=21.119070053100586, time_cost=2.244131565093994
+
Steps: 2%|▏ | 17110/1000000 [10:28:26<1978:05:06, 7.25s/it, lr=1e-5, step_loss=0.0374]
Steps: 2%|▏ | 17111/1000000 [10:28:33<1981:31:35, 7.26s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [17111], local_loss=0.018283206969499588, train_loss=0.0573887974023819, time_cost=1.833521842956543
+
Steps: 2%|▏ | 17111/1000000 [10:28:33<1981:31:35, 7.26s/it, lr=1e-5, step_loss=0.0183]
Steps: 2%|▏ | 17112/1000000 [10:28:42<2154:49:43, 7.89s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [17112], local_loss=0.007564044091850519, train_loss=0.04200679808855057, time_cost=2.288390874862671
+
Steps: 2%|▏ | 17112/1000000 [10:28:42<2154:49:43, 7.89s/it, lr=1e-5, step_loss=0.00756]
Steps: 2%|▏ | 17113/1000000 [10:28:56<2627:55:04, 9.63s/it, lr=1e-5, step_loss=0.00756][RANK-0]: Step: [17113], local_loss=0.004767666570842266, train_loss=0.1422896832227707, time_cost=5.812268257141113
+
Steps: 2%|▏ | 17113/1000000 [10:28:56<2627:55:04, 9.63s/it, lr=1e-5, step_loss=0.00477]
Steps: 2%|▏ | 17114/1000000 [10:29:05<2556:31:13, 9.36s/it, lr=1e-5, step_loss=0.00477][RANK-0]: Step: [17114], local_loss=0.03426910564303398, train_loss=0.022020023316144943, time_cost=1.3018684387207031
+
Steps: 2%|▏ | 17114/1000000 [10:29:05<2556:31:13, 9.36s/it, lr=1e-5, step_loss=0.0343]
Steps: 2%|▏ | 17115/1000000 [10:29:11<2284:31:02, 8.37s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [17115], local_loss=0.010281167924404144, train_loss=0.042847033590078354, time_cost=2.8983922004699707
+
Steps: 2%|▏ | 17115/1000000 [10:29:11<2284:31:02, 8.37s/it, lr=1e-5, step_loss=0.0103]
Steps: 2%|▏ | 17116/1000000 [10:29:24<2669:38:04, 9.78s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [17116], local_loss=0.11659227311611176, train_loss=0.07768339663743973, time_cost=1.300832986831665
+
Steps: 2%|▏ | 17116/1000000 [10:29:24<2669:38:04, 9.78s/it, lr=1e-5, step_loss=0.117]
Steps: 2%|▏ | 17117/1000000 [10:29:34<2729:50:08, 10.00s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [17117], local_loss=0.05238935351371765, train_loss=0.04478283226490021, time_cost=5.267875909805298
+
Steps: 2%|▏ | 17117/1000000 [10:29:34<2729:50:08, 10.00s/it, lr=1e-5, step_loss=0.0524]
Steps: 2%|▏ | 17118/1000000 [10:29:39<2329:16:55, 8.53s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [17118], local_loss=0.07344929128885269, train_loss=0.02826605550944805, time_cost=2.723808765411377
+
Steps: 2%|▏ | 17118/1000000 [10:29:39<2329:16:55, 8.53s/it, lr=1e-5, step_loss=0.0734]
Steps: 2%|▏ | 17119/1000000 [10:29:44<2013:56:14, 7.38s/it, lr=1e-5, step_loss=0.0734][RANK-0]: Step: [17119], local_loss=0.006358189042657614, train_loss=0.0297941155731678, time_cost=2.3531789779663086
+
Steps: 2%|▏ | 17119/1000000 [10:29:44<2013:56:14, 7.38s/it, lr=1e-5, step_loss=0.00636]
Steps: 2%|▏ | 17120/1000000 [10:29:51<1961:44:18, 7.19s/it, lr=1e-5, step_loss=0.00636][RANK-0]: Step: [17120], local_loss=0.005625964142382145, train_loss=0.020977813750505447, time_cost=2.7232275009155273
+
Steps: 2%|▏ | 17120/1000000 [10:29:51<1961:44:18, 7.19s/it, lr=1e-5, step_loss=0.00563]
Steps: 2%|▏ | 17121/1000000 [10:30:02<2312:42:58, 8.47s/it, lr=1e-5, step_loss=0.00563][RANK-0]: Step: [17121], local_loss=0.003548691514879465, train_loss=0.035916902124881744, time_cost=4.457494258880615
+
Steps: 2%|▏ | 17121/1000000 [10:30:02<2312:42:58, 8.47s/it, lr=1e-5, step_loss=0.00355]
Steps: 2%|▏ | 17122/1000000 [10:30:21<3143:46:34, 11.51s/it, lr=1e-5, step_loss=0.00355][RANK-0]: Step: [17122], local_loss=0.02704804390668869, train_loss=0.040191344916820526, time_cost=15.11713719367981
+
Steps: 2%|▏ | 17122/1000000 [10:30:21<3143:46:34, 11.51s/it, lr=1e-5, step_loss=0.027]
Steps: 2%|▏ | 17123/1000000 [10:30:28<2783:30:04, 10.20s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [17123], local_loss=0.018052099272608757, train_loss=0.1473543494939804, time_cost=2.549639940261841
+
Steps: 2%|▏ | 17123/1000000 [10:30:28<2783:30:04, 10.20s/it, lr=1e-5, step_loss=0.0181]
Steps: 2%|▏ | 17124/1000000 [10:30:33<2355:32:51, 8.63s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [17124], local_loss=0.017276672646403313, train_loss=0.0424175001680851, time_cost=1.781468391418457
+
Steps: 2%|▏ | 17124/1000000 [10:30:33<2355:32:51, 8.63s/it, lr=1e-5, step_loss=0.0173]
Steps: 2%|▏ | 17125/1000000 [10:30:39<2173:59:45, 7.96s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [17125], local_loss=0.0121476324275136, train_loss=0.02822267636656761, time_cost=1.2953858375549316
+
Steps: 2%|▏ | 17125/1000000 [10:30:39<2173:59:45, 7.96s/it, lr=1e-5, step_loss=0.0121]
Steps: 2%|▏ | 17126/1000000 [10:30:50<2365:13:39, 8.66s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [17126], local_loss=0.10015150159597397, train_loss=0.07835046947002411, time_cost=1.3395969867706299
+
Steps: 2%|▏ | 17126/1000000 [10:30:50<2365:13:39, 8.66s/it, lr=1e-5, step_loss=0.1]
Steps: 2%|▏ | 17127/1000000 [10:30:56<2158:28:55, 7.91s/it, lr=1e-5, step_loss=0.1][RANK-0]: Step: [17127], local_loss=0.01068396121263504, train_loss=0.09895104169845581, time_cost=2.1416385173797607
+
Steps: 2%|▏ | 17127/1000000 [10:30:56<2158:28:55, 7.91s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 17128/1000000 [10:31:07<2426:56:00, 8.89s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [17128], local_loss=0.14021694660186768, train_loss=6.508214473724365, time_cost=2.259843111038208
+
Steps: 2%|▏ | 17128/1000000 [10:31:07<2426:56:00, 8.89s/it, lr=1e-5, step_loss=0.14]
Steps: 2%|▏ | 17129/1000000 [10:31:18<2592:47:53, 9.50s/it, lr=1e-5, step_loss=0.14][RANK-0]: Step: [17129], local_loss=0.007361278869211674, train_loss=0.012461073696613312, time_cost=2.998835802078247
+
Steps: 2%|▏ | 17129/1000000 [10:31:18<2592:47:53, 9.50s/it, lr=1e-5, step_loss=0.00736]
Steps: 2%|▏ | 17130/1000000 [10:31:30<2836:56:22, 10.39s/it, lr=1e-5, step_loss=0.00736][RANK-0]: Step: [17130], local_loss=0.003973466344177723, train_loss=0.011820184998214245, time_cost=10.372739553451538
+
Steps: 2%|▏ | 17130/1000000 [10:31:30<2836:56:22, 10.39s/it, lr=1e-5, step_loss=0.00397]
Steps: 2%|▏ | 17131/1000000 [10:31:40<2732:58:37, 10.01s/it, lr=1e-5, step_loss=0.00397][RANK-0]: Step: [17131], local_loss=0.02471802569925785, train_loss=0.15202587842941284, time_cost=2.0711405277252197
+
Steps: 2%|▏ | 17131/1000000 [10:31:40<2732:58:37, 10.01s/it, lr=1e-5, step_loss=0.0247]
Steps: 2%|▏ | 17132/1000000 [10:31:47<2512:28:14, 9.20s/it, lr=1e-5, step_loss=0.0247][RANK-0]: Step: [17132], local_loss=0.017853910103440285, train_loss=0.034645065665245056, time_cost=3.0714752674102783
+
Steps: 2%|▏ | 17132/1000000 [10:31:47<2512:28:14, 9.20s/it, lr=1e-5, step_loss=0.0179]
Steps: 2%|▏ | 17133/1000000 [10:31:54<2337:22:52, 8.56s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [17133], local_loss=0.015421630814671516, train_loss=0.0794316828250885, time_cost=5.3080384731292725
+
Steps: 2%|▏ | 17133/1000000 [10:31:54<2337:22:52, 8.56s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 17134/1000000 [10:31:59<2039:26:43, 7.47s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [17134], local_loss=0.036404091864824295, train_loss=0.03085879050195217, time_cost=3.8838295936584473
+
Steps: 2%|▏ | 17134/1000000 [10:31:59<2039:26:43, 7.47s/it, lr=1e-5, step_loss=0.0364]
Steps: 2%|▏ | 17135/1000000 [10:32:04<1838:47:52, 6.74s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [17135], local_loss=0.019062010571360588, train_loss=6.44851016998291, time_cost=1.9871718883514404
+
Steps: 2%|▏ | 17135/1000000 [10:32:04<1838:47:52, 6.74s/it, lr=1e-5, step_loss=0.0191]
Steps: 2%|▏ | 17136/1000000 [10:32:15<2224:34:42, 8.15s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [17136], local_loss=0.04590502753853798, train_loss=0.051072217524051666, time_cost=1.2245895862579346
+
Steps: 2%|▏ | 17136/1000000 [10:32:15<2224:34:42, 8.15s/it, lr=1e-5, step_loss=0.0459]
Steps: 2%|▏ | 17137/1000000 [10:32:20<1975:53:00, 7.24s/it, lr=1e-5, step_loss=0.0459][RANK-0]: Step: [17137], local_loss=0.010789016261696815, train_loss=0.026286941021680832, time_cost=2.151676654815674
+
Steps: 2%|▏ | 17137/1000000 [10:32:20<1975:53:00, 7.24s/it, lr=1e-5, step_loss=0.0108]
Steps: 2%|▏ | 17138/1000000 [10:32:29<2080:37:04, 7.62s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [17138], local_loss=0.03109882026910782, train_loss=0.030759019777178764, time_cost=1.7947866916656494
+
Steps: 2%|▏ | 17138/1000000 [10:32:29<2080:37:04, 7.62s/it, lr=1e-5, step_loss=0.0311]
Steps: 2%|▏ | 17139/1000000 [10:32:39<2314:57:43, 8.48s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [17139], local_loss=0.010712006129324436, train_loss=0.06345026940107346, time_cost=4.297187089920044
+
Steps: 2%|▏ | 17139/1000000 [10:32:39<2314:57:43, 8.48s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 17140/1000000 [10:32:46<2172:37:40, 7.96s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [17140], local_loss=0.04723839834332466, train_loss=0.07370564341545105, time_cost=3.1183364391326904
+
Steps: 2%|▏ | 17140/1000000 [10:32:46<2172:37:40, 7.96s/it, lr=1e-5, step_loss=0.0472]
Steps: 2%|▏ | 17141/1000000 [10:32:57<2384:57:29, 8.74s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [17141], local_loss=0.11890839785337448, train_loss=0.08284083008766174, time_cost=1.9019420146942139
+
Steps: 2%|▏ | 17141/1000000 [10:32:57<2384:57:29, 8.74s/it, lr=1e-5, step_loss=0.119]
Steps: 2%|▏ | 17142/1000000 [10:33:03<2159:33:20, 7.91s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [17142], local_loss=0.011283145286142826, train_loss=0.02681908570230007, time_cost=1.5543851852416992
+
Steps: 2%|▏ | 17142/1000000 [10:33:03<2159:33:20, 7.91s/it, lr=1e-5, step_loss=0.0113]
Steps: 2%|▏ | 17143/1000000 [10:33:09<1998:39:40, 7.32s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [17143], local_loss=0.010634428821504116, train_loss=0.0458163321018219, time_cost=2.174447536468506
+
Steps: 2%|▏ | 17143/1000000 [10:33:09<1998:39:40, 7.32s/it, lr=1e-5, step_loss=0.0106]
Steps: 2%|▏ | 17144/1000000 [10:33:13<1758:45:15, 6.44s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [17144], local_loss=0.006663517095148563, train_loss=0.15899938344955444, time_cost=1.4077284336090088
+
Steps: 2%|▏ | 17144/1000000 [10:33:13<1758:45:15, 6.44s/it, lr=1e-5, step_loss=0.00666]
Steps: 2%|▏ | 17145/1000000 [10:33:18<1651:26:04, 6.05s/it, lr=1e-5, step_loss=0.00666][RANK-0]: Step: [17145], local_loss=0.04465483874082565, train_loss=0.029335957020521164, time_cost=2.1370298862457275
+
Steps: 2%|▏ | 17145/1000000 [10:33:18<1651:26:04, 6.05s/it, lr=1e-5, step_loss=0.0447]
Steps: 2%|▏ | 17146/1000000 [10:33:28<1985:14:40, 7.27s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [17146], local_loss=0.01747104898095131, train_loss=0.02711649239063263, time_cost=3.2970056533813477
+
Steps: 2%|▏ | 17146/1000000 [10:33:28<1985:14:40, 7.27s/it, lr=1e-5, step_loss=0.0175]
Steps: 2%|▏ | 17147/1000000 [10:33:40<2325:27:35, 8.52s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [17147], local_loss=0.02131481282413006, train_loss=0.05192849785089493, time_cost=3.785869598388672
+
Steps: 2%|▏ | 17147/1000000 [10:33:40<2325:27:35, 8.52s/it, lr=1e-5, step_loss=0.0213]
Steps: 2%|▏ | 17148/1000000 [10:33:46<2117:43:25, 7.76s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [17148], local_loss=1.005928635597229, train_loss=0.13603658974170685, time_cost=1.66654372215271
+
Steps: 2%|▏ | 17148/1000000 [10:33:46<2117:43:25, 7.76s/it, lr=1e-5, step_loss=1.01]
Steps: 2%|▏ | 17149/1000000 [10:33:53<2091:42:50, 7.66s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [17149], local_loss=0.054496195167303085, train_loss=0.035998594015836716, time_cost=1.8794808387756348
+
Steps: 2%|▏ | 17149/1000000 [10:33:53<2091:42:50, 7.66s/it, lr=1e-5, step_loss=0.0545]
Steps: 2%|▏ | 17150/1000000 [10:33:58<1828:50:59, 6.70s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [17150], local_loss=0.03264959156513214, train_loss=0.03687714785337448, time_cost=1.5945394039154053
+
Steps: 2%|▏ | 17150/1000000 [10:33:58<1828:50:59, 6.70s/it, lr=1e-5, step_loss=0.0326]
Steps: 2%|▏ | 17151/1000000 [10:34:07<2046:21:11, 7.50s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [17151], local_loss=0.03531739115715027, train_loss=0.03220415860414505, time_cost=2.1598262786865234
+
Steps: 2%|▏ | 17151/1000000 [10:34:07<2046:21:11, 7.50s/it, lr=1e-5, step_loss=0.0353]
Steps: 2%|▏ | 17152/1000000 [10:34:20<2465:50:22, 9.03s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [17152], local_loss=0.021234722808003426, train_loss=0.02876708284020424, time_cost=5.495088815689087
+
Steps: 2%|▏ | 17152/1000000 [10:34:20<2465:50:22, 9.03s/it, lr=1e-5, step_loss=0.0212]
Steps: 2%|▏ | 17153/1000000 [10:34:25<2206:50:20, 8.08s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [17153], local_loss=0.025993939489126205, train_loss=0.043110065162181854, time_cost=1.3480477333068848
+
Steps: 2%|▏ | 17153/1000000 [10:34:25<2206:50:20, 8.08s/it, lr=1e-5, step_loss=0.026]
Steps: 2%|▏ | 17154/1000000 [10:34:31<1996:17:37, 7.31s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [17154], local_loss=0.006469164043664932, train_loss=0.02858629822731018, time_cost=2.3053410053253174
+
Steps: 2%|▏ | 17154/1000000 [10:34:31<1996:17:37, 7.31s/it, lr=1e-5, step_loss=0.00647]
Steps: 2%|▏ | 17155/1000000 [10:34:40<2115:44:01, 7.75s/it, lr=1e-5, step_loss=0.00647][RANK-0]: Step: [17155], local_loss=0.010326666757464409, train_loss=0.035195332020521164, time_cost=2.9304139614105225
+
Steps: 2%|▏ | 17155/1000000 [10:34:40<2115:44:01, 7.75s/it, lr=1e-5, step_loss=0.0103]
Steps: 2%|▏ | 17156/1000000 [10:34:47<2102:29:11, 7.70s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [17156], local_loss=0.01678105816245079, train_loss=0.07221725583076477, time_cost=1.5422124862670898
+
Steps: 2%|▏ | 17156/1000000 [10:34:47<2102:29:11, 7.70s/it, lr=1e-5, step_loss=0.0168]
Steps: 2%|▏ | 17157/1000000 [10:34:55<2071:27:46, 7.59s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [17157], local_loss=0.004215781576931477, train_loss=0.018153898417949677, time_cost=1.9524903297424316
+
Steps: 2%|▏ | 17157/1000000 [10:34:55<2071:27:46, 7.59s/it, lr=1e-5, step_loss=0.00422]
Steps: 2%|▏ | 17158/1000000 [10:35:02<2066:27:52, 7.57s/it, lr=1e-5, step_loss=0.00422][RANK-0]: Step: [17158], local_loss=0.045482661575078964, train_loss=0.1538550853729248, time_cost=2.295433521270752
+
Steps: 2%|▏ | 17158/1000000 [10:35:02<2066:27:52, 7.57s/it, lr=1e-5, step_loss=0.0455]
Steps: 2%|▏ | 17159/1000000 [10:35:07<1833:45:41, 6.72s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [17159], local_loss=0.03381654992699623, train_loss=0.032146621495485306, time_cost=1.2330467700958252
+
Steps: 2%|▏ | 17159/1000000 [10:35:07<1833:45:41, 6.72s/it, lr=1e-5, step_loss=0.0338]
Steps: 2%|▏ | 17160/1000000 [10:35:15<1929:42:39, 7.07s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [17160], local_loss=0.044187143445014954, train_loss=0.029678404331207275, time_cost=2.990926504135132
+
Steps: 2%|▏ | 17160/1000000 [10:35:15<1929:42:39, 7.07s/it, lr=1e-5, step_loss=0.0442]
Steps: 2%|▏ | 17161/1000000 [10:35:25<2156:16:08, 7.90s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [17161], local_loss=0.01995261386036873, train_loss=0.07378056645393372, time_cost=1.7687296867370605
+
Steps: 2%|▏ | 17161/1000000 [10:35:25<2156:16:08, 7.90s/it, lr=1e-5, step_loss=0.02]
Steps: 2%|▏ | 17162/1000000 [10:35:30<1936:06:14, 7.09s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [17162], local_loss=0.012438500300049782, train_loss=0.06986457109451294, time_cost=1.7939167022705078
+
Steps: 2%|▏ | 17162/1000000 [10:35:30<1936:06:14, 7.09s/it, lr=1e-5, step_loss=0.0124]
Steps: 2%|▏ | 17163/1000000 [10:35:44<2543:49:15, 9.32s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [17163], local_loss=0.09580973535776138, train_loss=0.04698414355516434, time_cost=4.632310152053833
+
Steps: 2%|▏ | 17163/1000000 [10:35:44<2543:49:15, 9.32s/it, lr=1e-5, step_loss=0.0958]
Steps: 2%|▏ | 17164/1000000 [10:35:52<2381:34:00, 8.72s/it, lr=1e-5, step_loss=0.0958][RANK-0]: Step: [17164], local_loss=0.009722741320729256, train_loss=0.13331668078899384, time_cost=2.271346092224121
+
Steps: 2%|▏ | 17164/1000000 [10:35:52<2381:34:00, 8.72s/it, lr=1e-5, step_loss=0.00972]
Steps: 2%|▏ | 17165/1000000 [10:36:03<2569:16:48, 9.41s/it, lr=1e-5, step_loss=0.00972][RANK-0]: Step: [17165], local_loss=0.012271569110453129, train_loss=0.014836407266557217, time_cost=3.332444190979004
+
Steps: 2%|▏ | 17165/1000000 [10:36:03<2569:16:48, 9.41s/it, lr=1e-5, step_loss=0.0123]
Steps: 2%|▏ | 17166/1000000 [10:36:07<2188:12:25, 8.02s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [17166], local_loss=0.005667452234774828, train_loss=0.021166017279028893, time_cost=1.2597241401672363
+
Steps: 2%|▏ | 17166/1000000 [10:36:07<2188:12:25, 8.02s/it, lr=1e-5, step_loss=0.00567]
Steps: 2%|▏ | 17167/1000000 [10:36:21<2610:11:55, 9.56s/it, lr=1e-5, step_loss=0.00567][RANK-0]: Step: [17167], local_loss=0.04693833738565445, train_loss=0.04164819046854973, time_cost=3.277423620223999
+
Steps: 2%|▏ | 17167/1000000 [10:36:21<2610:11:55, 9.56s/it, lr=1e-5, step_loss=0.0469]
Steps: 2%|▏ | 17168/1000000 [10:36:26<2229:53:36, 8.17s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [17168], local_loss=0.013906270265579224, train_loss=0.06684263795614243, time_cost=1.264786958694458
+
Steps: 2%|▏ | 17168/1000000 [10:36:26<2229:53:36, 8.17s/it, lr=1e-5, step_loss=0.0139]
Steps: 2%|▏ | 17169/1000000 [10:36:37<2479:52:00, 9.08s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [17169], local_loss=0.009114840067923069, train_loss=0.086735300719738, time_cost=4.375071048736572
+
Steps: 2%|▏ | 17169/1000000 [10:36:37<2479:52:00, 9.08s/it, lr=1e-5, step_loss=0.00911]
Steps: 2%|▏ | 17170/1000000 [10:36:51<2902:58:04, 10.63s/it, lr=1e-5, step_loss=0.00911][RANK-0]: Step: [17170], local_loss=0.04048430919647217, train_loss=0.056893400847911835, time_cost=1.2203137874603271
+
Steps: 2%|▏ | 17170/1000000 [10:36:51<2902:58:04, 10.63s/it, lr=1e-5, step_loss=0.0405]
Steps: 2%|▏ | 17171/1000000 [10:36:58<2609:24:22, 9.56s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [17171], local_loss=0.2174765169620514, train_loss=0.04437441751360893, time_cost=4.264864206314087
+
Steps: 2%|▏ | 17171/1000000 [10:36:58<2609:24:22, 9.56s/it, lr=1e-5, step_loss=0.217]
Steps: 2%|▏ | 17172/1000000 [10:37:02<2169:36:17, 7.95s/it, lr=1e-5, step_loss=0.217][RANK-0]: Step: [17172], local_loss=0.0111435167491436, train_loss=0.17268989980220795, time_cost=2.85172438621521
+
Steps: 2%|▏ | 17172/1000000 [10:37:02<2169:36:17, 7.95s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 17173/1000000 [10:37:08<1950:24:44, 7.14s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [17173], local_loss=0.07049018144607544, train_loss=0.043620914220809937, time_cost=2.297714948654175
+
Steps: 2%|▏ | 17173/1000000 [10:37:08<1950:24:44, 7.14s/it, lr=1e-5, step_loss=0.0705]
Steps: 2%|▏ | 17174/1000000 [10:37:16<2097:00:17, 7.68s/it, lr=1e-5, step_loss=0.0705][RANK-0]: Step: [17174], local_loss=0.008387453854084015, train_loss=0.028782369568943977, time_cost=2.705216407775879
+
Steps: 2%|▏ | 17174/1000000 [10:37:16<2097:00:17, 7.68s/it, lr=1e-5, step_loss=0.00839]
Steps: 2%|▏ | 17175/1000000 [10:37:27<2356:23:25, 8.63s/it, lr=1e-5, step_loss=0.00839][RANK-0]: Step: [17175], local_loss=0.03666770085692406, train_loss=0.0397975854575634, time_cost=2.052076578140259
+
Steps: 2%|▏ | 17175/1000000 [10:37:27<2356:23:25, 8.63s/it, lr=1e-5, step_loss=0.0367]
Steps: 2%|▏ | 17176/1000000 [10:37:38<2516:33:51, 9.22s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [17176], local_loss=0.05951488018035889, train_loss=0.027213457971811295, time_cost=2.0954995155334473
+
Steps: 2%|▏ | 17176/1000000 [10:37:38<2516:33:51, 9.22s/it, lr=1e-5, step_loss=0.0595]
Steps: 2%|▏ | 17177/1000000 [10:37:48<2609:11:56, 9.56s/it, lr=1e-5, step_loss=0.0595][RANK-0]: Step: [17177], local_loss=0.15201084315776825, train_loss=0.03334590792655945, time_cost=1.6840770244598389
+
Steps: 2%|▏ | 17177/1000000 [10:37:48<2609:11:56, 9.56s/it, lr=1e-5, step_loss=0.152]
Steps: 2%|▏ | 17178/1000000 [10:38:01<2889:29:08, 10.58s/it, lr=1e-5, step_loss=0.152][RANK-0]: Step: [17178], local_loss=0.005231970921158791, train_loss=0.017936773598194122, time_cost=6.029926538467407
+
Steps: 2%|▏ | 17178/1000000 [10:38:01<2889:29:08, 10.58s/it, lr=1e-5, step_loss=0.00523]
Steps: 2%|▏ | 17179/1000000 [10:38:12<2888:14:01, 10.58s/it, lr=1e-5, step_loss=0.00523][RANK-0]: Step: [17179], local_loss=0.00969714391976595, train_loss=0.032207563519477844, time_cost=2.5171432495117188
+
Steps: 2%|▏ | 17179/1000000 [10:38:12<2888:14:01, 10.58s/it, lr=1e-5, step_loss=0.0097]
Steps: 2%|▏ | 17180/1000000 [10:38:24<3053:55:58, 11.19s/it, lr=1e-5, step_loss=0.0097][RANK-0]: Step: [17180], local_loss=0.005392559804022312, train_loss=0.013281162828207016, time_cost=9.026897668838501
+
Steps: 2%|▏ | 17180/1000000 [10:38:24<3053:55:58, 11.19s/it, lr=1e-5, step_loss=0.00539]
Steps: 2%|▏ | 17181/1000000 [10:38:35<2971:57:49, 10.89s/it, lr=1e-5, step_loss=0.00539][RANK-0]: Step: [17181], local_loss=0.01163970772176981, train_loss=0.01947803795337677, time_cost=1.8031127452850342
+
Steps: 2%|▏ | 17181/1000000 [10:38:35<2971:57:49, 10.89s/it, lr=1e-5, step_loss=0.0116]
Steps: 2%|▏ | 17182/1000000 [10:38:48<3206:24:06, 11.74s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [17182], local_loss=0.029792573302984238, train_loss=0.148351788520813, time_cost=4.750248193740845
+
Steps: 2%|▏ | 17182/1000000 [10:38:48<3206:24:06, 11.74s/it, lr=1e-5, step_loss=0.0298]
Steps: 2%|▏ | 17183/1000000 [10:38:57<2937:49:59, 10.76s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [17183], local_loss=0.007362802047282457, train_loss=0.018342966213822365, time_cost=4.8404905796051025
+
Steps: 2%|▏ | 17183/1000000 [10:38:57<2937:49:59, 10.76s/it, lr=1e-5, step_loss=0.00736]
Steps: 2%|▏ | 17184/1000000 [10:39:04<2624:19:53, 9.61s/it, lr=1e-5, step_loss=0.00736][RANK-0]: Step: [17184], local_loss=0.014157315716147423, train_loss=0.0372760072350502, time_cost=1.5361273288726807
+
Steps: 2%|▏ | 17184/1000000 [10:39:04<2624:19:53, 9.61s/it, lr=1e-5, step_loss=0.0142]
Steps: 2%|▏ | 17185/1000000 [10:39:17<2903:20:18, 10.63s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [17185], local_loss=0.054207123816013336, train_loss=0.020422134548425674, time_cost=3.7649736404418945
+
Steps: 2%|▏ | 17185/1000000 [10:39:17<2903:20:18, 10.63s/it, lr=1e-5, step_loss=0.0542]
Steps: 2%|▏ | 17186/1000000 [10:39:26<2774:29:32, 10.16s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [17186], local_loss=0.01367775909602642, train_loss=0.012489606626331806, time_cost=3.8016653060913086
+
Steps: 2%|▏ | 17186/1000000 [10:39:26<2774:29:32, 10.16s/it, lr=1e-5, step_loss=0.0137]
Steps: 2%|▏ | 17187/1000000 [10:39:36<2810:23:41, 10.29s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [17187], local_loss=0.0036925440654158592, train_loss=0.015272160060703754, time_cost=7.4581756591796875
+
Steps: 2%|▏ | 17187/1000000 [10:39:36<2810:23:41, 10.29s/it, lr=1e-5, step_loss=0.00369]
Steps: 2%|▏ | 17188/1000000 [10:39:43<2526:19:18, 9.25s/it, lr=1e-5, step_loss=0.00369][RANK-0]: Step: [17188], local_loss=0.0841456800699234, train_loss=0.04154965654015541, time_cost=1.2604436874389648
+
Steps: 2%|▏ | 17188/1000000 [10:39:43<2526:19:18, 9.25s/it, lr=1e-5, step_loss=0.0841]
Steps: 2%|▏ | 17189/1000000 [10:39:54<2649:13:22, 9.70s/it, lr=1e-5, step_loss=0.0841][RANK-0]: Step: [17189], local_loss=0.006834344007074833, train_loss=0.14528526365756989, time_cost=1.6482620239257812
+
Steps: 2%|▏ | 17189/1000000 [10:39:54<2649:13:22, 9.70s/it, lr=1e-5, step_loss=0.00683]
Steps: 2%|▏ | 17190/1000000 [10:40:07<2885:28:50, 10.57s/it, lr=1e-5, step_loss=0.00683][RANK-0]: Step: [17190], local_loss=0.009467190131545067, train_loss=0.024949925020337105, time_cost=3.405730962753296
+
Steps: 2%|▏ | 17190/1000000 [10:40:07<2885:28:50, 10.57s/it, lr=1e-5, step_loss=0.00947]
Steps: 2%|▏ | 17191/1000000 [10:40:21<3238:46:10, 11.86s/it, lr=1e-5, step_loss=0.00947][RANK-0]: Step: [17191], local_loss=0.0337352529168129, train_loss=0.017070025205612183, time_cost=6.458819627761841
+
Steps: 2%|▏ | 17191/1000000 [10:40:21<3238:46:10, 11.86s/it, lr=1e-5, step_loss=0.0337]
Steps: 2%|▏ | 17192/1000000 [10:40:28<2794:39:44, 10.24s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [17192], local_loss=0.03738969564437866, train_loss=0.08958041667938232, time_cost=1.3469326496124268
+
Steps: 2%|▏ | 17192/1000000 [10:40:28<2794:39:44, 10.24s/it, lr=1e-5, step_loss=0.0374]
Steps: 2%|▏ | 17193/1000000 [10:40:34<2438:51:11, 8.93s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [17193], local_loss=0.030188167467713356, train_loss=0.02437460795044899, time_cost=1.7685399055480957
+
Steps: 2%|▏ | 17193/1000000 [10:40:34<2438:51:11, 8.93s/it, lr=1e-5, step_loss=0.0302]
Steps: 2%|▏ | 17194/1000000 [10:40:41<2331:42:41, 8.54s/it, lr=1e-5, step_loss=0.0302][RANK-0]: Step: [17194], local_loss=0.021422071382403374, train_loss=0.03460949659347534, time_cost=3.8639261722564697
+
Steps: 2%|▏ | 17194/1000000 [10:40:41<2331:42:41, 8.54s/it, lr=1e-5, step_loss=0.0214]
Steps: 2%|▏ | 17195/1000000 [10:40:51<2387:08:57, 8.74s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [17195], local_loss=0.07569482177495956, train_loss=0.09611541032791138, time_cost=2.5704550743103027
+
Steps: 2%|▏ | 17195/1000000 [10:40:51<2387:08:57, 8.74s/it, lr=1e-5, step_loss=0.0757]
Steps: 2%|▏ | 17196/1000000 [10:41:04<2723:02:03, 9.97s/it, lr=1e-5, step_loss=0.0757][RANK-0]: Step: [17196], local_loss=0.047608423978090286, train_loss=0.05698137730360031, time_cost=10.796110391616821
+
Steps: 2%|▏ | 17196/1000000 [10:41:04<2723:02:03, 9.97s/it, lr=1e-5, step_loss=0.0476]
Steps: 2%|▏ | 17197/1000000 [10:41:14<2774:52:29, 10.16s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [17197], local_loss=0.05598749592900276, train_loss=0.0315110981464386, time_cost=3.5741512775421143
+
Steps: 2%|▏ | 17197/1000000 [10:41:14<2774:52:29, 10.16s/it, lr=1e-5, step_loss=0.056]
Steps: 2%|▏ | 17198/1000000 [10:41:31<3346:19:55, 12.26s/it, lr=1e-5, step_loss=0.056][RANK-0]: Step: [17198], local_loss=0.07149379700422287, train_loss=0.03474005311727524, time_cost=8.961509227752686
+
Steps: 2%|▏ | 17198/1000000 [10:41:31<3346:19:55, 12.26s/it, lr=1e-5, step_loss=0.0715]
Steps: 2%|▏ | 17199/1000000 [10:41:36<2702:04:08, 9.90s/it, lr=1e-5, step_loss=0.0715][RANK-0]: Step: [17199], local_loss=0.03125695511698723, train_loss=0.021849194541573524, time_cost=1.3388237953186035
+
Steps: 2%|▏ | 17199/1000000 [10:41:36<2702:04:08, 9.90s/it, lr=1e-5, step_loss=0.0313]
Steps: 2%|▏ | 17200/1000000 [10:41:43<2495:04:47, 9.14s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [17200], local_loss=0.018809083849191666, train_loss=0.024094657972455025, time_cost=1.494460105895996
+
Steps: 2%|▏ | 17200/1000000 [10:41:43<2495:04:47, 9.14s/it, lr=1e-5, step_loss=0.0188]
Steps: 2%|▏ | 17201/1000000 [10:41:54<2639:23:05, 9.67s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [17201], local_loss=0.004660391714423895, train_loss=0.014869846403598785, time_cost=1.3113114833831787
+
Steps: 2%|▏ | 17201/1000000 [10:41:54<2639:23:05, 9.67s/it, lr=1e-5, step_loss=0.00466]
Steps: 2%|▏ | 17202/1000000 [10:42:03<2582:08:03, 9.46s/it, lr=1e-5, step_loss=0.00466][RANK-0]: Step: [17202], local_loss=0.014660771004855633, train_loss=0.05650632455945015, time_cost=3.0844836235046387
+
Steps: 2%|▏ | 17202/1000000 [10:42:03<2582:08:03, 9.46s/it, lr=1e-5, step_loss=0.0147]
Steps: 2%|▏ | 17203/1000000 [10:42:16<2887:24:40, 10.58s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [17203], local_loss=0.01290002278983593, train_loss=0.03550099581480026, time_cost=1.9366438388824463
+
Steps: 2%|▏ | 17203/1000000 [10:42:16<2887:24:40, 10.58s/it, lr=1e-5, step_loss=0.0129]
Steps: 2%|▏ | 17204/1000000 [10:42:21<2440:31:41, 8.94s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [17204], local_loss=0.02579433284699917, train_loss=0.17038720846176147, time_cost=2.446298837661743
+
Steps: 2%|▏ | 17204/1000000 [10:42:21<2440:31:41, 8.94s/it, lr=1e-5, step_loss=0.0258]
Steps: 2%|▏ | 17205/1000000 [10:42:30<2425:48:36, 8.89s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [17205], local_loss=0.006436466239392757, train_loss=0.12666015326976776, time_cost=1.2379429340362549
+
Steps: 2%|▏ | 17205/1000000 [10:42:30<2425:48:36, 8.89s/it, lr=1e-5, step_loss=0.00644]
Steps: 2%|▏ | 17206/1000000 [10:42:37<2269:45:52, 8.31s/it, lr=1e-5, step_loss=0.00644][RANK-0]: Step: [17206], local_loss=0.08348391950130463, train_loss=0.05453087389469147, time_cost=2.271657705307007
+
Steps: 2%|▏ | 17206/1000000 [10:42:37<2269:45:52, 8.31s/it, lr=1e-5, step_loss=0.0835]
Steps: 2%|▏ | 17207/1000000 [10:42:44<2139:44:06, 7.84s/it, lr=1e-5, step_loss=0.0835][RANK-0]: Step: [17207], local_loss=0.004723673220723867, train_loss=0.129514679312706, time_cost=2.4125308990478516
+
Steps: 2%|▏ | 17207/1000000 [10:42:44<2139:44:06, 7.84s/it, lr=1e-5, step_loss=0.00472]
Steps: 2%|▏ | 17208/1000000 [10:42:56<2467:16:39, 9.04s/it, lr=1e-5, step_loss=0.00472][RANK-0]: Step: [17208], local_loss=0.013101750984787941, train_loss=0.04044877737760544, time_cost=4.396544933319092
+
Steps: 2%|▏ | 17208/1000000 [10:42:56<2467:16:39, 9.04s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 17209/1000000 [10:43:03<2315:34:47, 8.48s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [17209], local_loss=0.01772073283791542, train_loss=0.04193387180566788, time_cost=1.3682959079742432
+
Steps: 2%|▏ | 17209/1000000 [10:43:03<2315:34:47, 8.48s/it, lr=1e-5, step_loss=0.0177]
Steps: 2%|▏ | 17210/1000000 [10:43:08<2054:04:11, 7.52s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [17210], local_loss=0.03790836036205292, train_loss=0.0676174908876419, time_cost=4.122992277145386
+
Steps: 2%|▏ | 17210/1000000 [10:43:08<2054:04:11, 7.52s/it, lr=1e-5, step_loss=0.0379]
Steps: 2%|▏ | 17211/1000000 [10:43:15<1992:11:24, 7.30s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [17211], local_loss=0.18272314965724945, train_loss=0.05009394884109497, time_cost=1.2444441318511963
+
Steps: 2%|▏ | 17211/1000000 [10:43:15<1992:11:24, 7.30s/it, lr=1e-5, step_loss=0.183]
Steps: 2%|▏ | 17212/1000000 [10:43:25<2273:37:10, 8.33s/it, lr=1e-5, step_loss=0.183][RANK-0]: Step: [17212], local_loss=0.01952371560037136, train_loss=0.03496863320469856, time_cost=3.152812957763672
+
Steps: 2%|▏ | 17212/1000000 [10:43:25<2273:37:10, 8.33s/it, lr=1e-5, step_loss=0.0195]
Steps: 2%|▏ | 17213/1000000 [10:43:36<2434:07:01, 8.92s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [17213], local_loss=0.04278959706425667, train_loss=0.15117602050304413, time_cost=3.8602795600891113
+
Steps: 2%|▏ | 17213/1000000 [10:43:36<2434:07:01, 8.92s/it, lr=1e-5, step_loss=0.0428]
Steps: 2%|▏ | 17214/1000000 [10:43:43<2323:58:46, 8.51s/it, lr=1e-5, step_loss=0.0428][RANK-0]: Step: [17214], local_loss=0.00882762111723423, train_loss=0.019108686596155167, time_cost=1.8000202178955078
+
Steps: 2%|▏ | 17214/1000000 [10:43:43<2323:58:46, 8.51s/it, lr=1e-5, step_loss=0.00883]
Steps: 2%|▏ | 17215/1000000 [10:43:51<2269:56:27, 8.31s/it, lr=1e-5, step_loss=0.00883][RANK-0]: Step: [17215], local_loss=0.05308259278535843, train_loss=0.03863246738910675, time_cost=2.0281429290771484
+
Steps: 2%|▏ | 17215/1000000 [10:43:51<2269:56:27, 8.31s/it, lr=1e-5, step_loss=0.0531]
Steps: 2%|▏ | 17216/1000000 [10:44:02<2465:20:01, 9.03s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [17216], local_loss=0.037714675068855286, train_loss=0.018437212333083153, time_cost=2.7985823154449463
+
Steps: 2%|▏ | 17216/1000000 [10:44:02<2465:20:01, 9.03s/it, lr=1e-5, step_loss=0.0377]
Steps: 2%|▏ | 17217/1000000 [10:44:14<2712:09:51, 9.93s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [17217], local_loss=0.007807186339050531, train_loss=0.04614616930484772, time_cost=4.390766859054565
+
Steps: 2%|▏ | 17217/1000000 [10:44:14<2712:09:51, 9.93s/it, lr=1e-5, step_loss=0.00781]
Steps: 2%|▏ | 17218/1000000 [10:44:28<3039:46:05, 11.13s/it, lr=1e-5, step_loss=0.00781][RANK-0]: Step: [17218], local_loss=0.08547161519527435, train_loss=0.07360392063856125, time_cost=9.943005084991455
+
Steps: 2%|▏ | 17218/1000000 [10:44:28<3039:46:05, 11.13s/it, lr=1e-5, step_loss=0.0855]
Steps: 2%|▏ | 17219/1000000 [10:44:41<3175:08:02, 11.63s/it, lr=1e-5, step_loss=0.0855][RANK-0]: Step: [17219], local_loss=0.07305267453193665, train_loss=0.02993040531873703, time_cost=6.314693450927734
+
Steps: 2%|▏ | 17219/1000000 [10:44:41<3175:08:02, 11.63s/it, lr=1e-5, step_loss=0.0731]
Steps: 2%|▏ | 17220/1000000 [10:44:49<2929:40:44, 10.73s/it, lr=1e-5, step_loss=0.0731][RANK-0]: Step: [17220], local_loss=0.035747602581977844, train_loss=0.05436035245656967, time_cost=1.616375207901001
+
Steps: 2%|▏ | 17220/1000000 [10:44:49<2929:40:44, 10.73s/it, lr=1e-5, step_loss=0.0357]
Steps: 2%|▏ | 17221/1000000 [10:45:01<2984:35:00, 10.93s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [17221], local_loss=0.0074629997834563255, train_loss=0.04448316991329193, time_cost=2.6933350563049316
+
Steps: 2%|▏ | 17221/1000000 [10:45:01<2984:35:00, 10.93s/it, lr=1e-5, step_loss=0.00746]
Steps: 2%|▏ | 17222/1000000 [10:45:15<3284:12:53, 12.03s/it, lr=1e-5, step_loss=0.00746][RANK-0]: Step: [17222], local_loss=0.009371291846036911, train_loss=0.14332342147827148, time_cost=5.20168924331665
+
Steps: 2%|▏ | 17222/1000000 [10:45:15<3284:12:53, 12.03s/it, lr=1e-5, step_loss=0.00937]
Steps: 2%|▏ | 17223/1000000 [10:45:23<2964:21:08, 10.86s/it, lr=1e-5, step_loss=0.00937][RANK-0]: Step: [17223], local_loss=0.005994494538754225, train_loss=0.16727611422538757, time_cost=4.569934606552124
+
Steps: 2%|▏ | 17223/1000000 [10:45:23<2964:21:08, 10.86s/it, lr=1e-5, step_loss=0.00599]
Steps: 2%|▏ | 17224/1000000 [10:45:30<2610:46:32, 9.56s/it, lr=1e-5, step_loss=0.00599][RANK-0]: Step: [17224], local_loss=0.006823324598371983, train_loss=0.0587661974132061, time_cost=2.718822956085205
+
Steps: 2%|▏ | 17224/1000000 [10:45:30<2610:46:32, 9.56s/it, lr=1e-5, step_loss=0.00682]
Steps: 2%|▏ | 17225/1000000 [10:45:40<2631:03:40, 9.64s/it, lr=1e-5, step_loss=0.00682][RANK-0]: Step: [17225], local_loss=0.15718020498752594, train_loss=0.06575068831443787, time_cost=1.4935071468353271
+
Steps: 2%|▏ | 17225/1000000 [10:45:40<2631:03:40, 9.64s/it, lr=1e-5, step_loss=0.157]
Steps: 2%|▏ | 17226/1000000 [10:45:58<3303:34:49, 12.10s/it, lr=1e-5, step_loss=0.157][RANK-0]: Step: [17226], local_loss=0.03449335694313049, train_loss=0.04089930281043053, time_cost=1.1864745616912842
+
Steps: 2%|▏ | 17226/1000000 [10:45:58<3303:34:49, 12.10s/it, lr=1e-5, step_loss=0.0345]
Steps: 2%|▏ | 17227/1000000 [10:46:02<2711:10:30, 9.93s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [17227], local_loss=0.02605224959552288, train_loss=0.019092679023742676, time_cost=1.1925148963928223
+
Steps: 2%|▏ | 17227/1000000 [10:46:02<2711:10:30, 9.93s/it, lr=1e-5, step_loss=0.0261]
Steps: 2%|▏ | 17228/1000000 [10:46:08<2318:33:33, 8.49s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [17228], local_loss=0.46380066871643066, train_loss=0.07954530417919159, time_cost=2.4741430282592773
+
Steps: 2%|▏ | 17228/1000000 [10:46:08<2318:33:33, 8.49s/it, lr=1e-5, step_loss=0.464]
Steps: 2%|▏ | 17229/1000000 [10:46:19<2565:11:39, 9.40s/it, lr=1e-5, step_loss=0.464][RANK-0]: Step: [17229], local_loss=0.02556890994310379, train_loss=0.038736578077077866, time_cost=1.7865338325500488
+
Steps: 2%|▏ | 17229/1000000 [10:46:19<2565:11:39, 9.40s/it, lr=1e-5, step_loss=0.0256]
Steps: 2%|▏ | 17230/1000000 [10:46:24<2180:28:23, 7.99s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [17230], local_loss=0.020075278356671333, train_loss=0.009708295576274395, time_cost=1.3765389919281006
+
Steps: 2%|▏ | 17230/1000000 [10:46:24<2180:28:23, 7.99s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 17231/1000000 [10:46:31<2099:59:18, 7.69s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [17231], local_loss=0.010650907643139362, train_loss=0.0464550219476223, time_cost=1.8289711475372314
+
Steps: 2%|▏ | 17231/1000000 [10:46:31<2099:59:18, 7.69s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 17232/1000000 [10:46:36<1927:22:18, 7.06s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [17232], local_loss=0.01962130330502987, train_loss=0.03756435215473175, time_cost=1.9088637828826904
+
Steps: 2%|▏ | 17232/1000000 [10:46:36<1927:22:18, 7.06s/it, lr=1e-5, step_loss=0.0196]
Steps: 2%|▏ | 17233/1000000 [10:46:45<2093:17:07, 7.67s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [17233], local_loss=0.07600877434015274, train_loss=0.04997146129608154, time_cost=1.7875642776489258
+
Steps: 2%|▏ | 17233/1000000 [10:46:45<2093:17:07, 7.67s/it, lr=1e-5, step_loss=0.076]
Steps: 2%|▏ | 17234/1000000 [10:46:52<1969:02:34, 7.21s/it, lr=1e-5, step_loss=0.076][RANK-0]: Step: [17234], local_loss=0.020855290815234184, train_loss=0.025988321751356125, time_cost=5.061002969741821
+
Steps: 2%|▏ | 17234/1000000 [10:46:52<1969:02:34, 7.21s/it, lr=1e-5, step_loss=0.0209]
Steps: 2%|▏ | 17235/1000000 [10:46:59<2009:56:05, 7.36s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [17235], local_loss=0.04897994548082352, train_loss=0.03083367459475994, time_cost=3.615191698074341
+
Steps: 2%|▏ | 17235/1000000 [10:46:59<2009:56:05, 7.36s/it, lr=1e-5, step_loss=0.049]
Steps: 2%|▏ | 17236/1000000 [10:47:05<1887:37:13, 6.91s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [17236], local_loss=0.007251456845551729, train_loss=0.057096004486083984, time_cost=2.4236931800842285
+
Steps: 2%|▏ | 17236/1000000 [10:47:05<1887:37:13, 6.91s/it, lr=1e-5, step_loss=0.00725]
Steps: 2%|▏ | 17237/1000000 [10:47:11<1758:17:10, 6.44s/it, lr=1e-5, step_loss=0.00725][RANK-0]: Step: [17237], local_loss=0.10191881656646729, train_loss=0.06743036955595016, time_cost=2.2998740673065186
+
Steps: 2%|▏ | 17237/1000000 [10:47:11<1758:17:10, 6.44s/it, lr=1e-5, step_loss=0.102]
Steps: 2%|▏ | 17238/1000000 [10:47:19<1944:53:49, 7.12s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [17238], local_loss=0.006880210712552071, train_loss=0.03165416792035103, time_cost=2.826871871948242
+
Steps: 2%|▏ | 17238/1000000 [10:47:19<1944:53:49, 7.12s/it, lr=1e-5, step_loss=0.00688]
Steps: 2%|▏ | 17239/1000000 [10:47:28<2068:42:57, 7.58s/it, lr=1e-5, step_loss=0.00688][RANK-0]: Step: [17239], local_loss=0.11875706166028976, train_loss=0.025719091296195984, time_cost=6.414544343948364
+
Steps: 2%|▏ | 17239/1000000 [10:47:28<2068:42:57, 7.58s/it, lr=1e-5, step_loss=0.119]
Steps: 2%|▏ | 17240/1000000 [10:47:41<2521:26:06, 9.24s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [17240], local_loss=0.05528301000595093, train_loss=0.042456064373254776, time_cost=7.46587610244751
+
Steps: 2%|▏ | 17240/1000000 [10:47:41<2521:26:06, 9.24s/it, lr=1e-5, step_loss=0.0553]
Steps: 2%|▏ | 17241/1000000 [10:47:53<2716:27:09, 9.95s/it, lr=1e-5, step_loss=0.0553][RANK-0]: Step: [17241], local_loss=0.03431050851941109, train_loss=0.015810750424861908, time_cost=2.1844077110290527
+
Steps: 2%|▏ | 17241/1000000 [10:47:53<2716:27:09, 9.95s/it, lr=1e-5, step_loss=0.0343]
Steps: 2%|▏ | 17242/1000000 [10:48:04<2847:36:19, 10.43s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [17242], local_loss=0.014644458889961243, train_loss=0.028601400554180145, time_cost=2.3966856002807617
+
Steps: 2%|▏ | 17242/1000000 [10:48:04<2847:36:19, 10.43s/it, lr=1e-5, step_loss=0.0146]
Steps: 2%|▏ | 17243/1000000 [10:48:15<2891:02:32, 10.59s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [17243], local_loss=0.0092959338799119, train_loss=0.04118834435939789, time_cost=8.169363498687744
+
Steps: 2%|▏ | 17243/1000000 [10:48:15<2891:02:32, 10.59s/it, lr=1e-5, step_loss=0.0093]
Steps: 2%|▏ | 17244/1000000 [10:48:26<2916:35:13, 10.68s/it, lr=1e-5, step_loss=0.0093][RANK-0]: Step: [17244], local_loss=0.012646697461605072, train_loss=0.04403994232416153, time_cost=2.6937415599823
+
Steps: 2%|▏ | 17244/1000000 [10:48:26<2916:35:13, 10.68s/it, lr=1e-5, step_loss=0.0126]
Steps: 2%|▏ | 17245/1000000 [10:48:31<2482:16:06, 9.09s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [17245], local_loss=0.013942037709057331, train_loss=0.014631612226366997, time_cost=2.1321325302124023
+
Steps: 2%|▏ | 17245/1000000 [10:48:31<2482:16:06, 9.09s/it, lr=1e-5, step_loss=0.0139]
Steps: 2%|▏ | 17246/1000000 [10:48:40<2437:32:07, 8.93s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [17246], local_loss=0.012542585842311382, train_loss=0.024389872327446938, time_cost=1.9521660804748535
+
Steps: 2%|▏ | 17246/1000000 [10:48:40<2437:32:07, 8.93s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 17247/1000000 [10:48:48<2324:29:07, 8.52s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [17247], local_loss=0.02182924561202526, train_loss=0.033825479447841644, time_cost=1.568244218826294
+
Steps: 2%|▏ | 17247/1000000 [10:48:48<2324:29:07, 8.52s/it, lr=1e-5, step_loss=0.0218]
/home/image_data/hxy/Open-Sora-Plan/opensora/utils/utils.py:369: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
  caption = BeautifulSoup(caption, features='html.parser').text
+
Steps: 2%|▏ | 17248/1000000 [10:49:01<2718:54:51, 9.96s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [17248], local_loss=0.019034380093216896, train_loss=0.03327096626162529, time_cost=5.745054721832275
+
Steps: 2%|▏ | 17248/1000000 [10:49:01<2718:54:51, 9.96s/it, lr=1e-5, step_loss=0.019]
Steps: 2%|▏ | 17249/1000000 [10:49:10<2681:21:22, 9.82s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [17249], local_loss=0.012372353114187717, train_loss=0.020777426660060883, time_cost=3.3379721641540527
+
Steps: 2%|▏ | 17249/1000000 [10:49:10<2681:21:22, 9.82s/it, lr=1e-5, step_loss=0.0124]
Steps: 2%|▏ | 17250/1000000 [10:49:15<2278:38:06, 8.35s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [17250], local_loss=0.029677197337150574, train_loss=0.022345880046486855, time_cost=1.9449462890625
+
Steps: 2%|▏ | 17250/1000000 [10:49:15<2278:38:06, 8.35s/it, lr=1e-5, step_loss=0.0297]
Steps: 2%|▏ | 17251/1000000 [10:49:22<2171:02:33, 7.95s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [17251], local_loss=0.0303975697606802, train_loss=0.03205427527427673, time_cost=2.4987995624542236
+
Steps: 2%|▏ | 17251/1000000 [10:49:22<2171:02:33, 7.95s/it, lr=1e-5, step_loss=0.0304]
Steps: 2%|▏ | 17252/1000000 [10:49:29<2067:19:21, 7.57s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [17252], local_loss=0.07049378007650375, train_loss=0.03699316456913948, time_cost=2.2414703369140625
+
Steps: 2%|▏ | 17252/1000000 [10:49:29<2067:19:21, 7.57s/it, lr=1e-5, step_loss=0.0705]
Steps: 2%|▏ | 17253/1000000 [10:49:42<2542:11:59, 9.31s/it, lr=1e-5, step_loss=0.0705][RANK-0]: Step: [17253], local_loss=0.012351304292678833, train_loss=0.07403793931007385, time_cost=5.837366580963135
+
Steps: 2%|▏ | 17253/1000000 [10:49:42<2542:11:59, 9.31s/it, lr=1e-5, step_loss=0.0124]
Steps: 2%|▏ | 17254/1000000 [10:49:48<2210:21:52, 8.10s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [17254], local_loss=0.04726635292172432, train_loss=0.029535768553614616, time_cost=2.725313186645508
+
Steps: 2%|▏ | 17254/1000000 [10:49:48<2210:21:52, 8.10s/it, lr=1e-5, step_loss=0.0473]
Steps: 2%|▏ | 17255/1000000 [10:49:59<2463:51:42, 9.03s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [17255], local_loss=0.051482684910297394, train_loss=0.024073190987110138, time_cost=4.82099723815918
+
Steps: 2%|▏ | 17255/1000000 [10:49:59<2463:51:42, 9.03s/it, lr=1e-5, step_loss=0.0515]
Steps: 2%|▏ | 17256/1000000 [10:50:07<2376:26:16, 8.71s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [17256], local_loss=0.22776681184768677, train_loss=0.05506696179509163, time_cost=2.536060333251953
+
Steps: 2%|▏ | 17256/1000000 [10:50:07<2376:26:16, 8.71s/it, lr=1e-5, step_loss=0.228]
Steps: 2%|▏ | 17257/1000000 [10:50:17<2490:48:03, 9.12s/it, lr=1e-5, step_loss=0.228][RANK-0]: Step: [17257], local_loss=0.006890744436532259, train_loss=0.15824748575687408, time_cost=8.429982423782349
+
Steps: 2%|▏ | 17257/1000000 [10:50:17<2490:48:03, 9.12s/it, lr=1e-5, step_loss=0.00689]
Steps: 2%|▏ | 17258/1000000 [10:50:27<2593:44:19, 9.50s/it, lr=1e-5, step_loss=0.00689][RANK-0]: Step: [17258], local_loss=0.07660700380802155, train_loss=0.04036322236061096, time_cost=1.1994845867156982
+
Steps: 2%|▏ | 17258/1000000 [10:50:27<2593:44:19, 9.50s/it, lr=1e-5, step_loss=0.0766]
Steps: 2%|▏ | 17259/1000000 [10:50:32<2168:00:52, 7.94s/it, lr=1e-5, step_loss=0.0766][RANK-0]: Step: [17259], local_loss=0.22678576409816742, train_loss=0.17049072682857513, time_cost=1.6530985832214355
+
Steps: 2%|▏ | 17259/1000000 [10:50:32<2168:00:52, 7.94s/it, lr=1e-5, step_loss=0.227]
Steps: 2%|▏ | 17260/1000000 [10:50:38<2082:48:56, 7.63s/it, lr=1e-5, step_loss=0.227][RANK-0]: Step: [17260], local_loss=0.008745983242988586, train_loss=0.02068878710269928, time_cost=2.9854848384857178
+
Steps: 2%|▏ | 17260/1000000 [10:50:38<2082:48:56, 7.63s/it, lr=1e-5, step_loss=0.00875]
Steps: 2%|▏ | 17261/1000000 [10:50:45<1958:31:41, 7.17s/it, lr=1e-5, step_loss=0.00875][RANK-0]: Step: [17261], local_loss=0.005050759296864271, train_loss=0.046952467411756516, time_cost=2.372483015060425
+
Steps: 2%|▏ | 17261/1000000 [10:50:45<1958:31:41, 7.17s/it, lr=1e-5, step_loss=0.00505]
Steps: 2%|▏ | 17262/1000000 [10:50:50<1792:40:50, 6.57s/it, lr=1e-5, step_loss=0.00505][RANK-0]: Step: [17262], local_loss=0.0298022348433733, train_loss=0.044067129492759705, time_cost=2.0883195400238037
+
Steps: 2%|▏ | 17262/1000000 [10:50:50<1792:40:50, 6.57s/it, lr=1e-5, step_loss=0.0298]
Steps: 2%|▏ | 17263/1000000 [10:50:57<1818:56:00, 6.66s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [17263], local_loss=0.08243875950574875, train_loss=0.03530234843492508, time_cost=2.978034496307373
+
Steps: 2%|▏ | 17263/1000000 [10:50:57<1818:56:00, 6.66s/it, lr=1e-5, step_loss=0.0824]
Steps: 2%|▏ | 17264/1000000 [10:51:07<2104:31:34, 7.71s/it, lr=1e-5, step_loss=0.0824][RANK-0]: Step: [17264], local_loss=0.008226354606449604, train_loss=0.06035088002681732, time_cost=7.566737651824951
+
Steps: 2%|▏ | 17264/1000000 [10:51:07<2104:31:34, 7.71s/it, lr=1e-5, step_loss=0.00823]
Steps: 2%|▏ | 17265/1000000 [10:51:14<2100:30:54, 7.69s/it, lr=1e-5, step_loss=0.00823][RANK-0]: Step: [17265], local_loss=0.1396181732416153, train_loss=0.1021081805229187, time_cost=1.8088769912719727
+
Steps: 2%|▏ | 17265/1000000 [10:51:14<2100:30:54, 7.69s/it, lr=1e-5, step_loss=0.14]
Steps: 2%|▏ | 17266/1000000 [10:51:20<1936:32:52, 7.09s/it, lr=1e-5, step_loss=0.14][RANK-0]: Step: [17266], local_loss=0.007904568687081337, train_loss=0.1583622246980667, time_cost=3.4156415462493896
+
Steps: 2%|▏ | 17266/1000000 [10:51:20<1936:32:52, 7.09s/it, lr=1e-5, step_loss=0.0079]
Steps: 2%|▏ | 17267/1000000 [10:51:28<2003:08:26, 7.34s/it, lr=1e-5, step_loss=0.0079][RANK-0]: Step: [17267], local_loss=0.00684161065146327, train_loss=0.04306165128946304, time_cost=5.701264142990112
+
Steps: 2%|▏ | 17267/1000000 [10:51:28<2003:08:26, 7.34s/it, lr=1e-5, step_loss=0.00684]
Steps: 2%|▏ | 17268/1000000 [10:51:41<2493:03:56, 9.13s/it, lr=1e-5, step_loss=0.00684][RANK-0]: Step: [17268], local_loss=0.10247280448675156, train_loss=0.021208882331848145, time_cost=1.2892513275146484
+
Steps: 2%|▏ | 17268/1000000 [10:51:41<2493:03:56, 9.13s/it, lr=1e-5, step_loss=0.102]
Steps: 2%|▏ | 17269/1000000 [10:51:47<2181:34:35, 7.99s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [17269], local_loss=0.016197672113776207, train_loss=0.02061314508318901, time_cost=2.273017168045044
+
Steps: 2%|▏ | 17269/1000000 [10:51:47<2181:34:35, 7.99s/it, lr=1e-5, step_loss=0.0162]
Steps: 2%|▏ | 17270/1000000 [10:51:57<2398:16:05, 8.79s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [17270], local_loss=0.01860019750893116, train_loss=8.868375778198242, time_cost=7.900351524353027
+
Steps: 2%|▏ | 17270/1000000 [10:51:57<2398:16:05, 8.79s/it, lr=1e-5, step_loss=0.0186]
Steps: 2%|▏ | 17271/1000000 [10:52:09<2650:33:23, 9.71s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [17271], local_loss=0.01959860883653164, train_loss=0.04990151524543762, time_cost=5.688867807388306
+
Steps: 2%|▏ | 17271/1000000 [10:52:09<2650:33:23, 9.71s/it, lr=1e-5, step_loss=0.0196]
Steps: 2%|▏ | 17272/1000000 [10:52:14<2267:19:21, 8.31s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [17272], local_loss=0.02803031913936138, train_loss=0.07601945102214813, time_cost=2.1411640644073486
+
Steps: 2%|▏ | 17272/1000000 [10:52:14<2267:19:21, 8.31s/it, lr=1e-5, step_loss=0.028]
Steps: 2%|▏ | 17273/1000000 [10:52:25<2442:33:54, 8.95s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [17273], local_loss=0.007786482572555542, train_loss=0.01705111935734749, time_cost=1.9848127365112305
+
Steps: 2%|▏ | 17273/1000000 [10:52:25<2442:33:54, 8.95s/it, lr=1e-5, step_loss=0.00779]
Steps: 2%|▏ | 17274/1000000 [10:52:35<2583:03:40, 9.46s/it, lr=1e-5, step_loss=0.00779][RANK-0]: Step: [17274], local_loss=0.040537748485803604, train_loss=0.01567113772034645, time_cost=3.390974521636963
+
Steps: 2%|▏ | 17274/1000000 [10:52:35<2583:03:40, 9.46s/it, lr=1e-5, step_loss=0.0405]
Steps: 2%|▏ | 17275/1000000 [10:52:48<2867:51:52, 10.51s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [17275], local_loss=0.05334533005952835, train_loss=0.030306510627269745, time_cost=4.715341567993164
+
Steps: 2%|▏ | 17275/1000000 [10:52:48<2867:51:52, 10.51s/it, lr=1e-5, step_loss=0.0533]
Steps: 2%|▏ | 17276/1000000 [10:52:59<2857:47:49, 10.47s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [17276], local_loss=0.0106071000918746, train_loss=0.023505104705691338, time_cost=1.2310519218444824
+
Steps: 2%|▏ | 17276/1000000 [10:52:59<2857:47:49, 10.47s/it, lr=1e-5, step_loss=0.0106]
Steps: 2%|▏ | 17277/1000000 [10:53:10<2936:52:58, 10.76s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [17277], local_loss=0.7758089303970337, train_loss=0.1308344602584839, time_cost=1.2623679637908936
+
Steps: 2%|▏ | 17277/1000000 [10:53:10<2936:52:58, 10.76s/it, lr=1e-5, step_loss=0.776]
Steps: 2%|▏ | 17278/1000000 [10:53:26<3343:54:09, 12.25s/it, lr=1e-5, step_loss=0.776][RANK-0]: Step: [17278], local_loss=0.05438621714711189, train_loss=0.030798714607954025, time_cost=7.373033046722412
+
Steps: 2%|▏ | 17278/1000000 [10:53:26<3343:54:09, 12.25s/it, lr=1e-5, step_loss=0.0544]
Steps: 2%|▏ | 17279/1000000 [10:53:33<2906:14:14, 10.65s/it, lr=1e-5, step_loss=0.0544][RANK-0]: Step: [17279], local_loss=0.017707932740449905, train_loss=0.021304896101355553, time_cost=3.313566207885742
+
Steps: 2%|▏ | 17279/1000000 [10:53:33<2906:14:14, 10.65s/it, lr=1e-5, step_loss=0.0177]
Steps: 2%|▏ | 17280/1000000 [10:53:43<2862:36:01, 10.49s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [17280], local_loss=0.006715518422424793, train_loss=0.08822204172611237, time_cost=2.3687500953674316
+
Steps: 2%|▏ | 17280/1000000 [10:53:43<2862:36:01, 10.49s/it, lr=1e-5, step_loss=0.00672]
Steps: 2%|▏ | 17281/1000000 [10:53:50<2625:05:57, 9.62s/it, lr=1e-5, step_loss=0.00672][RANK-0]: Step: [17281], local_loss=0.01808828115463257, train_loss=0.028107672929763794, time_cost=2.1581530570983887
+
Steps: 2%|▏ | 17281/1000000 [10:53:50<2625:05:57, 9.62s/it, lr=1e-5, step_loss=0.0181]
Steps: 2%|▏ | 17282/1000000 [10:54:04<2960:11:52, 10.84s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [17282], local_loss=0.010459034703671932, train_loss=0.02622830681502819, time_cost=5.166303396224976
+
Steps: 2%|▏ | 17282/1000000 [10:54:04<2960:11:52, 10.84s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 17283/1000000 [10:54:11<2644:04:29, 9.69s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [17283], local_loss=0.00935191847383976, train_loss=0.06505617499351501, time_cost=2.7683424949645996
+
Steps: 2%|▏ | 17283/1000000 [10:54:11<2644:04:29, 9.69s/it, lr=1e-5, step_loss=0.00935]
Steps: 2%|▏ | 17284/1000000 [10:54:16<2263:37:53, 8.29s/it, lr=1e-5, step_loss=0.00935][RANK-0]: Step: [17284], local_loss=0.01312632579356432, train_loss=0.03960629552602768, time_cost=2.140169858932495
+
Steps: 2%|▏ | 17284/1000000 [10:54:16<2263:37:53, 8.29s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 17285/1000000 [10:54:32<2856:16:40, 10.46s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [17285], local_loss=0.06365285813808441, train_loss=0.051582809537649155, time_cost=3.74609637260437
+
Steps: 2%|▏ | 17285/1000000 [10:54:32<2856:16:40, 10.46s/it, lr=1e-5, step_loss=0.0637]
Steps: 2%|▏ | 17286/1000000 [10:54:38<2550:15:29, 9.34s/it, lr=1e-5, step_loss=0.0637][RANK-0]: Step: [17286], local_loss=0.010205688886344433, train_loss=0.03852035850286484, time_cost=2.9600071907043457
+
Steps: 2%|▏ | 17286/1000000 [10:54:38<2550:15:29, 9.34s/it, lr=1e-5, step_loss=0.0102]
Steps: 2%|▏ | 17287/1000000 [10:54:52<2884:38:42, 10.57s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [17287], local_loss=0.011506144888699055, train_loss=0.016250813379883766, time_cost=1.9080865383148193
+
Steps: 2%|▏ | 17287/1000000 [10:54:52<2884:38:42, 10.57s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 17288/1000000 [10:55:03<2918:06:08, 10.69s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [17288], local_loss=0.04796355590224266, train_loss=0.14903442561626434, time_cost=3.0525851249694824
+
Steps: 2%|▏ | 17288/1000000 [10:55:03<2918:06:08, 10.69s/it, lr=1e-5, step_loss=0.048]
Steps: 2%|▏ | 17289/1000000 [10:55:16<3095:50:33, 11.34s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [17289], local_loss=0.0560726523399353, train_loss=0.03540930151939392, time_cost=4.388364315032959
+
Steps: 2%|▏ | 17289/1000000 [10:55:16<3095:50:33, 11.34s/it, lr=1e-5, step_loss=0.0561]
Steps: 2%|▏ | 17290/1000000 [10:55:27<3067:59:09, 11.24s/it, lr=1e-5, step_loss=0.0561][RANK-0]: Step: [17290], local_loss=0.01983060874044895, train_loss=0.039163827896118164, time_cost=5.295308589935303
+
Steps: 2%|▏ | 17290/1000000 [10:55:27<3067:59:09, 11.24s/it, lr=1e-5, step_loss=0.0198]
Steps: 2%|▏ | 17291/1000000 [10:55:34<2772:12:29, 10.16s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [17291], local_loss=0.020541805773973465, train_loss=0.011915445327758789, time_cost=1.5221202373504639
+
Steps: 2%|▏ | 17291/1000000 [10:55:34<2772:12:29, 10.16s/it, lr=1e-5, step_loss=0.0205]
Steps: 2%|▏ | 17292/1000000 [10:55:40<2444:44:29, 8.96s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [17292], local_loss=0.00898787286132574, train_loss=0.18466198444366455, time_cost=2.0541129112243652
+
Steps: 2%|▏ | 17292/1000000 [10:55:40<2444:44:29, 8.96s/it, lr=1e-5, step_loss=0.00899]
Steps: 2%|▏ | 17293/1000000 [10:55:56<2969:21:31, 10.88s/it, lr=1e-5, step_loss=0.00899][RANK-0]: Step: [17293], local_loss=0.009603308513760567, train_loss=0.17065873742103577, time_cost=7.800736427307129
+
Steps: 2%|▏ | 17293/1000000 [10:55:56<2969:21:31, 10.88s/it, lr=1e-5, step_loss=0.0096]
Steps: 2%|▏ | 17294/1000000 [10:56:02<2544:42:46, 9.32s/it, lr=1e-5, step_loss=0.0096][RANK-0]: Step: [17294], local_loss=0.025173276662826538, train_loss=0.1414976716041565, time_cost=2.939785957336426
+
Steps: 2%|▏ | 17294/1000000 [10:56:02<2544:42:46, 9.32s/it, lr=1e-5, step_loss=0.0252]
Steps: 2%|▏ | 17295/1000000 [10:56:13<2712:37:11, 9.94s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [17295], local_loss=0.016827359795570374, train_loss=0.03881393373012543, time_cost=4.864094257354736
+
Steps: 2%|▏ | 17295/1000000 [10:56:13<2712:37:11, 9.94s/it, lr=1e-5, step_loss=0.0168]
Steps: 2%|▏ | 17296/1000000 [10:56:24<2796:49:32, 10.25s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [17296], local_loss=0.1319464147090912, train_loss=0.06939397007226944, time_cost=3.2091546058654785
+
Steps: 2%|▏ | 17296/1000000 [10:56:24<2796:49:32, 10.25s/it, lr=1e-5, step_loss=0.132]
Steps: 2%|▏ | 17297/1000000 [10:56:39<3182:26:39, 11.66s/it, lr=1e-5, step_loss=0.132][RANK-0]: Step: [17297], local_loss=0.024842508137226105, train_loss=0.05036531388759613, time_cost=5.334615230560303
+
Steps: 2%|▏ | 17297/1000000 [10:56:39<3182:26:39, 11.66s/it, lr=1e-5, step_loss=0.0248]
Steps: 2%|▏ | 17298/1000000 [10:56:46<2816:29:25, 10.32s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [17298], local_loss=0.024490617215633392, train_loss=0.02159389853477478, time_cost=2.8652403354644775
+
Steps: 2%|▏ | 17298/1000000 [10:56:46<2816:29:25, 10.32s/it, lr=1e-5, step_loss=0.0245]
Steps: 2%|▏ | 17299/1000000 [10:56:55<2732:18:21, 10.01s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [17299], local_loss=0.00963553972542286, train_loss=0.02783949300646782, time_cost=1.4584360122680664
+
Steps: 2%|▏ | 17299/1000000 [10:56:55<2732:18:21, 10.01s/it, lr=1e-5, step_loss=0.00964]
Steps: 2%|▏ | 17300/1000000 [10:57:06<2785:53:06, 10.21s/it, lr=1e-5, step_loss=0.00964][RANK-0]: Step: [17300], local_loss=0.05467582866549492, train_loss=0.038537465035915375, time_cost=1.3101730346679688
+
Steps: 2%|▏ | 17300/1000000 [10:57:06<2785:53:06, 10.21s/it, lr=1e-5, step_loss=0.0547]
Steps: 2%|▏ | 17301/1000000 [10:57:15<2694:59:15, 9.87s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [17301], local_loss=0.005576318129897118, train_loss=0.020967336371541023, time_cost=7.206344127655029
+
Steps: 2%|▏ | 17301/1000000 [10:57:15<2694:59:15, 9.87s/it, lr=1e-5, step_loss=0.00558]
Steps: 2%|▏ | 17302/1000000 [10:57:19<2249:00:48, 8.24s/it, lr=1e-5, step_loss=0.00558][RANK-0]: Step: [17302], local_loss=0.01145640853792429, train_loss=6.217714786529541, time_cost=1.7741584777832031
+
Steps: 2%|▏ | 17302/1000000 [10:57:19<2249:00:48, 8.24s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 17303/1000000 [10:57:30<2413:05:44, 8.84s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [17303], local_loss=0.09363914281129837, train_loss=0.04127102345228195, time_cost=1.2173900604248047
+
Steps: 2%|▏ | 17303/1000000 [10:57:30<2413:05:44, 8.84s/it, lr=1e-5, step_loss=0.0936]
Steps: 2%|▏ | 17304/1000000 [10:57:39<2464:19:07, 9.03s/it, lr=1e-5, step_loss=0.0936][RANK-0]: Step: [17304], local_loss=0.008316216990351677, train_loss=0.03236885741353035, time_cost=7.681309461593628
+
Steps: 2%|▏ | 17304/1000000 [10:57:39<2464:19:07, 9.03s/it, lr=1e-5, step_loss=0.00832]
Steps: 2%|▏ | 17305/1000000 [10:57:53<2875:27:26, 10.53s/it, lr=1e-5, step_loss=0.00832][RANK-0]: Step: [17305], local_loss=0.035548266023397446, train_loss=0.03630628064274788, time_cost=5.257446050643921
+
Steps: 2%|▏ | 17305/1000000 [10:57:53<2875:27:26, 10.53s/it, lr=1e-5, step_loss=0.0355]
Steps: 2%|▏ | 17306/1000000 [10:57:59<2512:29:57, 9.20s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [17306], local_loss=0.009482862427830696, train_loss=0.04131466895341873, time_cost=1.503211259841919
+
Steps: 2%|▏ | 17306/1000000 [10:57:59<2512:29:57, 9.20s/it, lr=1e-5, step_loss=0.00948]
Steps: 2%|▏ | 17307/1000000 [10:58:10<2594:50:18, 9.51s/it, lr=1e-5, step_loss=0.00948][RANK-0]: Step: [17307], local_loss=0.007802105508744717, train_loss=0.029243700206279755, time_cost=1.869194507598877
+
Steps: 2%|▏ | 17307/1000000 [10:58:10<2594:50:18, 9.51s/it, lr=1e-5, step_loss=0.0078]
Steps: 2%|▏ | 17308/1000000 [10:58:24<2970:49:58, 10.88s/it, lr=1e-5, step_loss=0.0078][RANK-0]: Step: [17308], local_loss=0.024975048378109932, train_loss=0.052263982594013214, time_cost=8.276031970977783
+
Steps: 2%|▏ | 17308/1000000 [10:58:24<2970:49:58, 10.88s/it, lr=1e-5, step_loss=0.025]
Steps: 2%|▏ | 17309/1000000 [10:58:36<3118:57:46, 11.43s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [17309], local_loss=0.006908544804900885, train_loss=0.059640951454639435, time_cost=5.866860389709473
+
Steps: 2%|▏ | 17309/1000000 [10:58:36<3118:57:46, 11.43s/it, lr=1e-5, step_loss=0.00691]
Steps: 2%|▏ | 17310/1000000 [10:58:50<3331:11:58, 12.20s/it, lr=1e-5, step_loss=0.00691][RANK-0]: Step: [17310], local_loss=0.06705830246210098, train_loss=0.17194116115570068, time_cost=4.658435821533203
+
Steps: 2%|▏ | 17310/1000000 [10:58:50<3331:11:58, 12.20s/it, lr=1e-5, step_loss=0.0671]
Steps: 2%|▏ | 17311/1000000 [10:58:58<2928:37:26, 10.73s/it, lr=1e-5, step_loss=0.0671][RANK-0]: Step: [17311], local_loss=0.010250001214444637, train_loss=0.022230587899684906, time_cost=2.994203567504883
+
Steps: 2%|▏ | 17311/1000000 [10:58:58<2928:37:26, 10.73s/it, lr=1e-5, step_loss=0.0103]
Steps: 2%|▏ | 17312/1000000 [10:59:05<2631:02:55, 9.64s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [17312], local_loss=0.007061176933348179, train_loss=0.07968113571405411, time_cost=1.4298737049102783
+
Steps: 2%|▏ | 17312/1000000 [10:59:05<2631:02:55, 9.64s/it, lr=1e-5, step_loss=0.00706]
Steps: 2%|▏ | 17313/1000000 [10:59:16<2755:24:09, 10.09s/it, lr=1e-5, step_loss=0.00706][RANK-0]: Step: [17313], local_loss=0.05172533169388771, train_loss=0.07447436451911926, time_cost=2.10699725151062
+
Steps: 2%|▏ | 17313/1000000 [10:59:16<2755:24:09, 10.09s/it, lr=1e-5, step_loss=0.0517]
Steps: 2%|▏ | 17314/1000000 [10:59:30<3114:22:29, 11.41s/it, lr=1e-5, step_loss=0.0517][RANK-0]: Step: [17314], local_loss=0.014946339651942253, train_loss=0.031088463962078094, time_cost=2.7474348545074463
+
Steps: 2%|▏ | 17314/1000000 [10:59:30<3114:22:29, 11.41s/it, lr=1e-5, step_loss=0.0149]
Steps: 2%|▏ | 17315/1000000 [10:59:41<3087:50:44, 11.31s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [17315], local_loss=0.010495657101273537, train_loss=0.01848091557621956, time_cost=1.8926072120666504
+
Steps: 2%|▏ | 17315/1000000 [10:59:41<3087:50:44, 11.31s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 17316/1000000 [10:59:52<3009:39:00, 11.03s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [17316], local_loss=0.009595717303454876, train_loss=0.0282377228140831, time_cost=1.216360092163086
+
Steps: 2%|▏ | 17316/1000000 [10:59:52<3009:39:00, 11.03s/it, lr=1e-5, step_loss=0.0096]
Steps: 2%|▏ | 17317/1000000 [11:00:03<3031:34:34, 11.11s/it, lr=1e-5, step_loss=0.0096][RANK-0]: Step: [17317], local_loss=0.05293256416916847, train_loss=0.042864859104156494, time_cost=3.863431692123413
+
Steps: 2%|▏ | 17317/1000000 [11:00:03<3031:34:34, 11.11s/it, lr=1e-5, step_loss=0.0529]
Steps: 2%|▏ | 17318/1000000 [11:00:15<3060:41:46, 11.21s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [17318], local_loss=0.006442921236157417, train_loss=0.02292047068476677, time_cost=2.4061851501464844
+
Steps: 2%|▏ | 17318/1000000 [11:00:15<3060:41:46, 11.21s/it, lr=1e-5, step_loss=0.00644]
Steps: 2%|▏ | 17319/1000000 [11:00:24<2908:30:00, 10.66s/it, lr=1e-5, step_loss=0.00644][RANK-0]: Step: [17319], local_loss=0.0317673459649086, train_loss=0.02756493166089058, time_cost=2.555394411087036
+
Steps: 2%|▏ | 17319/1000000 [11:00:24<2908:30:00, 10.66s/it, lr=1e-5, step_loss=0.0318]
Steps: 2%|▏ | 17320/1000000 [11:00:33<2783:16:19, 10.20s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [17320], local_loss=0.044653359800577164, train_loss=0.02252158522605896, time_cost=6.811448097229004
+
Steps: 2%|▏ | 17320/1000000 [11:00:33<2783:16:19, 10.20s/it, lr=1e-5, step_loss=0.0447]
Steps: 2%|▏ | 17321/1000000 [11:00:45<2890:10:48, 10.59s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [17321], local_loss=0.05387319251894951, train_loss=0.1739889532327652, time_cost=4.034418106079102
+
Steps: 2%|▏ | 17321/1000000 [11:00:45<2890:10:48, 10.59s/it, lr=1e-5, step_loss=0.0539]
Steps: 2%|▏ | 17322/1000000 [11:00:55<2920:50:40, 10.70s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [17322], local_loss=0.010534994304180145, train_loss=0.029857492074370384, time_cost=1.8609435558319092
+
Steps: 2%|▏ | 17322/1000000 [11:00:55<2920:50:40, 10.70s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 17323/1000000 [11:01:02<2606:28:50, 9.55s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [17323], local_loss=0.014060728251934052, train_loss=0.1428953856229782, time_cost=5.209434747695923
+
Steps: 2%|▏ | 17323/1000000 [11:01:02<2606:28:50, 9.55s/it, lr=1e-5, step_loss=0.0141]
Steps: 2%|▏ | 17324/1000000 [11:01:13<2698:57:22, 9.89s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [17324], local_loss=0.02470851130783558, train_loss=0.028311386704444885, time_cost=4.334564685821533
+
Steps: 2%|▏ | 17324/1000000 [11:01:13<2698:57:22, 9.89s/it, lr=1e-5, step_loss=0.0247]
Steps: 2%|▏ | 17325/1000000 [11:01:27<3041:46:27, 11.14s/it, lr=1e-5, step_loss=0.0247][RANK-0]: Step: [17325], local_loss=0.07498618960380554, train_loss=0.016550280153751373, time_cost=6.707385778427124
+
Steps: 2%|▏ | 17325/1000000 [11:01:27<3041:46:27, 11.14s/it, lr=1e-5, step_loss=0.075]
Steps: 2%|▏ | 17326/1000000 [11:01:33<2605:36:34, 9.55s/it, lr=1e-5, step_loss=0.075][RANK-0]: Step: [17326], local_loss=0.006138961296528578, train_loss=0.027835801243782043, time_cost=1.4080052375793457
+
Steps: 2%|▏ | 17326/1000000 [11:01:33<2605:36:34, 9.55s/it, lr=1e-5, step_loss=0.00614]
Steps: 2%|▏ | 17327/1000000 [11:01:41<2465:48:24, 9.03s/it, lr=1e-5, step_loss=0.00614][RANK-0]: Step: [17327], local_loss=0.007857950404286385, train_loss=0.04412205144762993, time_cost=6.822038173675537
+
Steps: 2%|▏ | 17327/1000000 [11:01:41<2465:48:24, 9.03s/it, lr=1e-5, step_loss=0.00786]
Steps: 2%|▏ | 17328/1000000 [11:01:48<2321:28:28, 8.50s/it, lr=1e-5, step_loss=0.00786][RANK-0]: Step: [17328], local_loss=0.03486942499876022, train_loss=0.08669055998325348, time_cost=1.2610015869140625
+
Steps: 2%|▏ | 17328/1000000 [11:01:48<2321:28:28, 8.50s/it, lr=1e-5, step_loss=0.0349]
Steps: 2%|▏ | 17329/1000000 [11:01:55<2235:51:52, 8.19s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [17329], local_loss=0.21306408941745758, train_loss=0.03911276161670685, time_cost=3.622577667236328
+
Steps: 2%|▏ | 17329/1000000 [11:01:55<2235:51:52, 8.19s/it, lr=1e-5, step_loss=0.213]
Steps: 2%|▏ | 17330/1000000 [11:02:03<2141:02:07, 7.84s/it, lr=1e-5, step_loss=0.213][RANK-0]: Step: [17330], local_loss=0.005268592853099108, train_loss=0.035892996937036514, time_cost=2.6952898502349854
+
Steps: 2%|▏ | 17330/1000000 [11:02:03<2141:02:07, 7.84s/it, lr=1e-5, step_loss=0.00527]
Steps: 2%|▏ | 17331/1000000 [11:02:18<2800:45:30, 10.26s/it, lr=1e-5, step_loss=0.00527][RANK-0]: Step: [17331], local_loss=138.58412170410156, train_loss=17.377016067504883, time_cost=13.022039413452148
+
Steps: 2%|▏ | 17331/1000000 [11:02:18<2800:45:30, 10.26s/it, lr=1e-5, step_loss=139]
Steps: 2%|▏ | 17332/1000000 [11:02:29<2805:11:48, 10.28s/it, lr=1e-5, step_loss=139][RANK-0]: Step: [17332], local_loss=0.013269511982798576, train_loss=0.13594062626361847, time_cost=3.122128963470459
+
Steps: 2%|▏ | 17332/1000000 [11:02:29<2805:11:48, 10.28s/it, lr=1e-5, step_loss=0.0133]
Steps: 2%|▏ | 17333/1000000 [11:02:34<2385:21:19, 8.74s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [17333], local_loss=0.007510068826377392, train_loss=0.05513594672083855, time_cost=1.601057529449463
+
Steps: 2%|▏ | 17333/1000000 [11:02:34<2385:21:19, 8.74s/it, lr=1e-5, step_loss=0.00751]
Steps: 2%|▏ | 17334/1000000 [11:02:41<2250:41:58, 8.25s/it, lr=1e-5, step_loss=0.00751][RANK-0]: Step: [17334], local_loss=0.008056624792516232, train_loss=0.04729112982749939, time_cost=1.2311615943908691
+
Steps: 2%|▏ | 17334/1000000 [11:02:41<2250:41:58, 8.25s/it, lr=1e-5, step_loss=0.00806]
Steps: 2%|▏ | 17335/1000000 [11:02:46<2024:38:34, 7.42s/it, lr=1e-5, step_loss=0.00806][RANK-0]: Step: [17335], local_loss=0.04571472108364105, train_loss=0.04245381057262421, time_cost=2.410261631011963
+
Steps: 2%|▏ | 17335/1000000 [11:02:46<2024:38:34, 7.42s/it, lr=1e-5, step_loss=0.0457]
Steps: 2%|▏ | 17336/1000000 [11:02:59<2456:54:31, 9.00s/it, lr=1e-5, step_loss=0.0457][RANK-0]: Step: [17336], local_loss=0.054984334856271744, train_loss=0.027400821447372437, time_cost=3.449490547180176
+
Steps: 2%|▏ | 17336/1000000 [11:02:59<2456:54:31, 9.00s/it, lr=1e-5, step_loss=0.055]
Steps: 2%|▏ | 17337/1000000 [11:03:14<2960:06:42, 10.84s/it, lr=1e-5, step_loss=0.055][RANK-0]: Step: [17337], local_loss=0.023496858775615692, train_loss=0.030367769300937653, time_cost=12.461142778396606
+
Steps: 2%|▏ | 17337/1000000 [11:03:14<2960:06:42, 10.84s/it, lr=1e-5, step_loss=0.0235]
Steps: 2%|▏ | 17338/1000000 [11:03:22<2682:02:06, 9.83s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [17338], local_loss=0.045988649129867554, train_loss=0.055033303797245026, time_cost=2.343726873397827
+
Steps: 2%|▏ | 17338/1000000 [11:03:22<2682:02:06, 9.83s/it, lr=1e-5, step_loss=0.046]
Steps: 2%|▏ | 17339/1000000 [11:03:38<3187:00:42, 11.68s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [17339], local_loss=0.04638556018471718, train_loss=0.036352191120386124, time_cost=7.715432405471802
+
Steps: 2%|▏ | 17339/1000000 [11:03:38<3187:00:42, 11.68s/it, lr=1e-5, step_loss=0.0464]
Steps: 2%|▏ | 17340/1000000 [11:03:51<3292:10:52, 12.06s/it, lr=1e-5, step_loss=0.0464][RANK-0]: Step: [17340], local_loss=0.009767716750502586, train_loss=0.016933759674429893, time_cost=4.973973035812378
+
Steps: 2%|▏ | 17340/1000000 [11:03:51<3292:10:52, 12.06s/it, lr=1e-5, step_loss=0.00977]
Steps: 2%|▏ | 17341/1000000 [11:04:05<3477:20:39, 12.74s/it, lr=1e-5, step_loss=0.00977][RANK-0]: Step: [17341], local_loss=0.006014786660671234, train_loss=0.04104229062795639, time_cost=6.986429452896118
+
Steps: 2%|▏ | 17341/1000000 [11:04:05<3477:20:39, 12.74s/it, lr=1e-5, step_loss=0.00601]
Steps: 2%|▏ | 17342/1000000 [11:04:12<3034:28:34, 11.12s/it, lr=1e-5, step_loss=0.00601][RANK-0]: Step: [17342], local_loss=0.005389883648604155, train_loss=0.1734190136194229, time_cost=3.6445388793945312
+
Steps: 2%|▏ | 17342/1000000 [11:04:12<3034:28:34, 11.12s/it, lr=1e-5, step_loss=0.00539]
Steps: 2%|▏ | 17343/1000000 [11:04:18<2567:47:46, 9.41s/it, lr=1e-5, step_loss=0.00539][RANK-0]: Step: [17343], local_loss=0.0069984933361411095, train_loss=0.026202648878097534, time_cost=2.3446638584136963
+
Steps: 2%|▏ | 17343/1000000 [11:04:18<2567:47:46, 9.41s/it, lr=1e-5, step_loss=0.007]
Steps: 2%|▏ | 17344/1000000 [11:04:23<2207:00:45, 8.09s/it, lr=1e-5, step_loss=0.007][RANK-0]: Step: [17344], local_loss=0.01868753880262375, train_loss=15.121793746948242, time_cost=3.8308818340301514
+
Steps: 2%|▏ | 17344/1000000 [11:04:23<2207:00:45, 8.09s/it, lr=1e-5, step_loss=0.0187]
Steps: 2%|▏ | 17345/1000000 [11:04:28<1950:36:02, 7.15s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [17345], local_loss=0.025475408881902695, train_loss=0.029368901625275612, time_cost=1.9898271560668945
+
Steps: 2%|▏ | 17345/1000000 [11:04:28<1950:36:02, 7.15s/it, lr=1e-5, step_loss=0.0255]
Steps: 2%|▏ | 17346/1000000 [11:04:38<2227:16:09, 8.16s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [17346], local_loss=0.08680284768342972, train_loss=0.021109091117978096, time_cost=1.8450992107391357
+
Steps: 2%|▏ | 17346/1000000 [11:04:38<2227:16:09, 8.16s/it, lr=1e-5, step_loss=0.0868]
Steps: 2%|▏ | 17347/1000000 [11:04:53<2728:22:54, 10.00s/it, lr=1e-5, step_loss=0.0868][RANK-0]: Step: [17347], local_loss=0.037513889372348785, train_loss=0.014213372021913528, time_cost=3.34504771232605
+
Steps: 2%|▏ | 17347/1000000 [11:04:53<2728:22:54, 10.00s/it, lr=1e-5, step_loss=0.0375]
Steps: 2%|▏ | 17348/1000000 [11:04:57<2303:12:59, 8.44s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [17348], local_loss=0.005239053629338741, train_loss=0.030042579397559166, time_cost=1.2153148651123047
+
Steps: 2%|▏ | 17348/1000000 [11:04:57<2303:12:59, 8.44s/it, lr=1e-5, step_loss=0.00524]
Steps: 2%|▏ | 17349/1000000 [11:05:05<2276:25:07, 8.34s/it, lr=1e-5, step_loss=0.00524][RANK-0]: Step: [17349], local_loss=1.0019217729568481, train_loss=0.27256080508232117, time_cost=3.1115715503692627
+
Steps: 2%|▏ | 17349/1000000 [11:05:05<2276:25:07, 8.34s/it, lr=1e-5, step_loss=1]
Steps: 2%|▏ | 17350/1000000 [11:05:20<2748:03:10, 10.07s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [17350], local_loss=0.011172044090926647, train_loss=0.025658030062913895, time_cost=6.427321910858154
+
Steps: 2%|▏ | 17350/1000000 [11:05:20<2748:03:10, 10.07s/it, lr=1e-5, step_loss=0.0112]
Steps: 2%|▏ | 17351/1000000 [11:05:24<2269:53:25, 8.32s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [17351], local_loss=0.006149829365313053, train_loss=0.043714843690395355, time_cost=1.732923984527588
+
Steps: 2%|▏ | 17351/1000000 [11:05:24<2269:53:25, 8.32s/it, lr=1e-5, step_loss=0.00615]
Steps: 2%|▏ | 17352/1000000 [11:05:35<2477:14:01, 9.08s/it, lr=1e-5, step_loss=0.00615][RANK-0]: Step: [17352], local_loss=0.02816462703049183, train_loss=0.12823371589183807, time_cost=2.969561815261841
+
Steps: 2%|▏ | 17352/1000000 [11:05:35<2477:14:01, 9.08s/it, lr=1e-5, step_loss=0.0282]
Steps: 2%|▏ | 17353/1000000 [11:05:40<2138:33:00, 7.83s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [17353], local_loss=0.035471271723508835, train_loss=0.04580741003155708, time_cost=1.333132266998291
+
Steps: 2%|▏ | 17353/1000000 [11:05:40<2138:33:00, 7.83s/it, lr=1e-5, step_loss=0.0355]
Steps: 2%|▏ | 17354/1000000 [11:05:53<2630:03:37, 9.64s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [17354], local_loss=0.006503335200250149, train_loss=0.05597051978111267, time_cost=1.288498878479004
+
Steps: 2%|▏ | 17354/1000000 [11:05:53<2630:03:37, 9.64s/it, lr=1e-5, step_loss=0.0065]
Steps: 2%|▏ | 17355/1000000 [11:06:05<2757:12:31, 10.10s/it, lr=1e-5, step_loss=0.0065][RANK-0]: Step: [17355], local_loss=0.021213123574852943, train_loss=0.025600306689739227, time_cost=2.763679265975952
+
Steps: 2%|▏ | 17355/1000000 [11:06:05<2757:12:31, 10.10s/it, lr=1e-5, step_loss=0.0212]
Steps: 2%|▏ | 17356/1000000 [11:06:11<2484:16:37, 9.10s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [17356], local_loss=0.009343559853732586, train_loss=0.029498878866434097, time_cost=2.089801549911499
+
Steps: 2%|▏ | 17356/1000000 [11:06:11<2484:16:37, 9.10s/it, lr=1e-5, step_loss=0.00934]
Steps: 2%|▏ | 17357/1000000 [11:06:27<2982:44:58, 10.93s/it, lr=1e-5, step_loss=0.00934][RANK-0]: Step: [17357], local_loss=0.018014388158917427, train_loss=0.06562113761901855, time_cost=6.996169805526733
+
Steps: 2%|▏ | 17357/1000000 [11:06:27<2982:44:58, 10.93s/it, lr=1e-5, step_loss=0.018]
Steps: 2%|▏ | 17358/1000000 [11:06:34<2690:45:58, 9.86s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [17358], local_loss=0.04173225536942482, train_loss=0.021927548572421074, time_cost=1.6952362060546875
+
Steps: 2%|▏ | 17358/1000000 [11:06:34<2690:45:58, 9.86s/it, lr=1e-5, step_loss=0.0417]
Steps: 2%|▏ | 17359/1000000 [11:06:48<2996:54:34, 10.98s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [17359], local_loss=0.04533549025654793, train_loss=0.05369056761264801, time_cost=4.358886480331421
+
Steps: 2%|▏ | 17359/1000000 [11:06:48<2996:54:34, 10.98s/it, lr=1e-5, step_loss=0.0453]
Steps: 2%|▏ | 17360/1000000 [11:06:57<2840:15:30, 10.41s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [17360], local_loss=0.0065612695179879665, train_loss=0.02164066582918167, time_cost=2.7628390789031982
+
Steps: 2%|▏ | 17360/1000000 [11:06:57<2840:15:30, 10.41s/it, lr=1e-5, step_loss=0.00656]
Steps: 2%|▏ | 17361/1000000 [11:07:02<2424:35:43, 8.88s/it, lr=1e-5, step_loss=0.00656][RANK-0]: Step: [17361], local_loss=0.010813294909894466, train_loss=0.03362574428319931, time_cost=2.9642696380615234
+
Steps: 2%|▏ | 17361/1000000 [11:07:02<2424:35:43, 8.88s/it, lr=1e-5, step_loss=0.0108]
Steps: 2%|▏ | 17362/1000000 [11:07:15<2754:07:32, 10.09s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [17362], local_loss=0.014731692150235176, train_loss=0.03992946445941925, time_cost=5.45080041885376
+
Steps: 2%|▏ | 17362/1000000 [11:07:15<2754:07:32, 10.09s/it, lr=1e-5, step_loss=0.0147]
Steps: 2%|▏ | 17363/1000000 [11:07:25<2781:35:00, 10.19s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [17363], local_loss=0.005899642128497362, train_loss=0.020667152479290962, time_cost=3.0522470474243164
+
Steps: 2%|▏ | 17363/1000000 [11:07:25<2781:35:00, 10.19s/it, lr=1e-5, step_loss=0.0059]
Steps: 2%|▏ | 17364/1000000 [11:07:35<2785:56:50, 10.21s/it, lr=1e-5, step_loss=0.0059][RANK-0]: Step: [17364], local_loss=0.012455666437745094, train_loss=0.07523640245199203, time_cost=5.036905288696289
+
Steps: 2%|▏ | 17364/1000000 [11:07:35<2785:56:50, 10.21s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 17365/1000000 [11:07:40<2356:14:31, 8.63s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [17365], local_loss=0.007641876116394997, train_loss=0.08091753721237183, time_cost=2.046428680419922
+
Steps: 2%|▏ | 17365/1000000 [11:07:40<2356:14:31, 8.63s/it, lr=1e-5, step_loss=0.00764]
Steps: 2%|▏ | 17366/1000000 [11:07:55<2854:40:30, 10.46s/it, lr=1e-5, step_loss=0.00764][RANK-0]: Step: [17366], local_loss=0.06410139799118042, train_loss=0.04810480773448944, time_cost=5.27775239944458
+
Steps: 2%|▏ | 17366/1000000 [11:07:55<2854:40:30, 10.46s/it, lr=1e-5, step_loss=0.0641]
Steps: 2%|▏ | 17367/1000000 [11:08:03<2621:18:58, 9.60s/it, lr=1e-5, step_loss=0.0641][RANK-0]: Step: [17367], local_loss=0.008694487623870373, train_loss=0.07350838929414749, time_cost=3.049382209777832
+
Steps: 2%|▏ | 17367/1000000 [11:08:03<2621:18:58, 9.60s/it, lr=1e-5, step_loss=0.00869]
Steps: 2%|▏ | 17368/1000000 [11:08:14<2774:00:16, 10.16s/it, lr=1e-5, step_loss=0.00869][RANK-0]: Step: [17368], local_loss=0.012096554972231388, train_loss=0.06864065676927567, time_cost=1.6176698207855225
+
Steps: 2%|▏ | 17368/1000000 [11:08:14<2774:00:16, 10.16s/it, lr=1e-5, step_loss=0.0121]
Steps: 2%|▏ | 17369/1000000 [11:08:19<2352:02:59, 8.62s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [17369], local_loss=0.007764636538922787, train_loss=0.019054781645536423, time_cost=2.273555278778076
+
Steps: 2%|▏ | 17369/1000000 [11:08:19<2352:02:59, 8.62s/it, lr=1e-5, step_loss=0.00776]
Steps: 2%|▏ | 17370/1000000 [11:08:32<2731:00:02, 10.01s/it, lr=1e-5, step_loss=0.00776][RANK-0]: Step: [17370], local_loss=0.0098233912140131, train_loss=0.017084863036870956, time_cost=3.997570276260376
+
Steps: 2%|▏ | 17370/1000000 [11:08:33<2731:00:02, 10.01s/it, lr=1e-5, step_loss=0.00982]
Steps: 2%|▏ | 17371/1000000 [11:08:48<3178:44:11, 11.65s/it, lr=1e-5, step_loss=0.00982][RANK-0]: Step: [17371], local_loss=0.0037484019994735718, train_loss=0.011957069858908653, time_cost=8.171746730804443
+
Steps: 2%|▏ | 17371/1000000 [11:08:48<3178:44:11, 11.65s/it, lr=1e-5, step_loss=0.00375]
Steps: 2%|▏ | 17372/1000000 [11:09:03<3454:26:28, 12.66s/it, lr=1e-5, step_loss=0.00375][RANK-0]: Step: [17372], local_loss=0.006681107450276613, train_loss=0.01097111590206623, time_cost=6.932158470153809
+
Steps: 2%|▏ | 17372/1000000 [11:09:03<3454:26:28, 12.66s/it, lr=1e-5, step_loss=0.00668]
Steps: 2%|▏ | 17373/1000000 [11:09:16<3451:33:59, 12.65s/it, lr=1e-5, step_loss=0.00668][RANK-0]: Step: [17373], local_loss=0.06544532626867294, train_loss=0.03280789032578468, time_cost=1.2025158405303955
+
Steps: 2%|▏ | 17373/1000000 [11:09:16<3451:33:59, 12.65s/it, lr=1e-5, step_loss=0.0654]
Steps: 2%|▏ | 17374/1000000 [11:09:21<2816:53:11, 10.32s/it, lr=1e-5, step_loss=0.0654][RANK-0]: Step: [17374], local_loss=0.03328147903084755, train_loss=0.0647803395986557, time_cost=2.4029738903045654
+
Steps: 2%|▏ | 17374/1000000 [11:09:21<2816:53:11, 10.32s/it, lr=1e-5, step_loss=0.0333]
Steps: 2%|▏ | 17375/1000000 [11:09:28<2602:03:02, 9.53s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [17375], local_loss=0.02981221117079258, train_loss=0.032188381999731064, time_cost=1.2465262413024902
+
Steps: 2%|▏ | 17375/1000000 [11:09:28<2602:03:02, 9.53s/it, lr=1e-5, step_loss=0.0298]
Steps: 2%|▏ | 17376/1000000 [11:09:44<3116:42:57, 11.42s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [17376], local_loss=0.01639474742114544, train_loss=0.0513470396399498, time_cost=14.288827180862427
+
Steps: 2%|▏ | 17376/1000000 [11:09:44<3116:42:57, 11.42s/it, lr=1e-5, step_loss=0.0164]
Steps: 2%|▏ | 17377/1000000 [11:09:52<2855:47:56, 10.46s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [17377], local_loss=0.0350492000579834, train_loss=0.03429686650633812, time_cost=1.19966459274292
+
Steps: 2%|▏ | 17377/1000000 [11:09:52<2855:47:56, 10.46s/it, lr=1e-5, step_loss=0.035]
Steps: 2%|▏ | 17378/1000000 [11:09:59<2559:48:50, 9.38s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [17378], local_loss=0.03797324374318123, train_loss=0.06971284747123718, time_cost=1.4571292400360107
+
Steps: 2%|▏ | 17378/1000000 [11:09:59<2559:48:50, 9.38s/it, lr=1e-5, step_loss=0.038]
Steps: 2%|▏ | 17379/1000000 [11:10:06<2379:45:44, 8.72s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [17379], local_loss=0.011478631757199764, train_loss=0.026952333748340607, time_cost=2.705545663833618
+
Steps: 2%|▏ | 17379/1000000 [11:10:06<2379:45:44, 8.72s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 17380/1000000 [11:10:11<2047:36:02, 7.50s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [17380], local_loss=0.0156076205894351, train_loss=0.027249429374933243, time_cost=2.003328323364258
+
Steps: 2%|▏ | 17380/1000000 [11:10:11<2047:36:02, 7.50s/it, lr=1e-5, step_loss=0.0156]
Steps: 2%|▏ | 17381/1000000 [11:10:18<2016:47:49, 7.39s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [17381], local_loss=0.24208606779575348, train_loss=0.10720647126436234, time_cost=2.9131484031677246
+
Steps: 2%|▏ | 17381/1000000 [11:10:18<2016:47:49, 7.39s/it, lr=1e-5, step_loss=0.242]
Steps: 2%|▏ | 17382/1000000 [11:10:29<2279:11:43, 8.35s/it, lr=1e-5, step_loss=0.242][RANK-0]: Step: [17382], local_loss=0.04104664549231529, train_loss=0.02920149452984333, time_cost=7.570236921310425
+
Steps: 2%|▏ | 17382/1000000 [11:10:29<2279:11:43, 8.35s/it, lr=1e-5, step_loss=0.041]
Steps: 2%|▏ | 17383/1000000 [11:10:35<2100:10:41, 7.69s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [17383], local_loss=0.04744546487927437, train_loss=0.03195478767156601, time_cost=2.6030824184417725
+
Steps: 2%|▏ | 17383/1000000 [11:10:35<2100:10:41, 7.69s/it, lr=1e-5, step_loss=0.0474]
Steps: 2%|▏ | 17384/1000000 [11:10:45<2334:13:42, 8.55s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [17384], local_loss=0.06026295945048332, train_loss=0.06433333456516266, time_cost=3.847146987915039
+
Steps: 2%|▏ | 17384/1000000 [11:10:45<2334:13:42, 8.55s/it, lr=1e-5, step_loss=0.0603]
Steps: 2%|▏ | 17385/1000000 [11:10:56<2505:33:45, 9.18s/it, lr=1e-5, step_loss=0.0603][RANK-0]: Step: [17385], local_loss=0.0021748612634837627, train_loss=0.16384321451187134, time_cost=1.6888294219970703
+
Steps: 2%|▏ | 17385/1000000 [11:10:56<2505:33:45, 9.18s/it, lr=1e-5, step_loss=0.00217]
Steps: 2%|▏ | 17386/1000000 [11:11:02<2244:34:31, 8.22s/it, lr=1e-5, step_loss=0.00217][RANK-0]: Step: [17386], local_loss=0.009016958065330982, train_loss=0.02119375765323639, time_cost=1.5481936931610107
+
Steps: 2%|▏ | 17386/1000000 [11:11:02<2244:34:31, 8.22s/it, lr=1e-5, step_loss=0.00902]
Steps: 2%|▏ | 17387/1000000 [11:11:09<2131:19:30, 7.81s/it, lr=1e-5, step_loss=0.00902][RANK-0]: Step: [17387], local_loss=0.0669899582862854, train_loss=0.08138494938611984, time_cost=1.183501958847046
+
Steps: 2%|▏ | 17387/1000000 [11:11:09<2131:19:30, 7.81s/it, lr=1e-5, step_loss=0.067]
Steps: 2%|▏ | 17388/1000000 [11:11:16<2074:21:34, 7.60s/it, lr=1e-5, step_loss=0.067][RANK-0]: Step: [17388], local_loss=0.008748062886297703, train_loss=0.04624816030263901, time_cost=1.7990691661834717
+
Steps: 2%|▏ | 17388/1000000 [11:11:16<2074:21:34, 7.60s/it, lr=1e-5, step_loss=0.00875]
Steps: 2%|▏ | 17389/1000000 [11:11:26<2312:41:48, 8.47s/it, lr=1e-5, step_loss=0.00875][RANK-0]: Step: [17389], local_loss=0.01419675350189209, train_loss=0.06028713285923004, time_cost=3.0677692890167236
+
Steps: 2%|▏ | 17389/1000000 [11:11:26<2312:41:48, 8.47s/it, lr=1e-5, step_loss=0.0142]
Steps: 2%|▏ | 17390/1000000 [11:11:31<1972:57:25, 7.23s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [17390], local_loss=0.006142450496554375, train_loss=0.06297871470451355, time_cost=1.6109719276428223
+
Steps: 2%|▏ | 17390/1000000 [11:11:31<1972:57:25, 7.23s/it, lr=1e-5, step_loss=0.00614]
Steps: 2%|▏ | 17391/1000000 [11:11:42<2282:56:45, 8.36s/it, lr=1e-5, step_loss=0.00614][RANK-0]: Step: [17391], local_loss=0.009191970340907574, train_loss=0.024542152881622314, time_cost=3.9272007942199707
+
Steps: 2%|▏ | 17391/1000000 [11:11:42<2282:56:45, 8.36s/it, lr=1e-5, step_loss=0.00919]
Steps: 2%|▏ | 17392/1000000 [11:11:46<1959:59:28, 7.18s/it, lr=1e-5, step_loss=0.00919][RANK-0]: Step: [17392], local_loss=0.01263448316603899, train_loss=0.01910148188471794, time_cost=1.7713541984558105
+
Steps: 2%|▏ | 17392/1000000 [11:11:46<1959:59:28, 7.18s/it, lr=1e-5, step_loss=0.0126]
Steps: 2%|▏ | 17393/1000000 [11:11:56<2191:29:23, 8.03s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [17393], local_loss=0.003688706085085869, train_loss=0.06628917902708054, time_cost=1.426302433013916
+
Steps: 2%|▏ | 17393/1000000 [11:11:56<2191:29:23, 8.03s/it, lr=1e-5, step_loss=0.00369]
Steps: 2%|▏ | 17394/1000000 [11:12:09<2585:29:58, 9.47s/it, lr=1e-5, step_loss=0.00369][RANK-0]: Step: [17394], local_loss=0.011642560362815857, train_loss=0.052545879036188126, time_cost=3.193269968032837
+
Steps: 2%|▏ | 17394/1000000 [11:12:09<2585:29:58, 9.47s/it, lr=1e-5, step_loss=0.0116]
Steps: 2%|▏ | 17395/1000000 [11:12:23<2977:45:48, 10.91s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [17395], local_loss=0.02212587371468544, train_loss=0.017191968858242035, time_cost=5.66984224319458
+
Steps: 2%|▏ | 17395/1000000 [11:12:23<2977:45:48, 10.91s/it, lr=1e-5, step_loss=0.0221]
Steps: 2%|▏ | 17396/1000000 [11:12:38<3260:16:25, 11.94s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [17396], local_loss=0.0053992304019629955, train_loss=0.030138414353132248, time_cost=4.7041175365448
+
Steps: 2%|▏ | 17396/1000000 [11:12:38<3260:16:25, 11.94s/it, lr=1e-5, step_loss=0.0054]
Steps: 2%|▏ | 17397/1000000 [11:12:45<2847:27:39, 10.43s/it, lr=1e-5, step_loss=0.0054][RANK-0]: Step: [17397], local_loss=0.01506267674267292, train_loss=0.017285872250795364, time_cost=1.4695613384246826
+
Steps: 2%|▏ | 17397/1000000 [11:12:45<2847:27:39, 10.43s/it, lr=1e-5, step_loss=0.0151]
Steps: 2%|▏ | 17398/1000000 [11:12:52<2566:04:09, 9.40s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [17398], local_loss=0.1827675700187683, train_loss=0.09163139760494232, time_cost=5.315770149230957
+
Steps: 2%|▏ | 17398/1000000 [11:12:52<2566:04:09, 9.40s/it, lr=1e-5, step_loss=0.183]
Steps: 2%|▏ | 17399/1000000 [11:13:05<2866:08:02, 10.50s/it, lr=1e-5, step_loss=0.183][RANK-0]: Step: [17399], local_loss=0.07382399588823318, train_loss=0.24868136644363403, time_cost=9.36182165145874
+
Steps: 2%|▏ | 17399/1000000 [11:13:05<2866:08:02, 10.50s/it, lr=1e-5, step_loss=0.0738]
Steps: 2%|▏ | 17400/1000000 [11:13:18<3131:12:30, 11.47s/it, lr=1e-5, step_loss=0.0738][RANK-0]: Step: [17400], local_loss=0.08069074153900146, train_loss=0.24213652312755585, time_cost=5.591170787811279
+
Steps: 2%|▏ | 17400/1000000 [11:13:18<3131:12:30, 11.47s/it, lr=1e-5, step_loss=0.0807]
Steps: 2%|▏ | 17401/1000000 [11:13:31<3206:03:42, 11.75s/it, lr=1e-5, step_loss=0.0807][RANK-0]: Step: [17401], local_loss=0.1776122897863388, train_loss=0.06667456775903702, time_cost=2.6719634532928467
+
Steps: 2%|▏ | 17401/1000000 [11:13:31<3206:03:42, 11.75s/it, lr=1e-5, step_loss=0.178]
Steps: 2%|▏ | 17402/1000000 [11:13:38<2796:55:31, 10.25s/it, lr=1e-5, step_loss=0.178][RANK-0]: Step: [17402], local_loss=0.005563200917094946, train_loss=0.012385141104459763, time_cost=1.2137222290039062
+
Steps: 2%|▏ | 17402/1000000 [11:13:38<2796:55:31, 10.25s/it, lr=1e-5, step_loss=0.00556]
Steps: 2%|▏ | 17403/1000000 [11:13:52<3178:30:42, 11.65s/it, lr=1e-5, step_loss=0.00556][RANK-0]: Step: [17403], local_loss=0.057420749217271805, train_loss=0.030929680913686752, time_cost=6.498483419418335
+
Steps: 2%|▏ | 17403/1000000 [11:13:52<3178:30:42, 11.65s/it, lr=1e-5, step_loss=0.0574]
Steps: 2%|▏ | 17404/1000000 [11:14:07<3399:59:02, 12.46s/it, lr=1e-5, step_loss=0.0574][RANK-0]: Step: [17404], local_loss=0.024512842297554016, train_loss=0.031053895130753517, time_cost=8.76484227180481
+
Steps: 2%|▏ | 17404/1000000 [11:14:07<3399:59:02, 12.46s/it, lr=1e-5, step_loss=0.0245]
Steps: 2%|▏ | 17405/1000000 [11:14:18<3259:45:07, 11.94s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [17405], local_loss=0.013077120296657085, train_loss=0.07842683047056198, time_cost=4.7698283195495605
+
Steps: 2%|▏ | 17405/1000000 [11:14:18<3259:45:07, 11.94s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 17406/1000000 [11:14:23<2768:38:31, 10.14s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [17406], local_loss=0.01020286325365305, train_loss=0.013286137022078037, time_cost=1.4032421112060547
+
Steps: 2%|▏ | 17406/1000000 [11:14:23<2768:38:31, 10.14s/it, lr=1e-5, step_loss=0.0102]
Steps: 2%|▏ | 17407/1000000 [11:14:42<3430:42:35, 12.57s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [17407], local_loss=0.04450215399265289, train_loss=0.14477261900901794, time_cost=10.727150201797485
+
Steps: 2%|▏ | 17407/1000000 [11:14:42<3430:42:35, 12.57s/it, lr=1e-5, step_loss=0.0445]
Steps: 2%|▏ | 17408/1000000 [11:14:50<3049:18:56, 11.17s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [17408], local_loss=0.029197493568062782, train_loss=0.031077371910214424, time_cost=7.088535308837891
+
Steps: 2%|▏ | 17408/1000000 [11:14:50<3049:18:56, 11.17s/it, lr=1e-5, step_loss=0.0292]
Steps: 2%|▏ | 17409/1000000 [11:15:01<3088:25:15, 11.32s/it, lr=1e-5, step_loss=0.0292][RANK-0]: Step: [17409], local_loss=0.05925673618912697, train_loss=0.061802566051483154, time_cost=2.915313720703125
+
Steps: 2%|▏ | 17409/1000000 [11:15:01<3088:25:15, 11.32s/it, lr=1e-5, step_loss=0.0593]
Steps: 2%|▏ | 17410/1000000 [11:15:06<2528:22:41, 9.26s/it, lr=1e-5, step_loss=0.0593][RANK-0]: Step: [17410], local_loss=0.00424245186150074, train_loss=0.015898557379841805, time_cost=1.717071533203125
+
Steps: 2%|▏ | 17410/1000000 [11:15:06<2528:22:41, 9.26s/it, lr=1e-5, step_loss=0.00424]
Steps: 2%|▏ | 17411/1000000 [11:15:11<2195:56:04, 8.05s/it, lr=1e-5, step_loss=0.00424][RANK-0]: Step: [17411], local_loss=0.12084189802408218, train_loss=0.03487864136695862, time_cost=2.0415749549865723
+
Steps: 2%|▏ | 17411/1000000 [11:15:11<2195:56:04, 8.05s/it, lr=1e-5, step_loss=0.121]
Steps: 2%|▏ | 17412/1000000 [11:15:20<2240:58:42, 8.21s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [17412], local_loss=0.020182261243462563, train_loss=0.024893298745155334, time_cost=1.3001854419708252
+
Steps: 2%|▏ | 17412/1000000 [11:15:20<2240:58:42, 8.21s/it, lr=1e-5, step_loss=0.0202]
Steps: 2%|▏ | 17413/1000000 [11:15:33<2658:08:09, 9.74s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [17413], local_loss=0.005963364616036415, train_loss=0.055947445333004, time_cost=4.549715757369995
+
Steps: 2%|▏ | 17413/1000000 [11:15:33<2658:08:09, 9.74s/it, lr=1e-5, step_loss=0.00596]
Steps: 2%|▏ | 17414/1000000 [11:15:44<2774:55:02, 10.17s/it, lr=1e-5, step_loss=0.00596][RANK-0]: Step: [17414], local_loss=0.009399129077792168, train_loss=0.03467747941613197, time_cost=2.960620403289795
+
Steps: 2%|▏ | 17414/1000000 [11:15:44<2774:55:02, 10.17s/it, lr=1e-5, step_loss=0.0094]
Steps: 2%|▏ | 17415/1000000 [11:15:48<2300:21:50, 8.43s/it, lr=1e-5, step_loss=0.0094][RANK-0]: Step: [17415], local_loss=0.024183884263038635, train_loss=0.023971127346158028, time_cost=1.6626813411712646
+
Steps: 2%|▏ | 17415/1000000 [11:15:48<2300:21:50, 8.43s/it, lr=1e-5, step_loss=0.0242]
Steps: 2%|▏ | 17416/1000000 [11:15:56<2200:18:23, 8.06s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [17416], local_loss=0.04470426216721535, train_loss=0.07374785840511322, time_cost=2.828707695007324
+
Steps: 2%|▏ | 17416/1000000 [11:15:56<2200:18:23, 8.06s/it, lr=1e-5, step_loss=0.0447]
Steps: 2%|▏ | 17417/1000000 [11:16:00<1930:10:47, 7.07s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [17417], local_loss=0.026739396154880524, train_loss=0.0516851469874382, time_cost=1.8207554817199707
+
Steps: 2%|▏ | 17417/1000000 [11:16:00<1930:10:47, 7.07s/it, lr=1e-5, step_loss=0.0267]
Steps: 2%|▏ | 17418/1000000 [11:16:07<1906:42:54, 6.99s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [17418], local_loss=0.00490780733525753, train_loss=0.0374576710164547, time_cost=2.3035829067230225
+
Steps: 2%|▏ | 17418/1000000 [11:16:07<1906:42:54, 6.99s/it, lr=1e-5, step_loss=0.00491]
Steps: 2%|▏ | 17419/1000000 [11:16:14<1911:32:45, 7.00s/it, lr=1e-5, step_loss=0.00491][RANK-0]: Step: [17419], local_loss=0.0157607551664114, train_loss=9.2701416015625, time_cost=1.6676130294799805
+
Steps: 2%|▏ | 17419/1000000 [11:16:14<1911:32:45, 7.00s/it, lr=1e-5, step_loss=0.0158]
Steps: 2%|▏ | 17420/1000000 [11:16:24<2135:42:07, 7.82s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [17420], local_loss=0.02496497333049774, train_loss=0.29719114303588867, time_cost=1.2351415157318115
+
Steps: 2%|▏ | 17420/1000000 [11:16:24<2135:42:07, 7.82s/it, lr=1e-5, step_loss=0.025]
Steps: 2%|▏ | 17421/1000000 [11:16:29<1938:29:57, 7.10s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [17421], local_loss=0.02100820094347, train_loss=0.14686466753482819, time_cost=1.194505214691162
+
Steps: 2%|▏ | 17421/1000000 [11:16:29<1938:29:57, 7.10s/it, lr=1e-5, step_loss=0.021]
Steps: 2%|▏ | 17422/1000000 [11:16:35<1787:10:29, 6.55s/it, lr=1e-5, step_loss=0.021][RANK-0]: Step: [17422], local_loss=0.008274574764072895, train_loss=0.036017559468746185, time_cost=2.555863380432129
+
Steps: 2%|▏ | 17422/1000000 [11:16:35<1787:10:29, 6.55s/it, lr=1e-5, step_loss=0.00827]
Steps: 2%|▏ | 17423/1000000 [11:16:44<2004:51:22, 7.35s/it, lr=1e-5, step_loss=0.00827][RANK-0]: Step: [17423], local_loss=0.004670299123972654, train_loss=0.07062286883592606, time_cost=2.5779218673706055
+
Steps: 2%|▏ | 17423/1000000 [11:16:44<2004:51:22, 7.35s/it, lr=1e-5, step_loss=0.00467]
Steps: 2%|▏ | 17424/1000000 [11:16:51<2024:34:38, 7.42s/it, lr=1e-5, step_loss=0.00467][RANK-0]: Step: [17424], local_loss=0.06301707774400711, train_loss=0.0555817075073719, time_cost=2.0280141830444336
+
Steps: 2%|▏ | 17424/1000000 [11:16:51<2024:34:38, 7.42s/it, lr=1e-5, step_loss=0.063]
Steps: 2%|▏ | 17425/1000000 [11:16:59<2030:30:39, 7.44s/it, lr=1e-5, step_loss=0.063][RANK-0]: Step: [17425], local_loss=0.17042173445224762, train_loss=0.041071005165576935, time_cost=3.093034505844116
+
Steps: 2%|▏ | 17425/1000000 [11:16:59<2030:30:39, 7.44s/it, lr=1e-5, step_loss=0.17]
Steps: 2%|▏ | 17426/1000000 [11:17:11<2377:32:07, 8.71s/it, lr=1e-5, step_loss=0.17][RANK-0]: Step: [17426], local_loss=0.03140115365386009, train_loss=0.02424512803554535, time_cost=8.769848823547363
+
Steps: 2%|▏ | 17426/1000000 [11:17:11<2377:32:07, 8.71s/it, lr=1e-5, step_loss=0.0314]
Steps: 2%|▏ | 17427/1000000 [11:17:21<2537:26:21, 9.30s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [17427], local_loss=0.015939027070999146, train_loss=0.02297675982117653, time_cost=2.2135000228881836
+
Steps: 2%|▏ | 17427/1000000 [11:17:21<2537:26:21, 9.30s/it, lr=1e-5, step_loss=0.0159]
Steps: 2%|▏ | 17428/1000000 [11:17:35<2880:26:26, 10.55s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [17428], local_loss=0.009817596524953842, train_loss=0.016126487404108047, time_cost=4.10724949836731
+
Steps: 2%|▏ | 17428/1000000 [11:17:35<2880:26:26, 10.55s/it, lr=1e-5, step_loss=0.00982]
[Training log excerpt: steps 17429–17647 of 1,000,000 (2%), lr=1e-5. Each optimizer step emits one flattened tqdm/rank-0 line of the form:

Steps: 2%|▏ | <step>/1000000 [<elapsed><<remaining>, <s/it>, lr=1e-5, step_loss=<loss>][RANK-0]: Step: [<step>], local_loss=<float>, train_loss=<float>, time_cost=<float>

Over this window, throughput varies between roughly 5.7 and 12.3 s/it (elapsed 11:17:51 to 11:49:26). step_loss/local_loss values sit mostly in the 0.004–0.5 range, with occasional spikes (1.01 at step 17450, 0.859 at 17550, 0.981 at 17618); the aggregated train_loss shows rare large outliers (8.78 at step 17482, 2.93 at 17484, 2.24 at 17497, 8.01 at 17540, 15.7 at 17597, 20.8 at 17642).]
Steps: 2%|▏ | 17648/1000000 [11:49:42<2760:09:12, 10.12s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [17648], local_loss=0.07855358719825745, train_loss=0.042131826281547546, time_cost=7.41077184677124
+
Steps: 2%|▏ | 17648/1000000 [11:49:42<2760:09:12, 10.12s/it, lr=1e-5, step_loss=0.0786]
Steps: 2%|▏ | 17649/1000000 [11:49:46<2293:00:27, 8.40s/it, lr=1e-5, step_loss=0.0786][RANK-0]: Step: [17649], local_loss=0.40935391187667847, train_loss=0.074004165828228, time_cost=1.6720640659332275
+
Steps: 2%|▏ | 17649/1000000 [11:49:46<2293:00:27, 8.40s/it, lr=1e-5, step_loss=0.409]
Steps: 2%|▏ | 17650/1000000 [11:49:55<2350:44:06, 8.61s/it, lr=1e-5, step_loss=0.409][RANK-0]: Step: [17650], local_loss=0.03675292059779167, train_loss=0.045412857085466385, time_cost=3.1495585441589355
+
Steps: 2%|▏ | 17650/1000000 [11:49:55<2350:44:06, 8.61s/it, lr=1e-5, step_loss=0.0368]
Steps: 2%|▏ | 17651/1000000 [11:50:00<2002:34:18, 7.34s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [17651], local_loss=0.008006183430552483, train_loss=0.019544266164302826, time_cost=3.3912293910980225
+
Steps: 2%|▏ | 17651/1000000 [11:50:00<2002:34:18, 7.34s/it, lr=1e-5, step_loss=0.00801]
Steps: 2%|▏ | 17652/1000000 [11:50:07<1974:16:46, 7.24s/it, lr=1e-5, step_loss=0.00801][RANK-0]: Step: [17652], local_loss=0.04963359236717224, train_loss=0.024160366505384445, time_cost=2.23541259765625
+
Steps: 2%|▏ | 17652/1000000 [11:50:07<1974:16:46, 7.24s/it, lr=1e-5, step_loss=0.0496]
Steps: 2%|▏ | 17653/1000000 [11:50:12<1795:34:32, 6.58s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [17653], local_loss=0.050303857773542404, train_loss=0.14841173589229584, time_cost=2.142512798309326
+
Steps: 2%|▏ | 17653/1000000 [11:50:12<1795:34:32, 6.58s/it, lr=1e-5, step_loss=0.0503]
Steps: 2%|▏ | 17654/1000000 [11:50:18<1761:39:05, 6.46s/it, lr=1e-5, step_loss=0.0503][RANK-0]: Step: [17654], local_loss=0.02463276870548725, train_loss=0.03985849767923355, time_cost=1.2348103523254395
+
Steps: 2%|▏ | 17654/1000000 [11:50:18<1761:39:05, 6.46s/it, lr=1e-5, step_loss=0.0246]
Steps: 2%|▏ | 17655/1000000 [11:50:23<1645:06:31, 6.03s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [17655], local_loss=0.027177799493074417, train_loss=0.03209111467003822, time_cost=2.01839017868042
+
Steps: 2%|▏ | 17655/1000000 [11:50:23<1645:06:31, 6.03s/it, lr=1e-5, step_loss=0.0272]
Steps: 2%|▏ | 17656/1000000 [11:50:31<1777:58:11, 6.52s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [17656], local_loss=0.008749288506805897, train_loss=0.0704393982887268, time_cost=1.4910047054290771
+
Steps: 2%|▏ | 17656/1000000 [11:50:31<1777:58:11, 6.52s/it, lr=1e-5, step_loss=0.00875]
Steps: 2%|▏ | 17657/1000000 [11:50:35<1583:18:59, 5.80s/it, lr=1e-5, step_loss=0.00875][RANK-0]: Step: [17657], local_loss=0.007835040800273418, train_loss=0.04786711931228638, time_cost=3.25528883934021
+
Steps: 2%|▏ | 17657/1000000 [11:50:35<1583:18:59, 5.80s/it, lr=1e-5, step_loss=0.00784]
Steps: 2%|▏ | 17658/1000000 [11:50:42<1696:02:36, 6.22s/it, lr=1e-5, step_loss=0.00784][RANK-0]: Step: [17658], local_loss=0.04601714015007019, train_loss=0.04192575067281723, time_cost=3.5414557456970215
+
Steps: 2%|▏ | 17658/1000000 [11:50:42<1696:02:36, 6.22s/it, lr=1e-5, step_loss=0.046]
Steps: 2%|▏ | 17659/1000000 [11:50:49<1775:56:01, 6.51s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [17659], local_loss=0.036477766931056976, train_loss=0.08129564672708511, time_cost=1.2193689346313477
+
Steps: 2%|▏ | 17659/1000000 [11:50:49<1775:56:01, 6.51s/it, lr=1e-5, step_loss=0.0365]
Steps: 2%|▏ | 17660/1000000 [11:51:06<2628:36:45, 9.63s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [17660], local_loss=0.4219992458820343, train_loss=9.263084411621094, time_cost=4.01312255859375
+
Steps: 2%|▏ | 17660/1000000 [11:51:06<2628:36:45, 9.63s/it, lr=1e-5, step_loss=0.422]
Steps: 2%|▏ | 17661/1000000 [11:51:14<2492:16:11, 9.13s/it, lr=1e-5, step_loss=0.422][RANK-0]: Step: [17661], local_loss=0.006890075281262398, train_loss=0.036590080708265305, time_cost=2.3312175273895264
+
Steps: 2%|▏ | 17661/1000000 [11:51:14<2492:16:11, 9.13s/it, lr=1e-5, step_loss=0.00689]
Steps: 2%|▏ | 17662/1000000 [11:51:21<2284:55:22, 8.37s/it, lr=1e-5, step_loss=0.00689][RANK-0]: Step: [17662], local_loss=0.03429946303367615, train_loss=0.036683715879917145, time_cost=1.212890386581421
+
Steps: 2%|▏ | 17662/1000000 [11:51:21<2284:55:22, 8.37s/it, lr=1e-5, step_loss=0.0343]
Steps: 2%|▏ | 17663/1000000 [11:51:26<2038:37:00, 7.47s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [17663], local_loss=0.014228243380784988, train_loss=43.07734298706055, time_cost=2.57883620262146
+
Steps: 2%|▏ | 17663/1000000 [11:51:26<2038:37:00, 7.47s/it, lr=1e-5, step_loss=0.0142]
Steps: 2%|▏ | 17664/1000000 [11:51:34<2040:27:59, 7.48s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [17664], local_loss=0.008384078741073608, train_loss=0.05173030123114586, time_cost=3.9027788639068604
+
Steps: 2%|▏ | 17664/1000000 [11:51:34<2040:27:59, 7.48s/it, lr=1e-5, step_loss=0.00838]
Steps: 2%|▏ | 17665/1000000 [11:51:47<2553:41:56, 9.36s/it, lr=1e-5, step_loss=0.00838][RANK-0]: Step: [17665], local_loss=0.0058594634756445885, train_loss=0.06053469702601433, time_cost=5.044658184051514
+
Steps: 2%|▏ | 17665/1000000 [11:51:47<2553:41:56, 9.36s/it, lr=1e-5, step_loss=0.00586]
Steps: 2%|▏ | 17666/1000000 [11:51:57<2572:46:21, 9.43s/it, lr=1e-5, step_loss=0.00586][RANK-0]: Step: [17666], local_loss=0.009484237059950829, train_loss=0.018468573689460754, time_cost=8.159874677658081
+
Steps: 2%|▏ | 17666/1000000 [11:51:57<2572:46:21, 9.43s/it, lr=1e-5, step_loss=0.00948]
Steps: 2%|▏ | 17667/1000000 [11:52:01<2164:21:49, 7.93s/it, lr=1e-5, step_loss=0.00948][RANK-0]: Step: [17667], local_loss=0.033238332718610764, train_loss=0.02498941496014595, time_cost=1.8709986209869385
+
Steps: 2%|▏ | 17667/1000000 [11:52:01<2164:21:49, 7.93s/it, lr=1e-5, step_loss=0.0332]
Steps: 2%|▏ | 17668/1000000 [11:52:09<2139:59:16, 7.84s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [17668], local_loss=0.0077016595751047134, train_loss=0.03664674609899521, time_cost=1.3546538352966309
+
Steps: 2%|▏ | 17668/1000000 [11:52:09<2139:59:16, 7.84s/it, lr=1e-5, step_loss=0.0077]
Steps: 2%|▏ | 17669/1000000 [11:52:13<1851:16:19, 6.78s/it, lr=1e-5, step_loss=0.0077][RANK-0]: Step: [17669], local_loss=0.0451190322637558, train_loss=0.04898466169834137, time_cost=1.2807955741882324
+
Steps: 2%|▏ | 17669/1000000 [11:52:13<1851:16:19, 6.78s/it, lr=1e-5, step_loss=0.0451]
Steps: 2%|▏ | 17670/1000000 [11:52:23<2055:12:00, 7.53s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [17670], local_loss=0.03581749275326729, train_loss=0.057500433176755905, time_cost=3.8973264694213867
+
Steps: 2%|▏ | 17670/1000000 [11:52:23<2055:12:00, 7.53s/it, lr=1e-5, step_loss=0.0358]
Steps: 2%|▏ | 17671/1000000 [11:52:35<2498:29:34, 9.16s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [17671], local_loss=0.02061440981924534, train_loss=0.021200835704803467, time_cost=3.969089984893799
+
Steps: 2%|▏ | 17671/1000000 [11:52:35<2498:29:34, 9.16s/it, lr=1e-5, step_loss=0.0206]
Steps: 2%|▏ | 17672/1000000 [11:52:40<2107:12:51, 7.72s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [17672], local_loss=0.015090133994817734, train_loss=0.010120715945959091, time_cost=1.4147849082946777
+
Steps: 2%|▏ | 17672/1000000 [11:52:40<2107:12:51, 7.72s/it, lr=1e-5, step_loss=0.0151]
Steps: 2%|▏ | 17673/1000000 [11:52:47<2065:51:37, 7.57s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [17673], local_loss=0.021900709718465805, train_loss=0.05983203276991844, time_cost=3.451559543609619
+
Steps: 2%|▏ | 17673/1000000 [11:52:47<2065:51:37, 7.57s/it, lr=1e-5, step_loss=0.0219]
Steps: 2%|▏ | 17674/1000000 [11:52:53<1900:57:36, 6.97s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [17674], local_loss=0.01217242144048214, train_loss=0.0773685872554779, time_cost=1.3314781188964844
+
Steps: 2%|▏ | 17674/1000000 [11:52:53<1900:57:36, 6.97s/it, lr=1e-5, step_loss=0.0122]
Steps: 2%|▏ | 17675/1000000 [11:53:00<1950:45:04, 7.15s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [17675], local_loss=0.015399966388940811, train_loss=0.07528802752494812, time_cost=1.8398189544677734
+
Steps: 2%|▏ | 17675/1000000 [11:53:00<1950:45:04, 7.15s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 17676/1000000 [11:53:06<1843:04:00, 6.75s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [17676], local_loss=0.009384366683661938, train_loss=0.019572094082832336, time_cost=3.0343527793884277
+
Steps: 2%|▏ | 17676/1000000 [11:53:06<1843:04:00, 6.75s/it, lr=1e-5, step_loss=0.00938]
Steps: 2%|▏ | 17677/1000000 [11:53:10<1643:58:22, 6.02s/it, lr=1e-5, step_loss=0.00938][RANK-0]: Step: [17677], local_loss=0.028804641216993332, train_loss=0.07622770965099335, time_cost=1.8258538246154785
+
Steps: 2%|▏ | 17677/1000000 [11:53:10<1643:58:22, 6.02s/it, lr=1e-5, step_loss=0.0288]
Steps: 2%|▏ | 17678/1000000 [11:53:17<1654:06:41, 6.06s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [17678], local_loss=0.015367559157311916, train_loss=0.15846289694309235, time_cost=2.3667235374450684
+
Steps: 2%|▏ | 17678/1000000 [11:53:17<1654:06:41, 6.06s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 17679/1000000 [11:53:23<1724:16:18, 6.32s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [17679], local_loss=0.0108566265553236, train_loss=0.0406104139983654, time_cost=1.5386250019073486
+
Steps: 2%|▏ | 17679/1000000 [11:53:23<1724:16:18, 6.32s/it, lr=1e-5, step_loss=0.0109]
Steps: 2%|▏ | 17680/1000000 [11:53:28<1565:40:21, 5.74s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [17680], local_loss=0.04772958159446716, train_loss=0.081368088722229, time_cost=1.453169822692871
+
Steps: 2%|▏ | 17680/1000000 [11:53:28<1565:40:21, 5.74s/it, lr=1e-5, step_loss=0.0477]
Steps: 2%|▏ | 17681/1000000 [11:53:38<1932:54:01, 7.08s/it, lr=1e-5, step_loss=0.0477][RANK-0]: Step: [17681], local_loss=0.007013197988271713, train_loss=0.034568458795547485, time_cost=1.549264669418335
+
Steps: 2%|▏ | 17681/1000000 [11:53:38<1932:54:01, 7.08s/it, lr=1e-5, step_loss=0.00701]
Steps: 2%|▏ | 17682/1000000 [11:53:49<2251:53:28, 8.25s/it, lr=1e-5, step_loss=0.00701][RANK-0]: Step: [17682], local_loss=0.01096811518073082, train_loss=0.03504158556461334, time_cost=8.060128450393677
+
Steps: 2%|▏ | 17682/1000000 [11:53:49<2251:53:28, 8.25s/it, lr=1e-5, step_loss=0.011]
Steps: 2%|▏ | 17683/1000000 [11:54:00<2484:51:24, 9.11s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [17683], local_loss=0.04098778963088989, train_loss=0.020360741764307022, time_cost=2.087660789489746
+
Steps: 2%|▏ | 17683/1000000 [11:54:00<2484:51:24, 9.11s/it, lr=1e-5, step_loss=0.041]
Steps: 2%|▏ | 17684/1000000 [11:54:10<2571:46:29, 9.43s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [17684], local_loss=0.01834435947239399, train_loss=0.17205482721328735, time_cost=1.6034634113311768
+
Steps: 2%|▏ | 17684/1000000 [11:54:10<2571:46:29, 9.43s/it, lr=1e-5, step_loss=0.0183]
Steps: 2%|▏ | 17685/1000000 [11:54:25<3040:52:58, 11.14s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [17685], local_loss=0.033609721809625626, train_loss=2.445080518722534, time_cost=6.8388450145721436
+
Steps: 2%|▏ | 17685/1000000 [11:54:25<3040:52:58, 11.14s/it, lr=1e-5, step_loss=0.0336]
Steps: 2%|▏ | 17686/1000000 [11:54:35<2930:24:42, 10.74s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [17686], local_loss=0.015395292080938816, train_loss=0.13930416107177734, time_cost=3.552907943725586
+
Steps: 2%|▏ | 17686/1000000 [11:54:35<2930:24:42, 10.74s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 17687/1000000 [11:54:40<2424:19:47, 8.88s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [17687], local_loss=0.006417117081582546, train_loss=0.0728507936000824, time_cost=1.250697374343872
+
Steps: 2%|▏ | 17687/1000000 [11:54:40<2424:19:47, 8.88s/it, lr=1e-5, step_loss=0.00642]
Steps: 2%|▏ | 17688/1000000 [11:54:45<2145:58:11, 7.86s/it, lr=1e-5, step_loss=0.00642][RANK-0]: Step: [17688], local_loss=0.027536161243915558, train_loss=0.2937487065792084, time_cost=1.3219549655914307
+
Steps: 2%|▏ | 17688/1000000 [11:54:45<2145:58:11, 7.86s/it, lr=1e-5, step_loss=0.0275]
Steps: 2%|▏ | 17689/1000000 [11:54:59<2605:39:05, 9.55s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [17689], local_loss=0.03500840440392494, train_loss=0.025783143937587738, time_cost=1.2231135368347168
+
Steps: 2%|▏ | 17689/1000000 [11:54:59<2605:39:05, 9.55s/it, lr=1e-5, step_loss=0.035]
Steps: 2%|▏ | 17690/1000000 [11:55:12<2930:10:26, 10.74s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [17690], local_loss=0.007030417677015066, train_loss=0.01592463254928589, time_cost=11.905352354049683
+
Steps: 2%|▏ | 17690/1000000 [11:55:12<2930:10:26, 10.74s/it, lr=1e-5, step_loss=0.00703]
Steps: 2%|▏ | 17691/1000000 [11:55:19<2617:02:04, 9.59s/it, lr=1e-5, step_loss=0.00703][RANK-0]: Step: [17691], local_loss=0.028199970722198486, train_loss=0.0185902900993824, time_cost=2.7129099369049072
+
Steps: 2%|▏ | 17691/1000000 [11:55:19<2617:02:04, 9.59s/it, lr=1e-5, step_loss=0.0282]
Steps: 2%|▏ | 17692/1000000 [11:55:31<2826:25:51, 10.36s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [17692], local_loss=0.0052030798979103565, train_loss=0.03378978371620178, time_cost=4.228590250015259
+
Steps: 2%|▏ | 17692/1000000 [11:55:31<2826:25:51, 10.36s/it, lr=1e-5, step_loss=0.0052]
Steps: 2%|▏ | 17693/1000000 [11:55:37<2457:21:30, 9.01s/it, lr=1e-5, step_loss=0.0052][RANK-0]: Step: [17693], local_loss=0.01181360986083746, train_loss=0.023349134251475334, time_cost=1.500061273574829
+
Steps: 2%|▏ | 17693/1000000 [11:55:37<2457:21:30, 9.01s/it, lr=1e-5, step_loss=0.0118]
Steps: 2%|▏ | 17694/1000000 [11:55:45<2369:49:26, 8.69s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [17694], local_loss=0.04189023748040199, train_loss=0.15187571942806244, time_cost=4.1047563552856445
+
Steps: 2%|▏ | 17694/1000000 [11:55:45<2369:49:26, 8.69s/it, lr=1e-5, step_loss=0.0419]
Steps: 2%|▏ | 17695/1000000 [11:55:55<2428:14:45, 8.90s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [17695], local_loss=0.005981334485113621, train_loss=0.031182579696178436, time_cost=1.501746654510498
+
Steps: 2%|▏ | 17695/1000000 [11:55:55<2428:14:45, 8.90s/it, lr=1e-5, step_loss=0.00598]
Steps: 2%|▏ | 17696/1000000 [11:56:05<2526:19:44, 9.26s/it, lr=1e-5, step_loss=0.00598][RANK-0]: Step: [17696], local_loss=0.01369038037955761, train_loss=0.02842174470424652, time_cost=5.248068809509277
+
Steps: 2%|▏ | 17696/1000000 [11:56:05<2526:19:44, 9.26s/it, lr=1e-5, step_loss=0.0137]
Steps: 2%|▏ | 17697/1000000 [11:56:15<2588:52:35, 9.49s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [17697], local_loss=0.0114646190777421, train_loss=0.05027797445654869, time_cost=2.857898235321045
+
Steps: 2%|▏ | 17697/1000000 [11:56:15<2588:52:35, 9.49s/it, lr=1e-5, step_loss=0.0115]
Steps: 2%|▏ | 17698/1000000 [11:56:29<3015:18:04, 11.05s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [17698], local_loss=0.004299746360629797, train_loss=0.0220018457621336, time_cost=6.70008397102356
+
Steps: 2%|▏ | 17698/1000000 [11:56:29<3015:18:04, 11.05s/it, lr=1e-5, step_loss=0.0043]
Steps: 2%|▏ | 17699/1000000 [11:56:40<3016:12:38, 11.05s/it, lr=1e-5, step_loss=0.0043][RANK-0]: Step: [17699], local_loss=0.005518469028174877, train_loss=0.015442714095115662, time_cost=4.60768723487854
+
Steps: 2%|▏ | 17699/1000000 [11:56:40<3016:12:38, 11.05s/it, lr=1e-5, step_loss=0.00552]
Steps: 2%|▏ | 17700/1000000 [11:56:47<2671:19:46, 9.79s/it, lr=1e-5, step_loss=0.00552][RANK-0]: Step: [17700], local_loss=0.005146200302988291, train_loss=0.024482835084199905, time_cost=1.2266218662261963
+
Steps: 2%|▏ | 17700/1000000 [11:56:47<2671:19:46, 9.79s/it, lr=1e-5, step_loss=0.00515]
Steps: 2%|▏ | 17701/1000000 [11:56:56<2577:58:02, 9.45s/it, lr=1e-5, step_loss=0.00515][RANK-0]: Step: [17701], local_loss=0.08174053579568863, train_loss=0.09293343126773834, time_cost=2.7431857585906982
+
Steps: 2%|▏ | 17701/1000000 [11:56:56<2577:58:02, 9.45s/it, lr=1e-5, step_loss=0.0817]
Steps: 2%|▏ | 17702/1000000 [11:57:04<2431:28:26, 8.91s/it, lr=1e-5, step_loss=0.0817][RANK-0]: Step: [17702], local_loss=0.005411218851804733, train_loss=0.02609330415725708, time_cost=1.6907663345336914
+
Steps: 2%|▏ | 17702/1000000 [11:57:04<2431:28:26, 8.91s/it, lr=1e-5, step_loss=0.00541]
Steps: 2%|▏ | 17703/1000000 [11:57:15<2629:19:15, 9.64s/it, lr=1e-5, step_loss=0.00541][RANK-0]: Step: [17703], local_loss=0.014413348399102688, train_loss=0.05282226949930191, time_cost=4.12563419342041
+
Steps: 2%|▏ | 17703/1000000 [11:57:15<2629:19:15, 9.64s/it, lr=1e-5, step_loss=0.0144]
Steps: 2%|▏ | 17704/1000000 [11:57:20<2246:21:46, 8.23s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [17704], local_loss=0.005362721160054207, train_loss=0.011042393743991852, time_cost=3.768181800842285
+
Steps: 2%|▏ | 17704/1000000 [11:57:20<2246:21:46, 8.23s/it, lr=1e-5, step_loss=0.00536]
Steps: 2%|▏ | 17705/1000000 [11:57:29<2297:56:08, 8.42s/it, lr=1e-5, step_loss=0.00536][RANK-0]: Step: [17705], local_loss=0.06757905334234238, train_loss=0.17134158313274384, time_cost=3.0266692638397217
+
Steps: 2%|▏ | 17705/1000000 [11:57:29<2297:56:08, 8.42s/it, lr=1e-5, step_loss=0.0676]
Steps: 2%|▏ | 17706/1000000 [11:57:34<2049:19:32, 7.51s/it, lr=1e-5, step_loss=0.0676][RANK-0]: Step: [17706], local_loss=0.08680830895900726, train_loss=6.9716796875, time_cost=2.9584105014801025
+
Steps: 2%|▏ | 17706/1000000 [11:57:34<2049:19:32, 7.51s/it, lr=1e-5, step_loss=0.0868]
Steps: 2%|▏ | 17707/1000000 [11:57:45<2296:24:08, 8.42s/it, lr=1e-5, step_loss=0.0868][RANK-0]: Step: [17707], local_loss=0.023152941837906837, train_loss=0.03532082214951515, time_cost=3.5927748680114746
+
Steps: 2%|▏ | 17707/1000000 [11:57:45<2296:24:08, 8.42s/it, lr=1e-5, step_loss=0.0232]
Steps: 2%|▏ | 17708/1000000 [11:57:54<2359:38:19, 8.65s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [17708], local_loss=0.014014730229973793, train_loss=0.022591374814510345, time_cost=3.201836109161377
+
Steps: 2%|▏ | 17708/1000000 [11:57:54<2359:38:19, 8.65s/it, lr=1e-5, step_loss=0.014]
Steps: 2%|▏ | 17709/1000000 [11:58:08<2807:43:07, 10.29s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [17709], local_loss=0.006647544447332621, train_loss=0.03813164681196213, time_cost=6.2792439460754395
+
Steps: 2%|▏ | 17709/1000000 [11:58:08<2807:43:07, 10.29s/it, lr=1e-5, step_loss=0.00665]
Steps: 2%|▏ | 17710/1000000 [11:58:15<2570:36:06, 9.42s/it, lr=1e-5, step_loss=0.00665][RANK-0]: Step: [17710], local_loss=0.03625185415148735, train_loss=0.015157158486545086, time_cost=2.8190407752990723
+
Steps: 2%|▏ | 17710/1000000 [11:58:15<2570:36:06, 9.42s/it, lr=1e-5, step_loss=0.0363]
Steps: 2%|▏ | 17711/1000000 [11:58:24<2512:16:47, 9.21s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [17711], local_loss=0.06996338069438934, train_loss=0.027253111824393272, time_cost=1.9671742916107178
+
Steps: 2%|▏ | 17711/1000000 [11:58:24<2512:16:47, 9.21s/it, lr=1e-5, step_loss=0.07]
Steps: 2%|▏ | 17712/1000000 [11:58:38<2872:50:42, 10.53s/it, lr=1e-5, step_loss=0.07][RANK-0]: Step: [17712], local_loss=0.022171182557940483, train_loss=0.04484385997056961, time_cost=3.1264705657958984
+
Steps: 2%|▏ | 17712/1000000 [11:58:38<2872:50:42, 10.53s/it, lr=1e-5, step_loss=0.0222]
Steps: 2%|▏ | 17713/1000000 [11:58:43<2436:29:08, 8.93s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [17713], local_loss=0.19846662878990173, train_loss=0.07171572744846344, time_cost=3.051872968673706
+
Steps: 2%|▏ | 17713/1000000 [11:58:43<2436:29:08, 8.93s/it, lr=1e-5, step_loss=0.198]
Steps: 2%|▏ | 17714/1000000 [11:58:50<2302:16:45, 8.44s/it, lr=1e-5, step_loss=0.198][RANK-0]: Step: [17714], local_loss=0.06342677026987076, train_loss=0.03097669780254364, time_cost=2.660947561264038
+
Steps: 2%|▏ | 17714/1000000 [11:58:50<2302:16:45, 8.44s/it, lr=1e-5, step_loss=0.0634]
Steps: 2%|▏ | 17715/1000000 [11:58:58<2242:23:31, 8.22s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [17715], local_loss=0.02012000046670437, train_loss=0.013214408420026302, time_cost=1.9129230976104736
+
Steps: 2%|▏ | 17715/1000000 [11:58:58<2242:23:31, 8.22s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 17716/1000000 [11:59:03<1977:40:22, 7.25s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [17716], local_loss=0.024851273745298386, train_loss=0.05184168741106987, time_cost=2.361339807510376
+
Steps: 2%|▏ | 17716/1000000 [11:59:03<1977:40:22, 7.25s/it, lr=1e-5, step_loss=0.0249]
Steps: 2%|▏ | 17717/1000000 [11:59:13<2243:45:49, 8.22s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [17717], local_loss=0.004961606580764055, train_loss=6.201533794403076, time_cost=2.4381611347198486
+
Steps: 2%|▏ | 17717/1000000 [11:59:13<2243:45:49, 8.22s/it, lr=1e-5, step_loss=0.00496]
Steps: 2%|▏ | 17718/1000000 [11:59:25<2553:02:35, 9.36s/it, lr=1e-5, step_loss=0.00496][RANK-0]: Step: [17718], local_loss=0.006155962124466896, train_loss=0.08442185074090958, time_cost=3.904872417449951
+
Steps: 2%|▏ | 17718/1000000 [11:59:25<2553:02:35, 9.36s/it, lr=1e-5, step_loss=0.00616]
Steps: 2%|▏ | 17719/1000000 [11:59:33<2386:19:55, 8.75s/it, lr=1e-5, step_loss=0.00616][RANK-0]: Step: [17719], local_loss=0.048658423125743866, train_loss=0.023826418444514275, time_cost=2.8914597034454346
+
Steps: 2%|▏ | 17719/1000000 [11:59:33<2386:19:55, 8.75s/it, lr=1e-5, step_loss=0.0487]
Steps: 2%|▏ | 17720/1000000 [11:59:40<2248:12:22, 8.24s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [17720], local_loss=0.029198748990893364, train_loss=0.01878269761800766, time_cost=2.9819529056549072
+
Steps: 2%|▏ | 17720/1000000 [11:59:40<2248:12:22, 8.24s/it, lr=1e-5, step_loss=0.0292]
Steps: 2%|▏ | 17721/1000000 [11:59:56<2908:35:44, 10.66s/it, lr=1e-5, step_loss=0.0292][RANK-0]: Step: [17721], local_loss=0.0049859946593642235, train_loss=0.0095209376886487, time_cost=7.210896015167236
+
Steps: 2%|▏ | 17721/1000000 [11:59:56<2908:35:44, 10.66s/it, lr=1e-5, step_loss=0.00499]
Steps: 2%|▏ | 17722/1000000 [12:00:09<3139:53:41, 11.51s/it, lr=1e-5, step_loss=0.00499][RANK-0]: Step: [17722], local_loss=0.015167418867349625, train_loss=0.017722953110933304, time_cost=7.256860256195068
+
Steps: 2%|▏ | 17722/1000000 [12:00:09<3139:53:41, 11.51s/it, lr=1e-5, step_loss=0.0152]
Steps: 2%|▏ | 17723/1000000 [12:00:14<2566:07:36, 9.40s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [17723], local_loss=0.030318772420287132, train_loss=0.015326999127864838, time_cost=1.5673775672912598
+
Steps: 2%|▏ | 17723/1000000 [12:00:14<2566:07:36, 9.40s/it, lr=1e-5, step_loss=0.0303]
Steps: 2%|▏ | 17724/1000000 [12:00:23<2531:30:53, 9.28s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [17724], local_loss=0.01671936921775341, train_loss=0.05695807933807373, time_cost=1.2003889083862305
+
Steps: 2%|▏ | 17724/1000000 [12:00:23<2531:30:53, 9.28s/it, lr=1e-5, step_loss=0.0167]
Steps: 2%|▏ | 17725/1000000 [12:00:28<2172:35:01, 7.96s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [17725], local_loss=0.003620794275775552, train_loss=0.034623753279447556, time_cost=1.7700679302215576
+
Steps: 2%|▏ | 17725/1000000 [12:00:28<2172:35:01, 7.96s/it, lr=1e-5, step_loss=0.00362]
Steps: 2%|▏ | 17726/1000000 [12:00:34<2002:30:59, 7.34s/it, lr=1e-5, step_loss=0.00362][RANK-0]: Step: [17726], local_loss=207.06552124023438, train_loss=25.915576934814453, time_cost=1.685260534286499
+
Steps: 2%|▏ | 17726/1000000 [12:00:34<2002:30:59, 7.34s/it, lr=1e-5, step_loss=207]
Steps: 2%|▏ | 17727/1000000 [12:00:43<2167:30:19, 7.94s/it, lr=1e-5, step_loss=207][RANK-0]: Step: [17727], local_loss=0.009845023043453693, train_loss=0.04647514969110489, time_cost=3.1143977642059326
+
Steps: 2%|▏ | 17727/1000000 [12:00:43<2167:30:19, 7.94s/it, lr=1e-5, step_loss=0.00985]
Steps: 2%|▏ | 17728/1000000 [12:00:55<2452:57:57, 8.99s/it, lr=1e-5, step_loss=0.00985][RANK-0]: Step: [17728], local_loss=0.007397978100925684, train_loss=0.031064406037330627, time_cost=3.918733835220337
+
Steps: 2%|▏ | 17728/1000000 [12:00:55<2452:57:57, 8.99s/it, lr=1e-5, step_loss=0.0074]
Steps: 2%|▏ | 17729/1000000 [12:01:08<2816:13:19, 10.32s/it, lr=1e-5, step_loss=0.0074][RANK-0]: Step: [17729], local_loss=0.03331111744046211, train_loss=0.03820124268531799, time_cost=3.5652031898498535
+
Steps: 2%|▏ | 17729/1000000 [12:01:08<2816:13:19, 10.32s/it, lr=1e-5, step_loss=0.0333]
Steps: 2%|▏ | 17730/1000000 [12:01:15<2561:01:28, 9.39s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [17730], local_loss=0.005144759081304073, train_loss=0.020113952457904816, time_cost=5.343270778656006
+
Steps: 2%|▏ | 17730/1000000 [12:01:15<2561:01:28, 9.39s/it, lr=1e-5, step_loss=0.00514]
Steps: 2%|▏ | 17731/1000000 [12:01:24<2498:39:50, 9.16s/it, lr=1e-5, step_loss=0.00514][RANK-0]: Step: [17731], local_loss=0.06792166829109192, train_loss=6.022210597991943, time_cost=2.7943499088287354
+
Steps: 2%|▏ | 17731/1000000 [12:01:24<2498:39:50, 9.16s/it, lr=1e-5, step_loss=0.0679]
Steps: 2%|▏ | 17732/1000000 [12:01:38<2919:55:59, 10.70s/it, lr=1e-5, step_loss=0.0679][RANK-0]: Step: [17732], local_loss=0.019374752417206764, train_loss=0.05629095435142517, time_cost=11.462693214416504
+
Steps: 2%|▏ | 17732/1000000 [12:01:38<2919:55:59, 10.70s/it, lr=1e-5, step_loss=0.0194]
Steps: 2%|▏ | 17733/1000000 [12:01:52<3149:02:34, 11.54s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [17733], local_loss=0.12219443917274475, train_loss=0.0504431426525116, time_cost=4.414273977279663
+
Steps: 2%|▏ | 17733/1000000 [12:01:52<3149:02:34, 11.54s/it, lr=1e-5, step_loss=0.122]
Steps: 2%|▏ | 17734/1000000 [12:02:00<2927:58:11, 10.73s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [17734], local_loss=0.008449320681393147, train_loss=0.014597135595977306, time_cost=6.60691499710083
+
Steps: 2%|▏ | 17734/1000000 [12:02:00<2927:58:11, 10.73s/it, lr=1e-5, step_loss=0.00845]
Steps: 2%|▏ | 17735/1000000 [12:02:08<2633:20:35, 9.65s/it, lr=1e-5, step_loss=0.00845][RANK-0]: Step: [17735], local_loss=0.04983080178499222, train_loss=0.014443510212004185, time_cost=2.8145267963409424
+
Steps: 2%|▏ | 17735/1000000 [12:02:08<2633:20:35, 9.65s/it, lr=1e-5, step_loss=0.0498]
Steps: 2%|▏ | 17736/1000000 [12:02:13<2251:12:47, 8.25s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [17736], local_loss=0.011322062462568283, train_loss=0.018249616026878357, time_cost=2.080042600631714
+
Steps: 2%|▏ | 17736/1000000 [12:02:13<2251:12:47, 8.25s/it, lr=1e-5, step_loss=0.0113]
Steps: 2%|▏ | 17737/1000000 [12:02:23<2447:40:17, 8.97s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [17737], local_loss=0.0076489816419780254, train_loss=0.020916709676384926, time_cost=1.391021966934204
+
Steps: 2%|▏ | 17737/1000000 [12:02:23<2447:40:17, 8.97s/it, lr=1e-5, step_loss=0.00765]
Steps: 2%|▏ | 17738/1000000 [12:02:37<2864:49:16, 10.50s/it, lr=1e-5, step_loss=0.00765][RANK-0]: Step: [17738], local_loss=0.008365020155906677, train_loss=0.03096969798207283, time_cost=1.2910010814666748
+
Steps: 2%|▏ | 17738/1000000 [12:02:37<2864:49:16, 10.50s/it, lr=1e-5, step_loss=0.00837]
Steps: 2%|▏ | 17739/1000000 [12:02:46<2753:38:03, 10.09s/it, lr=1e-5, step_loss=0.00837][RANK-0]: Step: [17739], local_loss=0.01740962453186512, train_loss=0.022703709080815315, time_cost=3.3686273097991943
+
Steps: 2%|▏ | 17739/1000000 [12:02:46<2753:38:03, 10.09s/it, lr=1e-5, step_loss=0.0174]
Steps: 2%|▏ | 17740/1000000 [12:02:53<2494:52:34, 9.14s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [17740], local_loss=0.04554948955774307, train_loss=0.02493904158473015, time_cost=2.5385658740997314
+
Steps: 2%|▏ | 17740/1000000 [12:02:53<2494:52:34, 9.14s/it, lr=1e-5, step_loss=0.0455]
Steps: 2%|▏ | 17741/1000000 [12:03:01<2357:18:25, 8.64s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [17741], local_loss=13.561931610107422, train_loss=1.7324719429016113, time_cost=3.2218503952026367
+
Steps: 2%|▏ | 17741/1000000 [12:03:01<2357:18:25, 8.64s/it, lr=1e-5, step_loss=13.6]
Steps: 2%|▏ | 17742/1000000 [12:03:12<2541:19:54, 9.31s/it, lr=1e-5, step_loss=13.6][RANK-0]: Step: [17742], local_loss=0.007235904689878225, train_loss=0.06913922727108002, time_cost=3.087820053100586
+
Steps: 2%|▏ | 17742/1000000 [12:03:12<2541:19:54, 9.31s/it, lr=1e-5, step_loss=0.00724]
Steps: 2%|▏ | 17743/1000000 [12:03:18<2285:52:39, 8.38s/it, lr=1e-5, step_loss=0.00724][RANK-0]: Step: [17743], local_loss=0.025568634271621704, train_loss=0.014056723564863205, time_cost=5.065240383148193
+
Steps: 2%|▏ | 17743/1000000 [12:03:18<2285:52:39, 8.38s/it, lr=1e-5, step_loss=0.0256]
Steps: 2%|▏ | 17744/1000000 [12:03:24<2099:06:26, 7.69s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [17744], local_loss=0.07691048830747604, train_loss=0.11994906514883041, time_cost=1.6705482006072998
+
Steps: 2%|▏ | 17744/1000000 [12:03:24<2099:06:26, 7.69s/it, lr=1e-5, step_loss=0.0769]
Steps: 2%|▏ | 17745/1000000 [12:03:28<1824:34:20, 6.69s/it, lr=1e-5, step_loss=0.0769][RANK-0]: Step: [17745], local_loss=0.0052384561859071255, train_loss=0.03073767013847828, time_cost=1.2963473796844482
+
Steps: 2%|▏ | 17745/1000000 [12:03:28<1824:34:20, 6.69s/it, lr=1e-5, step_loss=0.00524]
Steps: 2%|▏ | 17746/1000000 [12:03:40<2223:55:02, 8.15s/it, lr=1e-5, step_loss=0.00524][RANK-0]: Step: [17746], local_loss=0.011383934877812862, train_loss=0.08517733216285706, time_cost=3.6240503787994385
+
Steps: 2%|▏ | 17746/1000000 [12:03:40<2223:55:02, 8.15s/it, lr=1e-5, step_loss=0.0114]
Steps: 2%|▏ | 17747/1000000 [12:03:51<2474:28:23, 9.07s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [17747], local_loss=0.46812155842781067, train_loss=0.12932240962982178, time_cost=2.8029541969299316
+
Steps: 2%|▏ | 17747/1000000 [12:03:51<2474:28:23, 9.07s/it, lr=1e-5, step_loss=0.468]
Steps: 2%|▏ | 17748/1000000 [12:04:05<2911:30:12, 10.67s/it, lr=1e-5, step_loss=0.468][RANK-0]: Step: [17748], local_loss=0.00546248210594058, train_loss=0.059262461960315704, time_cost=7.0900256633758545
+
Steps: 2%|▏ | 17748/1000000 [12:04:05<2911:30:12, 10.67s/it, lr=1e-5, step_loss=0.00546]
Steps: 2%|▏ | 17749/1000000 [12:04:14<2752:29:17, 10.09s/it, lr=1e-5, step_loss=0.00546][RANK-0]: Step: [17749], local_loss=0.005353486631065607, train_loss=0.045693546533584595, time_cost=1.2300360202789307
+
Steps: 2%|▏ | 17749/1000000 [12:04:14<2752:29:17, 10.09s/it, lr=1e-5, step_loss=0.00535]
Steps: 2%|▏ | 17750/1000000 [12:04:30<3204:52:43, 11.75s/it, lr=1e-5, step_loss=0.00535][RANK-0]: Step: [17750], local_loss=0.03602418303489685, train_loss=0.07289446145296097, time_cost=7.2852091789245605
+
Steps: 2%|▏ | 17750/1000000 [12:04:30<3204:52:43, 11.75s/it, lr=1e-5, step_loss=0.036]
Steps: 2%|▏ | 17751/1000000 [12:04:37<2809:55:23, 10.30s/it, lr=1e-5, step_loss=0.036][RANK-0]: Step: [17751], local_loss=0.12031406164169312, train_loss=0.04046378657221794, time_cost=1.3022997379302979
+
Steps: 2%|▏ | 17751/1000000 [12:04:37<2809:55:23, 10.30s/it, lr=1e-5, step_loss=0.12]
Steps: 2%|▏ | 17752/1000000 [12:04:52<3229:35:20, 11.84s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [17752], local_loss=0.023625317960977554, train_loss=0.058881424367427826, time_cost=3.144958019256592
+
Steps: 2%|▏ | 17752/1000000 [12:04:52<3229:35:20, 11.84s/it, lr=1e-5, step_loss=0.0236]
Steps: 2%|▏ | 17753/1000000 [12:04:57<2624:33:04, 9.62s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [17753], local_loss=0.012324163690209389, train_loss=0.013594104908406734, time_cost=3.6650848388671875
+
Steps: 2%|▏ | 17753/1000000 [12:04:57<2624:33:04, 9.62s/it, lr=1e-5, step_loss=0.0123]
Steps: 2%|▏ | 17754/1000000 [12:05:05<2509:52:55, 9.20s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [17754], local_loss=0.025158744305372238, train_loss=0.032791461795568466, time_cost=1.7681808471679688
+
Steps: 2%|▏ | 17754/1000000 [12:05:05<2509:52:55, 9.20s/it, lr=1e-5, step_loss=0.0252]
Steps: 2%|▏ | 17755/1000000 [12:05:12<2348:01:19, 8.61s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [17755], local_loss=0.017654435709118843, train_loss=0.03601083159446716, time_cost=2.729275941848755
+
Steps: 2%|▏ | 17755/1000000 [12:05:12<2348:01:19, 8.61s/it, lr=1e-5, step_loss=0.0177]
Steps: 2%|▏ | 17756/1000000 [12:05:17<2064:39:18, 7.57s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [17756], local_loss=0.007541388273239136, train_loss=0.0772705078125, time_cost=2.4471898078918457
+
Steps: 2%|▏ | 17756/1000000 [12:05:17<2064:39:18, 7.57s/it, lr=1e-5, step_loss=0.00754]
Steps: 2%|▏ | 17757/1000000 [12:05:25<2051:36:20, 7.52s/it, lr=1e-5, step_loss=0.00754][RANK-0]: Step: [17757], local_loss=0.0055092256516218185, train_loss=0.06377927213907242, time_cost=3.2131574153900146
+
Steps: 2%|▏ | 17757/1000000 [12:05:25<2051:36:20, 7.52s/it, lr=1e-5, step_loss=0.00551]
Steps: 2%|▏ | 17758/1000000 [12:05:30<1859:12:37, 6.81s/it, lr=1e-5, step_loss=0.00551][RANK-0]: Step: [17758], local_loss=0.009015679359436035, train_loss=0.09123881906270981, time_cost=2.121546983718872
+
Steps: 2%|▏ | 17758/1000000 [12:05:30<1859:12:37, 6.81s/it, lr=1e-5, step_loss=0.00902]
Steps: 2%|▏ | 17759/1000000 [12:05:44<2466:59:49, 9.04s/it, lr=1e-5, step_loss=0.00902][RANK-0]: Step: [17759], local_loss=0.010478357784450054, train_loss=0.04577464982867241, time_cost=2.784874677658081
+
Steps: 2%|▏ | 17759/1000000 [12:05:44<2466:59:49, 9.04s/it, lr=1e-5, step_loss=0.0105]
Steps: 2%|▏ | 17760/1000000 [12:05:50<2242:29:13, 8.22s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [17760], local_loss=0.005652768537402153, train_loss=0.015098332427442074, time_cost=2.3975677490234375
+
Steps: 2%|▏ | 17760/1000000 [12:05:50<2242:29:13, 8.22s/it, lr=1e-5, step_loss=0.00565]
Steps: 2%|▏ | 17761/1000000 [12:05:58<2173:58:44, 7.97s/it, lr=1e-5, step_loss=0.00565][RANK-0]: Step: [17761], local_loss=0.010193660855293274, train_loss=0.034884512424468994, time_cost=3.0931766033172607
+
Steps: 2%|▏ | 17761/1000000 [12:05:58<2173:58:44, 7.97s/it, lr=1e-5, step_loss=0.0102]
Steps: 2%|▏ | 17762/1000000 [12:06:04<2015:12:31, 7.39s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [17762], local_loss=0.02956797555088997, train_loss=38.72224807739258, time_cost=1.6316921710968018
+
Steps: 2%|▏ | 17762/1000000 [12:06:04<2015:12:31, 7.39s/it, lr=1e-5, step_loss=0.0296]
Steps: 2%|▏ | 17763/1000000 [12:06:13<2160:13:12, 7.92s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [17763], local_loss=0.12914703786373138, train_loss=0.053539399057626724, time_cost=3.3401761054992676
+
Steps: 2%|▏ | 17763/1000000 [12:06:13<2160:13:12, 7.92s/it, lr=1e-5, step_loss=0.129]
Steps: 2%|▏ | 17764/1000000 [12:06:18<1903:13:56, 6.98s/it, lr=1e-5, step_loss=0.129][RANK-0]: Step: [17764], local_loss=0.03429046645760536, train_loss=0.056562311947345734, time_cost=1.8812637329101562
+
Steps: 2%|▏ | 17764/1000000 [12:06:18<1903:13:56, 6.98s/it, lr=1e-5, step_loss=0.0343]
Steps: 2%|▏ | 17765/1000000 [12:06:27<2077:38:45, 7.61s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [17765], local_loss=0.2883220314979553, train_loss=0.07576635479927063, time_cost=3.847858190536499
+
Steps: 2%|▏ | 17765/1000000 [12:06:27<2077:38:45, 7.61s/it, lr=1e-5, step_loss=0.288]
Steps: 2%|▏ | 17766/1000000 [12:06:31<1815:00:46, 6.65s/it, lr=1e-5, step_loss=0.288][RANK-0]: Step: [17766], local_loss=0.029515286907553673, train_loss=0.022706199437379837, time_cost=3.1690444946289062
+
Steps: 2%|▏ | 17766/1000000 [12:06:31<1815:00:46, 6.65s/it, lr=1e-5, step_loss=0.0295]
Steps: 2%|▏ | 17767/1000000 [12:06:37<1706:35:37, 6.25s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [17767], local_loss=0.06753496080636978, train_loss=0.037429288029670715, time_cost=1.7388639450073242
+
Steps: 2%|▏ | 17767/1000000 [12:06:37<1706:35:37, 6.25s/it, lr=1e-5, step_loss=0.0675]
Steps: 2%|▏ | 17768/1000000 [12:06:41<1557:25:10, 5.71s/it, lr=1e-5, step_loss=0.0675][RANK-0]: Step: [17768], local_loss=0.12254777550697327, train_loss=0.03988080471754074, time_cost=1.4760973453521729
+
Steps: 2%|▏ | 17768/1000000 [12:06:41<1557:25:10, 5.71s/it, lr=1e-5, step_loss=0.123]
Steps: 2%|▏ | 17769/1000000 [12:06:47<1554:45:04, 5.70s/it, lr=1e-5, step_loss=0.123][RANK-0]: Step: [17769], local_loss=0.03423798829317093, train_loss=0.061873823404312134, time_cost=2.958162307739258
+
Steps: 2%|▏ | 17769/1000000 [12:06:47<1554:45:04, 5.70s/it, lr=1e-5, step_loss=0.0342]
Steps: 2%|▏ | 17770/1000000 [12:07:00<2144:48:12, 7.86s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [17770], local_loss=0.008241797797381878, train_loss=0.08515619486570358, time_cost=7.039396524429321
+
Steps: 2%|▏ | 17770/1000000 [12:07:00<2144:48:12, 7.86s/it, lr=1e-5, step_loss=0.00824]
Steps: 2%|▏ | 17771/1000000 [12:07:09<2316:04:22, 8.49s/it, lr=1e-5, step_loss=0.00824][RANK-0]: Step: [17771], local_loss=0.09827540069818497, train_loss=0.03423459082841873, time_cost=2.7948222160339355
+
Steps: 2%|▏ | 17771/1000000 [12:07:09<2316:04:22, 8.49s/it, lr=1e-5, step_loss=0.0983]
Steps: 2%|▏ | 17772/1000000 [12:07:21<2534:02:45, 9.29s/it, lr=1e-5, step_loss=0.0983][RANK-0]: Step: [17772], local_loss=0.05916237086057663, train_loss=0.0697539746761322, time_cost=4.6491334438323975
+
Steps: 2%|▏ | 17772/1000000 [12:07:21<2534:02:45, 9.29s/it, lr=1e-5, step_loss=0.0592]
Steps: 2%|▏ | 17773/1000000 [12:07:27<2268:28:39, 8.31s/it, lr=1e-5, step_loss=0.0592][RANK-0]: Step: [17773], local_loss=0.02384071983397007, train_loss=0.058536604046821594, time_cost=1.7725510597229004
+
Steps: 2%|▏ | 17773/1000000 [12:07:27<2268:28:39, 8.31s/it, lr=1e-5, step_loss=0.0238]
Steps: 2%|▏ | 17774/1000000 [12:07:32<2046:06:47, 7.50s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [17774], local_loss=0.01860092207789421, train_loss=0.02757115662097931, time_cost=1.2298941612243652
+
Steps: 2%|▏ | 17774/1000000 [12:07:32<2046:06:47, 7.50s/it, lr=1e-5, step_loss=0.0186]
Steps: 2%|▏ | 17775/1000000 [12:07:41<2155:47:36, 7.90s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [17775], local_loss=0.06627220660448074, train_loss=0.053724728524684906, time_cost=1.8155174255371094
+
Steps: 2%|▏ | 17775/1000000 [12:07:41<2155:47:36, 7.90s/it, lr=1e-5, step_loss=0.0663]
Steps: 2%|▏ | 17776/1000000 [12:07:48<2096:26:56, 7.68s/it, lr=1e-5, step_loss=0.0663][RANK-0]: Step: [17776], local_loss=0.010048606432974339, train_loss=0.021251395344734192, time_cost=1.2471363544464111
+
Steps: 2%|▏ | 17776/1000000 [12:07:48<2096:26:56, 7.68s/it, lr=1e-5, step_loss=0.01]
Steps: 2%|▏ | 17777/1000000 [12:07:53<1821:06:35, 6.67s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [17777], local_loss=0.01129878405481577, train_loss=0.032179392874240875, time_cost=1.4122908115386963
+
Steps: 2%|▏ | 17777/1000000 [12:07:53<1821:06:35, 6.67s/it, lr=1e-5, step_loss=0.0113]
Steps: 2%|▏ | 17778/1000000 [12:07:58<1709:03:18, 6.26s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [17778], local_loss=0.011037365533411503, train_loss=0.06097412109375, time_cost=1.4414141178131104
+
Steps: 2%|▏ | 17778/1000000 [12:07:58<1709:03:18, 6.26s/it, lr=1e-5, step_loss=0.011]
Steps: 2%|▏ | 17779/1000000 [12:08:04<1682:24:57, 6.17s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [17779], local_loss=0.01532154344022274, train_loss=0.035333868116140366, time_cost=1.349355936050415
+
Steps: 2%|▏ | 17779/1000000 [12:08:04<1682:24:57, 6.17s/it, lr=1e-5, step_loss=0.0153]
Steps: 2%|▏ | 17780/1000000 [12:08:15<2070:04:23, 7.59s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [17780], local_loss=0.27773866057395935, train_loss=0.05878084525465965, time_cost=8.876327991485596
+
Steps: 2%|▏ | 17780/1000000 [12:08:15<2070:04:23, 7.59s/it, lr=1e-5, step_loss=0.278]
Steps: 2%|▏ | 17781/1000000 [12:08:27<2429:29:35, 8.90s/it, lr=1e-5, step_loss=0.278][RANK-0]: Step: [17781], local_loss=0.08592092245817184, train_loss=0.03830338269472122, time_cost=3.013108730316162
+
Steps: 2%|▏ | 17781/1000000 [12:08:27<2429:29:35, 8.90s/it, lr=1e-5, step_loss=0.0859]
Steps: 2%|▏ | 17782/1000000 [12:08:33<2249:19:27, 8.24s/it, lr=1e-5, step_loss=0.0859][RANK-0]: Step: [17782], local_loss=0.07495620846748352, train_loss=0.033680982887744904, time_cost=3.050671339035034
+
Steps: 2%|▏ | 17782/1000000 [12:08:33<2249:19:27, 8.24s/it, lr=1e-5, step_loss=0.075]
Steps: 2%|▏ | 17783/1000000 [12:08:41<2231:54:59, 8.18s/it, lr=1e-5, step_loss=0.075][RANK-0]: Step: [17783], local_loss=0.013039572164416313, train_loss=0.027317997068166733, time_cost=2.8936920166015625
+
Steps: 2%|▏ | 17783/1000000 [12:08:41<2231:54:59, 8.18s/it, lr=1e-5, step_loss=0.013]
Steps: 2%|▏ | 17784/1000000 [12:08:48<2121:32:47, 7.78s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [17784], local_loss=0.019086750224232674, train_loss=0.02065894566476345, time_cost=1.460425853729248
+
Steps: 2%|▏ | 17784/1000000 [12:08:48<2121:32:47, 7.78s/it, lr=1e-5, step_loss=0.0191]
Steps: 2%|▏ | 17785/1000000 [12:08:54<1961:22:20, 7.19s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [17785], local_loss=0.028321564197540283, train_loss=0.07121852785348892, time_cost=1.8746929168701172
+
Steps: 2%|▏ | 17785/1000000 [12:08:54<1961:22:20, 7.19s/it, lr=1e-5, step_loss=0.0283]
Steps: 2%|▏ | 17786/1000000 [12:09:03<2095:31:31, 7.68s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [17786], local_loss=0.004482329357415438, train_loss=0.04037110507488251, time_cost=4.032687187194824
+
Steps: 2%|▏ | 17786/1000000 [12:09:03<2095:31:31, 7.68s/it, lr=1e-5, step_loss=0.00448]
Steps: 2%|▏ | 17787/1000000 [12:09:08<1873:05:43, 6.87s/it, lr=1e-5, step_loss=0.00448][RANK-0]: Step: [17787], local_loss=0.057157646864652634, train_loss=0.025601543486118317, time_cost=1.2262918949127197
+
Steps: 2%|▏ | 17787/1000000 [12:09:08<1873:05:43, 6.87s/it, lr=1e-5, step_loss=0.0572]
Steps: 2%|▏ | 17788/1000000 [12:09:14<1836:58:51, 6.73s/it, lr=1e-5, step_loss=0.0572][RANK-0]: Step: [17788], local_loss=0.06794317811727524, train_loss=0.04469263553619385, time_cost=1.9679598808288574
+
Steps: 2%|▏ | 17788/1000000 [12:09:14<1836:58:51, 6.73s/it, lr=1e-5, step_loss=0.0679]
Steps: 2%|▏ | 17789/1000000 [12:09:28<2428:23:25, 8.90s/it, lr=1e-5, step_loss=0.0679][RANK-0]: Step: [17789], local_loss=0.004351608920842409, train_loss=0.0827290266752243, time_cost=5.4169347286224365
+
Steps: 2%|▏ | 17789/1000000 [12:09:28<2428:23:25, 8.90s/it, lr=1e-5, step_loss=0.00435]
Steps: 2%|▏ | 17790/1000000 [12:09:38<2456:53:05, 9.00s/it, lr=1e-5, step_loss=0.00435][RANK-0]: Step: [17790], local_loss=0.022494828328490257, train_loss=0.01862930878996849, time_cost=3.2436704635620117
+
Steps: 2%|▏ | 17790/1000000 [12:09:38<2456:53:05, 9.00s/it, lr=1e-5, step_loss=0.0225]
Steps: 2%|▏ | 17791/1000000 [12:09:48<2600:15:00, 9.53s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [17791], local_loss=0.01872924156486988, train_loss=26.709671020507812, time_cost=1.7417409420013428
+
Steps: 2%|▏ | 17791/1000000 [12:09:48<2600:15:00, 9.53s/it, lr=1e-5, step_loss=0.0187]
Steps: 2%|▏ | 17792/1000000 [12:09:54<2288:44:46, 8.39s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [17792], local_loss=0.05966584011912346, train_loss=0.022970564663410187, time_cost=3.0428884029388428
+
Steps: 2%|▏ | 17792/1000000 [12:09:54<2288:44:46, 8.39s/it, lr=1e-5, step_loss=0.0597]
Steps: 2%|▏ | 17793/1000000 [12:10:08<2744:49:38, 10.06s/it, lr=1e-5, step_loss=0.0597][RANK-0]: Step: [17793], local_loss=0.046586085110902786, train_loss=0.06898795068264008, time_cost=5.870790481567383
+
Steps: 2%|▏ | 17793/1000000 [12:10:08<2744:49:38, 10.06s/it, lr=1e-5, step_loss=0.0466]
Steps: 2%|▏ | 17794/1000000 [12:10:14<2404:21:14, 8.81s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [17794], local_loss=0.024584505707025528, train_loss=0.0621311217546463, time_cost=1.6172146797180176
+
Steps: 2%|▏ | 17794/1000000 [12:10:14<2404:21:14, 8.81s/it, lr=1e-5, step_loss=0.0246]
Steps: 2%|▏ | 17795/1000000 [12:10:22<2321:12:46, 8.51s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [17795], local_loss=0.035835716873407364, train_loss=0.03920796513557434, time_cost=3.4277496337890625
+
Steps: 2%|▏ | 17795/1000000 [12:10:22<2321:12:46, 8.51s/it, lr=1e-5, step_loss=0.0358]
Steps: 2%|▏ | 17796/1000000 [12:10:29<2250:30:41, 8.25s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [17796], local_loss=0.007357222959399223, train_loss=0.031001776456832886, time_cost=2.0762734413146973
+
Steps: 2%|▏ | 17796/1000000 [12:10:29<2250:30:41, 8.25s/it, lr=1e-5, step_loss=0.00736]
Steps: 2%|▏ | 17797/1000000 [12:10:35<2052:34:08, 7.52s/it, lr=1e-5, step_loss=0.00736][RANK-0]: Step: [17797], local_loss=0.008866171352565289, train_loss=0.03726955130696297, time_cost=1.3261864185333252
+
Steps: 2%|▏ | 17797/1000000 [12:10:35<2052:34:08, 7.52s/it, lr=1e-5, step_loss=0.00887]
Steps: 2%|▏ | 17798/1000000 [12:10:51<2736:53:56, 10.03s/it, lr=1e-5, step_loss=0.00887][RANK-0]: Step: [17798], local_loss=0.04187912866473198, train_loss=0.04300326853990555, time_cost=8.032742500305176
+
Steps: 2%|▏ | 17798/1000000 [12:10:51<2736:53:56, 10.03s/it, lr=1e-5, step_loss=0.0419]
Steps: 2%|▏ | 17799/1000000 [12:10:56<2358:04:00, 8.64s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [17799], local_loss=0.007815445773303509, train_loss=0.04341348260641098, time_cost=1.5701358318328857
+
Steps: 2%|▏ | 17799/1000000 [12:10:56<2358:04:00, 8.64s/it, lr=1e-5, step_loss=0.00782]
Steps: 2%|▏ | 17800/1000000 [12:11:01<2022:12:03, 7.41s/it, lr=1e-5, step_loss=0.00782][RANK-0]: Step: [17800], local_loss=0.00916201900690794, train_loss=0.02039751037955284, time_cost=1.2264835834503174
+
Steps: 2%|▏ | 17800/1000000 [12:11:01<2022:12:03, 7.41s/it, lr=1e-5, step_loss=0.00916]
Steps: 2%|▏ | 17801/1000000 [12:11:11<2273:35:42, 8.33s/it, lr=1e-5, step_loss=0.00916][RANK-0]: Step: [17801], local_loss=0.02156904526054859, train_loss=0.115391805768013, time_cost=1.205371379852295
+
Steps: 2%|▏ | 17801/1000000 [12:11:11<2273:35:42, 8.33s/it, lr=1e-5, step_loss=0.0216]
Steps: 2%|▏ | 17802/1000000 [12:11:18<2164:27:02, 7.93s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [17802], local_loss=0.004594660364091396, train_loss=0.021291812881827354, time_cost=2.5934770107269287
+
Steps: 2%|▏ | 17802/1000000 [12:11:18<2164:27:02, 7.93s/it, lr=1e-5, step_loss=0.00459]
Steps: 2%|▏ | 17803/1000000 [12:11:25<2012:23:10, 7.38s/it, lr=1e-5, step_loss=0.00459][RANK-0]: Step: [17803], local_loss=0.03594376519322395, train_loss=0.038773808628320694, time_cost=1.6675641536712646
+
Steps: 2%|▏ | 17803/1000000 [12:11:25<2012:23:10, 7.38s/it, lr=1e-5, step_loss=0.0359]
Steps: 2%|▏ | 17804/1000000 [12:11:30<1845:51:37, 6.77s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [17804], local_loss=0.006019848398864269, train_loss=0.034950654953718185, time_cost=4.120055913925171
+
Steps: 2%|▏ | 17804/1000000 [12:11:30<1845:51:37, 6.77s/it, lr=1e-5, step_loss=0.00602]
Steps: 2%|▏ | 17805/1000000 [12:11:35<1715:58:38, 6.29s/it, lr=1e-5, step_loss=0.00602][RANK-0]: Step: [17805], local_loss=0.23904366791248322, train_loss=0.06796103715896606, time_cost=2.2500829696655273
+
Steps: 2%|▏ | 17805/1000000 [12:11:35<1715:58:38, 6.29s/it, lr=1e-5, step_loss=0.239]
Steps: 2%|▏ | 17806/1000000 [12:11:45<1990:29:15, 7.30s/it, lr=1e-5, step_loss=0.239][RANK-0]: Step: [17806], local_loss=0.01092881616204977, train_loss=0.027949947863817215, time_cost=3.6962668895721436
+
Steps: 2%|▏ | 17806/1000000 [12:11:45<1990:29:15, 7.30s/it, lr=1e-5, step_loss=0.0109]
Steps: 2%|▏ | 17807/1000000 [12:12:00<2650:59:56, 9.72s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [17807], local_loss=0.045539140701293945, train_loss=0.024273261427879333, time_cost=7.471825122833252
+
Steps: 2%|▏ | 17807/1000000 [12:12:00<2650:59:56, 9.72s/it, lr=1e-5, step_loss=0.0455]
Steps: 2%|▏ | 17808/1000000 [12:12:09<2583:25:02, 9.47s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [17808], local_loss=0.019188852980732918, train_loss=0.016989856958389282, time_cost=3.291396141052246
+
Steps: 2%|▏ | 17808/1000000 [12:12:09<2583:25:02, 9.47s/it, lr=1e-5, step_loss=0.0192]
Steps: 2%|▏ | 17809/1000000 [12:12:20<2721:28:02, 9.97s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [17809], local_loss=0.00798437837511301, train_loss=0.025113970041275024, time_cost=3.6806302070617676
+
Steps: 2%|▏ | 17809/1000000 [12:12:20<2721:28:02, 9.97s/it, lr=1e-5, step_loss=0.00798]
Steps: 2%|▏ | 17810/1000000 [12:12:29<2657:54:19, 9.74s/it, lr=1e-5, step_loss=0.00798][RANK-0]: Step: [17810], local_loss=0.037119895219802856, train_loss=0.07624085992574692, time_cost=3.2360925674438477
+
Steps: 2%|▏ | 17810/1000000 [12:12:29<2657:54:19, 9.74s/it, lr=1e-5, step_loss=0.0371]
Steps: 2%|▏ | 17811/1000000 [12:12:44<3067:46:11, 11.24s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [17811], local_loss=0.04690815135836601, train_loss=0.04374924302101135, time_cost=5.312042951583862
+
Steps: 2%|▏ | 17811/1000000 [12:12:44<3067:46:11, 11.24s/it, lr=1e-5, step_loss=0.0469]
Steps: 2%|▏ | 17812/1000000 [12:12:59<3333:09:06, 12.22s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [17812], local_loss=0.11408664286136627, train_loss=0.027209322899580002, time_cost=3.230175256729126
+
Steps: 2%|▏ | 17812/1000000 [12:12:59<3333:09:06, 12.22s/it, lr=1e-5, step_loss=0.114]
Steps: 2%|▏ | 17813/1000000 [12:13:04<2751:32:10, 10.09s/it, lr=1e-5, step_loss=0.114][RANK-0]: Step: [17813], local_loss=0.02011777088046074, train_loss=0.1498890221118927, time_cost=3.900874614715576
+
Steps: 2%|▏ | 17813/1000000 [12:13:04<2751:32:10, 10.09s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 17814/1000000 [12:13:08<2306:17:11, 8.45s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [17814], local_loss=0.1422896534204483, train_loss=0.049924205988645554, time_cost=2.2997548580169678
+
Steps: 2%|▏ | 17814/1000000 [12:13:08<2306:17:11, 8.45s/it, lr=1e-5, step_loss=0.142]
Steps: 2%|▏ | 17815/1000000 [12:13:22<2749:03:13, 10.08s/it, lr=1e-5, step_loss=0.142][RANK-0]: Step: [17815], local_loss=0.008919292129576206, train_loss=0.03117399476468563, time_cost=5.8292601108551025
+
Steps: 2%|▏ | 17815/1000000 [12:13:22<2749:03:13, 10.08s/it, lr=1e-5, step_loss=0.00892]
Steps: 2%|▏ | 17816/1000000 [12:13:30<2553:15:21, 9.36s/it, lr=1e-5, step_loss=0.00892][RANK-0]: Step: [17816], local_loss=0.01774296909570694, train_loss=0.02154868096113205, time_cost=1.8579258918762207
+
Steps: 2%|▏ | 17816/1000000 [12:13:30<2553:15:21, 9.36s/it, lr=1e-5, step_loss=0.0177]
Steps: 2%|▏ | 17817–17999/1000000 [12:13:41→12:40:49, 6.1–12.6s/it, lr=1e-5]
[RANK-0] per-step entries condensed: local_loss mostly 0.003–0.19, with spikes of 0.499 (step 17830) and ~1.0 (steps 17878, 17948, 17992, 17999); train_loss mostly 0.0075–0.29, with outliers 13.31 (step 17831) and 22.46 (step 17923); time_cost 1.2–9.7s per step; duplicate tqdm refresh lines and blank separators omitted.
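The duplicate refresh lines noted above come from a tqdm-style progress bar rewriting itself with a carriage return; once stdout is captured to a log file, every rewrite survives as its own `Steps: ...` line. A minimal sketch of a loop that produces this shape, assuming tqdm in manual-update mode (the loss values are random stand-ins, not taken from the training script):

```python
# Minimal sketch, assuming a tqdm bar in manual-update mode; the loss
# values are random stand-ins, not the training script's. When stdout is
# piped to a log file, every carriage-return refresh of this bar becomes
# its own "Steps: ..." line, duplicating each step's entry.
import random
from tqdm import tqdm

progress_bar = tqdm(initial=17817, total=1_000_000, desc="Steps")
for step in range(17817, 17827):               # a few steps, for illustration
    step_loss = random.uniform(0.003, 0.05)    # stand-in for the real loss
    progress_bar.set_postfix(lr="1e-5", step_loss=round(step_loss, 4))
    progress_bar.update(1)
progress_bar.close()
```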
Steps: 2%|▏ | 18000/1000000 [12:40:59<2250:49:33, 8.25s/it, lr=1e-5, step_loss=0.986][RANK-0]: Step: [18000], local_loss=0.2826196551322937, train_loss=0.09152820706367493, time_cost=1.7744417190551758
+09/18/2024 22:05:01 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1/checkpoint-18000
+09/18/2024 22:05:01 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-18 22:05:01,894] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-18 22:05:01,924] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-18 22:05:01,924] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-18 22:05:19,936] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-18 22:05:19,951] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/pytorch_model/bf16_zero_pp_rank_{0..7}_mp_rank_00_optim_states.pt...
+[2024-09-18 22:05:53,876 – 22:05:56,193] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved for ranks 0–7; [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/18/2024 22:05:56 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/pytorch_model
+{'norm_num_groups', 'dropout', 'use_additional_conditions'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/model/diffusion_pytorch_model.safetensors
+09/18/2024 22:06:59 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/scheduler.bin
+09/18/2024 22:06:59 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/sampler.bin
+09/18/2024 22:06:59 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/random_states_0.pkl
+09/18/2024 22:06:59 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1/checkpoint-18000
+
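The `bf16_zero_pp_rank_*_mp_rank_00_optim_states.pt` files saved above are DeepSpeed ZeRO shards, one per data-parallel rank, while the `model/` and `model_ema/` safetensors are written separately by diffusers. A minimal sketch of consolidating such a sharded checkpoint into a single fp32 state dict with DeepSpeed's stock `zero_to_fp32` utility follows; the path and the `pytorch_model` tag simply mirror the log above and are illustrative, not an API of this training script:

```python
# Minimal consolidation sketch (assumption: standard DeepSpeed ZeRO layout,
# with shards under <checkpoint>/pytorch_model as in the log above).
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

ckpt_root = "/home/save_dir/runs/allinpaint_stage1/checkpoint-18000"
# Merges the bf16_zero_pp_rank_*_mp_rank_00_optim_states.pt shards into fp32 weights.
state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_root, tag="pytorch_model")
torch.save(state_dict, f"{ckpt_root}/consolidated_fp32.pt")
```

DeepSpeed typically also drops a standalone `zero_to_fp32.py` next to its checkpoints that performs the same merge from the command line.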
Steps: 2%|▏ | 18000/1000000 [12:42:56<2250:49:33, 8.25s/it, lr=1e-5, step_loss=0.283]
Steps: 2%|▏ | 18001/1000000 [12:43:00<11537:47:47, 42.30s/it, lr=1e-5, step_loss=0.283][RANK-0]: Step: [18001], local_loss=0.058915164321660995, train_loss=0.042529698461294174, time_cost=1.3276517391204834
+
Steps: 2%|▏ | 18001/1000000 [12:43:00<11537:47:47, 42.30s/it, lr=1e-5, step_loss=0.0589]
Steps: 2%|▏ | 18002/1000000 [12:43:12<9061:56:23, 33.22s/it, lr=1e-5, step_loss=0.0589] [RANK-0]: Step: [18002], local_loss=0.008859915658831596, train_loss=0.019945882260799408, time_cost=5.942732572555542
+
Steps: 2%|▏ | 18002/1000000 [12:43:12<9061:56:23, 33.22s/it, lr=1e-5, step_loss=0.00886]
Steps: 2%|▏ | 18003/1000000 [12:43:21<7085:20:01, 25.97s/it, lr=1e-5, step_loss=0.00886][RANK-0]: Step: [18003], local_loss=0.07131955772638321, train_loss=0.03823030740022659, time_cost=1.6855108737945557
+
Steps: 2%|▏ | 18003/1000000 [12:43:21<7085:20:01, 25.97s/it, lr=1e-5, step_loss=0.0713]
Steps: 2%|▏ | 18004/1000000 [12:43:26<5368:39:12, 19.68s/it, lr=1e-5, step_loss=0.0713][RANK-0]: Step: [18004], local_loss=0.03430434316396713, train_loss=0.04339243099093437, time_cost=1.3135874271392822
+
Steps: 2%|▏ | 18004/1000000 [12:43:26<5368:39:12, 19.68s/it, lr=1e-5, step_loss=0.0343]
Steps: 2%|▏ | 18005/1000000 [12:43:31<4128:41:51, 15.14s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [18005], local_loss=0.011130310595035553, train_loss=0.03223086893558502, time_cost=1.7480583190917969
+
Steps: 2%|▏ | 18005/1000000 [12:43:31<4128:41:51, 15.14s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 18006/1000000 [12:43:44<3930:49:48, 14.41s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [18006], local_loss=0.003618052462115884, train_loss=0.03826231509447098, time_cost=3.950289249420166
+
Steps: 2%|▏ | 18006/1000000 [12:43:44<3930:49:48, 14.41s/it, lr=1e-5, step_loss=0.00362]
Steps: 2%|▏ | 18007/1000000 [12:43:51<3372:32:20, 12.36s/it, lr=1e-5, step_loss=0.00362][RANK-0]: Step: [18007], local_loss=0.040771834552288055, train_loss=0.04030274599790573, time_cost=3.7671728134155273
+
Steps: 2%|▏ | 18007/1000000 [12:43:51<3372:32:20, 12.36s/it, lr=1e-5, step_loss=0.0408]
Steps: 2%|▏ | 18008/1000000 [12:43:56<2786:09:09, 10.21s/it, lr=1e-5, step_loss=0.0408][RANK-0]: Step: [18008], local_loss=0.050718165934085846, train_loss=0.028998032212257385, time_cost=2.237872838973999
+
Steps: 2%|▏ | 18008/1000000 [12:43:56<2786:09:09, 10.21s/it, lr=1e-5, step_loss=0.0507]
Steps: 2%|▏ | 18009/1000000 [12:44:10<3097:57:33, 11.36s/it, lr=1e-5, step_loss=0.0507][RANK-0]: Step: [18009], local_loss=0.015151206403970718, train_loss=0.0701194629073143, time_cost=5.744014739990234
+
Steps: 2%|▏ | 18009/1000000 [12:44:10<3097:57:33, 11.36s/it, lr=1e-5, step_loss=0.0152]
Steps: 2%|▏ | 18010/1000000 [12:44:27<3521:56:10, 12.91s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [18010], local_loss=0.01030721515417099, train_loss=0.022150501608848572, time_cost=8.196459293365479
+
Steps: 2%|▏ | 18010/1000000 [12:44:27<3521:56:10, 12.91s/it, lr=1e-5, step_loss=0.0103]
Steps: 2%|▏ | 18011/1000000 [12:44:41<3642:57:09, 13.36s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [18011], local_loss=0.006005221977829933, train_loss=0.01165944617241621, time_cost=6.789825439453125
+
Steps: 2%|▏ | 18011/1000000 [12:44:41<3642:57:09, 13.36s/it, lr=1e-5, step_loss=0.00601]
Steps: 2%|▏ | 18012/1000000 [12:44:57<3861:17:58, 14.16s/it, lr=1e-5, step_loss=0.00601][RANK-0]: Step: [18012], local_loss=0.02279116027057171, train_loss=0.032908692955970764, time_cost=6.75730562210083
+
Steps: 2%|▏ | 18012/1000000 [12:44:57<3861:17:58, 14.16s/it, lr=1e-5, step_loss=0.0228]
Steps: 2%|▏ | 18013/1000000 [12:45:04<3263:05:12, 11.96s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [18013], local_loss=0.005707903299480677, train_loss=0.012488391250371933, time_cost=2.32489275932312
+
Steps: 2%|▏ | 18013/1000000 [12:45:04<3263:05:12, 11.96s/it, lr=1e-5, step_loss=0.00571]
Steps: 2%|▏ | 18014/1000000 [12:45:15<3177:05:49, 11.65s/it, lr=1e-5, step_loss=0.00571][RANK-0]: Step: [18014], local_loss=0.025022296234965324, train_loss=0.02358689159154892, time_cost=1.813185214996338
+
Steps: 2%|▏ | 18014/1000000 [12:45:15<3177:05:49, 11.65s/it, lr=1e-5, step_loss=0.025]
Steps: 2%|▏ | 18015/1000000 [12:45:22<2814:04:25, 10.32s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [18015], local_loss=0.04358759522438049, train_loss=0.04642503336071968, time_cost=6.021589279174805
+
Steps: 2%|▏ | 18015/1000000 [12:45:22<2814:04:25, 10.32s/it, lr=1e-5, step_loss=0.0436]
Steps: 2%|▏ | 18016/1000000 [12:45:32<2722:47:33, 9.98s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [18016], local_loss=0.017075112089514732, train_loss=0.03646958991885185, time_cost=2.1862940788269043
+
Steps: 2%|▏ | 18016/1000000 [12:45:32<2722:47:33, 9.98s/it, lr=1e-5, step_loss=0.0171]
Steps: 2%|▏ | 18017/1000000 [12:45:38<2464:50:47, 9.04s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [18017], local_loss=0.030798476189374924, train_loss=0.04960236698389053, time_cost=1.3446087837219238
+
Steps: 2%|▏ | 18017/1000000 [12:45:38<2464:50:47, 9.04s/it, lr=1e-5, step_loss=0.0308]
Steps: 2%|▏ | 18018/1000000 [12:45:46<2310:56:22, 8.47s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [18018], local_loss=0.004002555273473263, train_loss=0.06308402866125107, time_cost=2.7150473594665527
+
Steps: 2%|▏ | 18018/1000000 [12:45:46<2310:56:22, 8.47s/it, lr=1e-5, step_loss=0.004]
Steps: 2%|▏ | 18019/1000000 [12:45:56<2501:54:47, 9.17s/it, lr=1e-5, step_loss=0.004][RANK-0]: Step: [18019], local_loss=0.04872378334403038, train_loss=0.04423556476831436, time_cost=3.8600544929504395
+
Steps: 2%|▏ | 18019/1000000 [12:45:56<2501:54:47, 9.17s/it, lr=1e-5, step_loss=0.0487]
Steps: 2%|▏ | 18020/1000000 [12:46:01<2096:50:06, 7.69s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [18020], local_loss=0.014505078084766865, train_loss=0.016161687672138214, time_cost=1.398451328277588
+
Steps: 2%|▏ | 18020/1000000 [12:46:01<2096:50:06, 7.69s/it, lr=1e-5, step_loss=0.0145]
Steps: 2%|▏ | 18021/1000000 [12:46:11<2361:02:55, 8.66s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [18021], local_loss=0.0055577801540493965, train_loss=0.07832854986190796, time_cost=4.583942174911499
+
Steps: 2%|▏ | 18021/1000000 [12:46:11<2361:02:55, 8.66s/it, lr=1e-5, step_loss=0.00556]
Steps: 2%|▏ | 18022/1000000 [12:46:18<2200:23:05, 8.07s/it, lr=1e-5, step_loss=0.00556][RANK-0]: Step: [18022], local_loss=0.07415978610515594, train_loss=0.0416533499956131, time_cost=1.5053975582122803
+
Steps: 2%|▏ | 18022/1000000 [12:46:18<2200:23:05, 8.07s/it, lr=1e-5, step_loss=0.0742]
Steps: 2%|▏ | 18023/1000000 [12:46:25<2134:34:49, 7.83s/it, lr=1e-5, step_loss=0.0742][RANK-0]: Step: [18023], local_loss=0.012862570583820343, train_loss=0.028047718107700348, time_cost=1.2392005920410156
+
Steps: 2%|▏ | 18023/1000000 [12:46:25<2134:34:49, 7.83s/it, lr=1e-5, step_loss=0.0129]
Steps: 2%|▏ | 18024/1000000 [12:46:32<2008:16:07, 7.36s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [18024], local_loss=0.05123762786388397, train_loss=0.03347780182957649, time_cost=1.208810806274414
+
Steps: 2%|▏ | 18024/1000000 [12:46:32<2008:16:07, 7.36s/it, lr=1e-5, step_loss=0.0512]
Steps: 2%|▏ | 18025/1000000 [12:46:37<1872:41:40, 6.87s/it, lr=1e-5, step_loss=0.0512][RANK-0]: Step: [18025], local_loss=0.009605539962649345, train_loss=0.03378564119338989, time_cost=3.1401782035827637
+
Steps: 2%|▏ | 18025/1000000 [12:46:37<1872:41:40, 6.87s/it, lr=1e-5, step_loss=0.00961]
Steps: 2%|▏ | 18026/1000000 [12:46:42<1675:18:11, 6.14s/it, lr=1e-5, step_loss=0.00961][RANK-0]: Step: [18026], local_loss=0.008328273892402649, train_loss=0.03691964969038963, time_cost=1.7119956016540527
+
Steps: 2%|▏ | 18026/1000000 [12:46:42<1675:18:11, 6.14s/it, lr=1e-5, step_loss=0.00833]
Steps: 2%|▏ | 18027/1000000 [12:46:46<1509:58:30, 5.54s/it, lr=1e-5, step_loss=0.00833][RANK-0]: Step: [18027], local_loss=0.02014363370835781, train_loss=0.03287212550640106, time_cost=1.2471060752868652
+
Steps: 2%|▏ | 18027/1000000 [12:46:46<1509:58:30, 5.54s/it, lr=1e-5, step_loss=0.0201]
Steps: 2%|▏ | 18028/1000000 [12:46:53<1610:52:51, 5.91s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [18028], local_loss=0.008984294719994068, train_loss=0.01831599697470665, time_cost=2.420081853866577
+
Steps: 2%|▏ | 18028/1000000 [12:46:53<1610:52:51, 5.91s/it, lr=1e-5, step_loss=0.00898]
Steps: 2%|▏ | 18029/1000000 [12:46:57<1501:37:01, 5.51s/it, lr=1e-5, step_loss=0.00898][RANK-0]: Step: [18029], local_loss=0.06319669634103775, train_loss=0.022128982469439507, time_cost=1.8749423027038574
+
Steps: 2%|▏ | 18029/1000000 [12:46:57<1501:37:01, 5.51s/it, lr=1e-5, step_loss=0.0632]
Steps: 2%|▏ | 18030/1000000 [12:47:08<1901:22:22, 6.97s/it, lr=1e-5, step_loss=0.0632][RANK-0]: Step: [18030], local_loss=0.05718492344021797, train_loss=0.02071573957800865, time_cost=5.089327335357666
+
Steps: 2%|▏ | 18030/1000000 [12:47:08<1901:22:22, 6.97s/it, lr=1e-5, step_loss=0.0572]
Steps: 2%|▏ | 18031/1000000 [12:47:14<1817:54:02, 6.66s/it, lr=1e-5, step_loss=0.0572][RANK-0]: Step: [18031], local_loss=0.06323881447315216, train_loss=0.021843431517481804, time_cost=4.817842245101929
+
Steps: 2%|▏ | 18031/1000000 [12:47:14<1817:54:02, 6.66s/it, lr=1e-5, step_loss=0.0632]
Steps: 2%|▏ | 18032/1000000 [12:47:29<2513:17:02, 9.21s/it, lr=1e-5, step_loss=0.0632][RANK-0]: Step: [18032], local_loss=0.039892349392175674, train_loss=0.03044910542666912, time_cost=5.664201736450195
+
Steps: 2%|▏ | 18032/1000000 [12:47:29<2513:17:02, 9.21s/it, lr=1e-5, step_loss=0.0399]
Steps: 2%|▏ | 18033/1000000 [12:47:33<2115:08:57, 7.75s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [18033], local_loss=0.02000107429921627, train_loss=0.06298111379146576, time_cost=1.6555836200714111
+
Steps: 2%|▏ | 18033/1000000 [12:47:33<2115:08:57, 7.75s/it, lr=1e-5, step_loss=0.02]
Steps: 2%|▏ | 18034/1000000 [12:47:45<2441:10:59, 8.95s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [18034], local_loss=0.05426398664712906, train_loss=0.03662284463644028, time_cost=2.2220921516418457
+
Steps: 2%|▏ | 18034/1000000 [12:47:45<2441:10:59, 8.95s/it, lr=1e-5, step_loss=0.0543]
Steps: 2%|▏ | 18035/1000000 [12:47:58<2752:45:17, 10.09s/it, lr=1e-5, step_loss=0.0543][RANK-0]: Step: [18035], local_loss=0.011855942197144032, train_loss=0.01787715218961239, time_cost=3.6288907527923584
+
Steps: 2%|▏ | 18035/1000000 [12:47:58<2752:45:17, 10.09s/it, lr=1e-5, step_loss=0.0119]
Steps: 2%|▏ | 18036/1000000 [12:48:07<2670:49:53, 9.79s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [18036], local_loss=0.008690639398992062, train_loss=0.009881878271698952, time_cost=1.3139398097991943
+
Steps: 2%|▏ | 18036/1000000 [12:48:07<2670:49:53, 9.79s/it, lr=1e-5, step_loss=0.00869]
Steps: 2%|▏ | 18037/1000000 [12:48:16<2610:35:55, 9.57s/it, lr=1e-5, step_loss=0.00869][RANK-0]: Step: [18037], local_loss=0.007312421221286058, train_loss=0.04582776501774788, time_cost=1.5782678127288818
+
Steps: 2%|▏ | 18037/1000000 [12:48:16<2610:35:55, 9.57s/it, lr=1e-5, step_loss=0.00731]
Steps: 2%|▏ | 18038/1000000 [12:48:20<2178:27:58, 7.99s/it, lr=1e-5, step_loss=0.00731][RANK-0]: Step: [18038], local_loss=0.08081024885177612, train_loss=0.027186671271920204, time_cost=1.4293100833892822
+
Steps: 2%|▏ | 18038/1000000 [12:48:20<2178:27:58, 7.99s/it, lr=1e-5, step_loss=0.0808]
Steps: 2%|▏ | 18039/1000000 [12:48:32<2472:37:05, 9.06s/it, lr=1e-5, step_loss=0.0808][RANK-0]: Step: [18039], local_loss=0.011053778231143951, train_loss=0.029493030160665512, time_cost=2.951946973800659
+
Steps: 2%|▏ | 18039/1000000 [12:48:32<2472:37:05, 9.06s/it, lr=1e-5, step_loss=0.0111]
Steps: 2%|▏ | 18040/1000000 [12:48:43<2642:29:26, 9.69s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [18040], local_loss=0.0059082466177642345, train_loss=0.019201867282390594, time_cost=3.223776340484619
+
Steps: 2%|▏ | 18040/1000000 [12:48:43<2642:29:26, 9.69s/it, lr=1e-5, step_loss=0.00591]
Steps: 2%|▏ | 18041/1000000 [12:48:55<2831:46:23, 10.38s/it, lr=1e-5, step_loss=0.00591][RANK-0]: Step: [18041], local_loss=0.015649812296032906, train_loss=0.021258899942040443, time_cost=5.0774149894714355
+
Steps: 2%|▏ | 18041/1000000 [12:48:55<2831:46:23, 10.38s/it, lr=1e-5, step_loss=0.0156]
Steps: 2%|▏ | 18042/1000000 [12:49:01<2470:04:50, 9.06s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [18042], local_loss=0.01978405937552452, train_loss=0.024731913581490517, time_cost=1.2925238609313965
+
Steps: 2%|▏ | 18042/1000000 [12:49:01<2470:04:50, 9.06s/it, lr=1e-5, step_loss=0.0198]
Steps: 2%|▏ | 18043/1000000 [12:49:06<2128:22:35, 7.80s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [18043], local_loss=0.08231678605079651, train_loss=0.06900531053543091, time_cost=1.6266415119171143
+
Steps: 2%|▏ | 18043/1000000 [12:49:06<2128:22:35, 7.80s/it, lr=1e-5, step_loss=0.0823]
Steps: 2%|▏ | 18044/1000000 [12:49:12<1992:54:44, 7.31s/it, lr=1e-5, step_loss=0.0823][RANK-0]: Step: [18044], local_loss=0.03690468147397041, train_loss=0.04606448486447334, time_cost=2.1729373931884766
+
Steps: 2%|▏ | 18044/1000000 [12:49:12<1992:54:44, 7.31s/it, lr=1e-5, step_loss=0.0369]
Steps: 2%|▏ | 18045/1000000 [12:49:16<1770:26:12, 6.49s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [18045], local_loss=0.013864520937204361, train_loss=0.03245900571346283, time_cost=1.4529023170471191
+
Steps: 2%|▏ | 18045/1000000 [12:49:16<1770:26:12, 6.49s/it, lr=1e-5, step_loss=0.0139]
Steps: 2%|▏ | 18046/1000000 [12:49:23<1812:33:57, 6.65s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [18046], local_loss=0.005847637075930834, train_loss=0.0473075732588768, time_cost=2.939220905303955
+
Steps: 2%|▏ | 18046/1000000 [12:49:23<1812:33:57, 6.65s/it, lr=1e-5, step_loss=0.00585]
Steps: 2%|▏ | 18047/1000000 [12:49:29<1749:15:32, 6.41s/it, lr=1e-5, step_loss=0.00585][RANK-0]: Step: [18047], local_loss=0.00810939259827137, train_loss=0.01878335326910019, time_cost=1.5848615169525146
+
Steps: 2%|▏ | 18047/1000000 [12:49:29<1749:15:32, 6.41s/it, lr=1e-5, step_loss=0.00811]
Steps: 2%|▏ | 18048/1000000 [12:49:35<1660:34:19, 6.09s/it, lr=1e-5, step_loss=0.00811][RANK-0]: Step: [18048], local_loss=0.022642342373728752, train_loss=0.048088423907756805, time_cost=1.6891670227050781
+
Steps: 2%|▏ | 18048/1000000 [12:49:35<1660:34:19, 6.09s/it, lr=1e-5, step_loss=0.0226]
Steps: 2%|▏ | 18049/1000000 [12:49:41<1669:04:09, 6.12s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [18049], local_loss=0.014340002089738846, train_loss=0.03456432744860649, time_cost=1.8598599433898926
+
Steps: 2%|▏ | 18049/1000000 [12:49:41<1669:04:09, 6.12s/it, lr=1e-5, step_loss=0.0143]
Steps: 2%|▏ | 18050/1000000 [12:49:53<2194:28:43, 8.05s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [18050], local_loss=0.028113946318626404, train_loss=0.02064250223338604, time_cost=3.048551082611084
+
Steps: 2%|▏ | 18050/1000000 [12:49:53<2194:28:43, 8.05s/it, lr=1e-5, step_loss=0.0281]
Steps: 2%|▏ | 18051/1000000 [12:49:59<2013:06:51, 7.38s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [18051], local_loss=0.007679094094783068, train_loss=0.021281711757183075, time_cost=3.516563892364502
+
Steps: 2%|▏ | 18051/1000000 [12:49:59<2013:06:51, 7.38s/it, lr=1e-5, step_loss=0.00768]
Steps: 2%|▏ | 18052/1000000 [12:50:05<1860:15:23, 6.82s/it, lr=1e-5, step_loss=0.00768][RANK-0]: Step: [18052], local_loss=0.008103631436824799, train_loss=0.03568551316857338, time_cost=3.217022180557251
+
Steps: 2%|▏ | 18052/1000000 [12:50:05<1860:15:23, 6.82s/it, lr=1e-5, step_loss=0.0081]
Steps: 2%|▏ | 18053/1000000 [12:50:16<2245:26:31, 8.23s/it, lr=1e-5, step_loss=0.0081][RANK-0]: Step: [18053], local_loss=0.013117148540914059, train_loss=0.06232758238911629, time_cost=2.9898200035095215
+
Steps: 2%|▏ | 18053/1000000 [12:50:16<2245:26:31, 8.23s/it, lr=1e-5, step_loss=0.0131]
Steps: 2%|▏ | 18054/1000000 [12:50:27<2420:53:22, 8.88s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [18054], local_loss=0.008157162927091122, train_loss=0.0271468423306942, time_cost=2.4291481971740723
+
Steps: 2%|▏ | 18054/1000000 [12:50:27<2420:53:22, 8.88s/it, lr=1e-5, step_loss=0.00816]
Steps: 2%|▏ | 18055/1000000 [12:50:35<2393:43:45, 8.78s/it, lr=1e-5, step_loss=0.00816][RANK-0]: Step: [18055], local_loss=0.004510104190558195, train_loss=0.008307878859341145, time_cost=2.0679776668548584
+
Steps: 2%|▏ | 18055/1000000 [12:50:35<2393:43:45, 8.78s/it, lr=1e-5, step_loss=0.00451]
Steps: 2%|▏ | 18056/1000000 [12:50:46<2576:34:10, 9.45s/it, lr=1e-5, step_loss=0.00451][RANK-0]: Step: [18056], local_loss=0.1389724612236023, train_loss=0.07871157675981522, time_cost=1.5398030281066895
+
Steps: 2%|▏ | 18056/1000000 [12:50:46<2576:34:10, 9.45s/it, lr=1e-5, step_loss=0.139]
Steps: 2%|▏ | 18057/1000000 [12:50:53<2333:15:55, 8.55s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [18057], local_loss=0.011244789697229862, train_loss=0.010897466912865639, time_cost=2.9065704345703125
+
Steps: 2%|▏ | 18057/1000000 [12:50:53<2333:15:55, 8.55s/it, lr=1e-5, step_loss=0.0112]
Steps: 2%|▏ | 18058/1000000 [12:51:00<2215:24:57, 8.12s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [18058], local_loss=0.1350603699684143, train_loss=0.038509923964738846, time_cost=1.3408496379852295
+
Steps: 2%|▏ | 18058/1000000 [12:51:00<2215:24:57, 8.12s/it, lr=1e-5, step_loss=0.135]
Steps: 2%|▏ | 18059/1000000 [12:51:11<2446:26:15, 8.97s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [18059], local_loss=0.03273353353142738, train_loss=0.09849155694246292, time_cost=1.3144841194152832
+
Steps: 2%|▏ | 18059/1000000 [12:51:11<2446:26:15, 8.97s/it, lr=1e-5, step_loss=0.0327]
Steps: 2%|▏ | 18060/1000000 [12:51:16<2152:30:12, 7.89s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [18060], local_loss=0.07006534188985825, train_loss=0.08584171533584595, time_cost=4.085931062698364
+
Steps: 2%|▏ | 18060/1000000 [12:51:16<2152:30:12, 7.89s/it, lr=1e-5, step_loss=0.0701]
Steps: 2%|▏ | 18061/1000000 [12:51:25<2229:51:52, 8.18s/it, lr=1e-5, step_loss=0.0701][RANK-0]: Step: [18061], local_loss=0.11050921678543091, train_loss=0.057441871613264084, time_cost=3.4128003120422363
+
Steps: 2%|▏ | 18061/1000000 [12:51:25<2229:51:52, 8.18s/it, lr=1e-5, step_loss=0.111]
Steps: 2%|▏ | 18062/1000000 [12:51:36<2451:24:18, 8.99s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [18062], local_loss=0.02128230407834053, train_loss=0.03291739523410797, time_cost=1.3292906284332275
+
Steps: 2%|▏ | 18062/1000000 [12:51:36<2451:24:18, 8.99s/it, lr=1e-5, step_loss=0.0213]
Steps: 2%|▏ | 18063/1000000 [12:51:43<2347:11:55, 8.61s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [18063], local_loss=0.004702581092715263, train_loss=6.118527412414551, time_cost=2.4414925575256348
+
Steps: 2%|▏ | 18063/1000000 [12:51:43<2347:11:55, 8.61s/it, lr=1e-5, step_loss=0.0047]
Steps: 2%|▏ | 18064/1000000 [12:51:55<2548:55:02, 9.34s/it, lr=1e-5, step_loss=0.0047][RANK-0]: Step: [18064], local_loss=0.012954901903867722, train_loss=0.032160840928554535, time_cost=1.8635337352752686
+
Steps: 2%|▏ | 18064/1000000 [12:51:55<2548:55:02, 9.34s/it, lr=1e-5, step_loss=0.013]
Steps: 2%|▏ | 18065/1000000 [12:52:02<2394:27:15, 8.78s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [18065], local_loss=0.005671302322298288, train_loss=0.016682881861925125, time_cost=4.424083948135376
+
Steps: 2%|▏ | 18065/1000000 [12:52:02<2394:27:15, 8.78s/it, lr=1e-5, step_loss=0.00567]
Steps: 2%|▏ | 18066/1000000 [12:52:13<2593:17:54, 9.51s/it, lr=1e-5, step_loss=0.00567][RANK-0]: Step: [18066], local_loss=0.01613592728972435, train_loss=0.03856220841407776, time_cost=2.9793684482574463
+
Steps: 2%|▏ | 18066/1000000 [12:52:13<2593:17:54, 9.51s/it, lr=1e-5, step_loss=0.0161]
Steps: 2%|▏ | 18067/1000000 [12:52:25<2742:42:47, 10.06s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [18067], local_loss=0.007977423258125782, train_loss=0.027895428240299225, time_cost=8.746424913406372
+
Steps: 2%|▏ | 18067/1000000 [12:52:25<2742:42:47, 10.06s/it, lr=1e-5, step_loss=0.00798]
Steps: 2%|▏ | 18068/1000000 [12:52:33<2579:55:56, 9.46s/it, lr=1e-5, step_loss=0.00798][RANK-0]: Step: [18068], local_loss=0.02070832997560501, train_loss=0.10454795509576797, time_cost=3.765437364578247
+
Steps: 2%|▏ | 18068/1000000 [12:52:33<2579:55:56, 9.46s/it, lr=1e-5, step_loss=0.0207]
Steps: 2%|▏ | 18069/1000000 [12:52:44<2738:07:03, 10.04s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [18069], local_loss=0.04837774485349655, train_loss=0.02308417484164238, time_cost=3.586916923522949
+
Steps: 2%|▏ | 18069/1000000 [12:52:44<2738:07:03, 10.04s/it, lr=1e-5, step_loss=0.0484]
Steps: 2%|▏ | 18070/1000000 [12:52:52<2580:01:55, 9.46s/it, lr=1e-5, step_loss=0.0484][RANK-0]: Step: [18070], local_loss=0.12702205777168274, train_loss=0.06016043573617935, time_cost=2.513953447341919
+
Steps: 2%|▏ | 18070/1000000 [12:52:52<2580:01:55, 9.46s/it, lr=1e-5, step_loss=0.127]
Steps: 2%|▏ | 18071/1000000 [12:53:00<2442:18:49, 8.95s/it, lr=1e-5, step_loss=0.127][RANK-0]: Step: [18071], local_loss=0.011998928152024746, train_loss=0.06057393178343773, time_cost=3.443883180618286
+
Steps: 2%|▏ | 18071/1000000 [12:53:00<2442:18:49, 8.95s/it, lr=1e-5, step_loss=0.012]
Steps: 2%|▏ | 18072/1000000 [12:53:10<2576:34:43, 9.45s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [18072], local_loss=0.0080493725836277, train_loss=0.023468714207410812, time_cost=9.056174278259277
+
Steps: 2%|▏ | 18072/1000000 [12:53:10<2576:34:43, 9.45s/it, lr=1e-5, step_loss=0.00805]
Steps: 2%|▏ | 18073/1000000 [12:53:20<2545:31:11, 9.33s/it, lr=1e-5, step_loss=0.00805][RANK-0]: Step: [18073], local_loss=0.011561542749404907, train_loss=0.033727727830410004, time_cost=4.338506698608398
+
Steps: 2%|▏ | 18073/1000000 [12:53:20<2545:31:11, 9.33s/it, lr=1e-5, step_loss=0.0116]
Steps: 2%|▏ | 18074/1000000 [12:53:27<2399:19:44, 8.80s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [18074], local_loss=0.004505546297878027, train_loss=0.023764971643686295, time_cost=5.67158055305481
+
Steps: 2%|▏ | 18074/1000000 [12:53:27<2399:19:44, 8.80s/it, lr=1e-5, step_loss=0.00451]
Steps: 2%|▏ | 18075/1000000 [12:53:32<2092:04:55, 7.67s/it, lr=1e-5, step_loss=0.00451][RANK-0]: Step: [18075], local_loss=0.13713215291500092, train_loss=0.041880715638399124, time_cost=2.0575613975524902
+
Steps: 2%|▏ | 18075/1000000 [12:53:32<2092:04:55, 7.67s/it, lr=1e-5, step_loss=0.137]
Steps: 2%|▏ | 18076/1000000 [12:53:39<2033:41:48, 7.46s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [18076], local_loss=0.006750529166311026, train_loss=0.020271383225917816, time_cost=1.4334735870361328
+
Steps: 2%|▏ | 18076/1000000 [12:53:39<2033:41:48, 7.46s/it, lr=1e-5, step_loss=0.00675]
Steps: 2%|▏ | 18077/1000000 [12:53:49<2266:46:34, 8.31s/it, lr=1e-5, step_loss=0.00675][RANK-0]: Step: [18077], local_loss=0.008608730509877205, train_loss=0.02970338985323906, time_cost=2.2103800773620605
+
Steps: 2%|▏ | 18077/1000000 [12:53:49<2266:46:34, 8.31s/it, lr=1e-5, step_loss=0.00861]
Steps: 2%|▏ | 18078/1000000 [12:54:04<2788:24:28, 10.22s/it, lr=1e-5, step_loss=0.00861][RANK-0]: Step: [18078], local_loss=0.013165979646146297, train_loss=0.017266925424337387, time_cost=7.306210517883301
+
Steps: 2%|▏ | 18078/1000000 [12:54:04<2788:24:28, 10.22s/it, lr=1e-5, step_loss=0.0132]
Steps: 2%|▏ | 18079/1000000 [12:54:09<2321:53:02, 8.51s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [18079], local_loss=0.012486591935157776, train_loss=0.029106423258781433, time_cost=1.5863041877746582
+
Steps: 2%|▏ | 18079/1000000 [12:54:09<2321:53:02, 8.51s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 18080/1000000 [12:54:20<2536:16:39, 9.30s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [18080], local_loss=0.05472494289278984, train_loss=0.1598605513572693, time_cost=3.641716241836548
+
Steps: 2%|▏ | 18080/1000000 [12:54:20<2536:16:39, 9.30s/it, lr=1e-5, step_loss=0.0547]
Steps: 2%|▏ | 18081/1000000 [12:54:31<2668:39:53, 9.78s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [18081], local_loss=0.018712446093559265, train_loss=0.030976075679063797, time_cost=2.825160503387451
+
Steps: 2%|▏ | 18081/1000000 [12:54:31<2668:39:53, 9.78s/it, lr=1e-5, step_loss=0.0187]
Steps: 2%|▏ | 18082/1000000 [12:54:45<3004:07:49, 11.01s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [18082], local_loss=0.005244141444563866, train_loss=0.00979945994913578, time_cost=4.516545057296753
+
Steps: 2%|▏ | 18082/1000000 [12:54:45<3004:07:49, 11.01s/it, lr=1e-5, step_loss=0.00524]
Steps: 2%|▏ | 18083/1000000 [12:54:57<3160:53:08, 11.59s/it, lr=1e-5, step_loss=0.00524][RANK-0]: Step: [18083], local_loss=0.014053348451852798, train_loss=0.021938689053058624, time_cost=4.248012542724609
+
Steps: 2%|▏ | 18083/1000000 [12:54:57<3160:53:08, 11.59s/it, lr=1e-5, step_loss=0.0141]
Steps: 2%|▏ | 18084/1000000 [12:55:08<3101:08:08, 11.37s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [18084], local_loss=0.012626049108803272, train_loss=0.04019972309470177, time_cost=2.641096591949463
+
Steps: 2%|▏ | 18084/1000000 [12:55:08<3101:08:08, 11.37s/it, lr=1e-5, step_loss=0.0126]
Steps: 2%|▏ | 18085/1000000 [12:55:19<3068:48:24, 11.25s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [18085], local_loss=0.0037528870161622763, train_loss=0.13105471432209015, time_cost=2.3294174671173096
+
Steps: 2%|▏ | 18085/1000000 [12:55:19<3068:48:24, 11.25s/it, lr=1e-5, step_loss=0.00375]
Steps: 2%|▏ | 18086/1000000 [12:55:24<2500:55:05, 9.17s/it, lr=1e-5, step_loss=0.00375][RANK-0]: Step: [18086], local_loss=0.009931922890245914, train_loss=0.023378703743219376, time_cost=1.57790207862854
+
Steps: 2%|▏ | 18086/1000000 [12:55:24<2500:55:05, 9.17s/it, lr=1e-5, step_loss=0.00993]
Steps: 2%|▏ | 18087/1000000 [12:55:38<2942:55:30, 10.79s/it, lr=1e-5, step_loss=0.00993][RANK-0]: Step: [18087], local_loss=0.01733812689781189, train_loss=0.031426120549440384, time_cost=3.4114065170288086
+
Steps: 2%|▏ | 18087/1000000 [12:55:38<2942:55:30, 10.79s/it, lr=1e-5, step_loss=0.0173]
Steps: 2%|▏ | 18088/1000000 [12:55:53<3251:59:41, 11.92s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [18088], local_loss=0.006303813774138689, train_loss=0.012570841237902641, time_cost=5.923975706100464
+
Steps: 2%|▏ | 18088/1000000 [12:55:53<3251:59:41, 11.92s/it, lr=1e-5, step_loss=0.0063]
Steps: 2%|▏ | 18089/1000000 [12:56:04<3219:08:18, 11.80s/it, lr=1e-5, step_loss=0.0063][RANK-0]: Step: [18089], local_loss=0.009493826888501644, train_loss=0.023204542696475983, time_cost=1.2909328937530518
+
Steps: 2%|▏ | 18089/1000000 [12:56:04<3219:08:18, 11.80s/it, lr=1e-5, step_loss=0.00949]
Steps: 2%|▏ | 18090/1000000 [12:56:09<2641:06:49, 9.68s/it, lr=1e-5, step_loss=0.00949][RANK-0]: Step: [18090], local_loss=0.006055073346942663, train_loss=0.1442720890045166, time_cost=1.8485324382781982
+
Steps: 2%|▏ | 18090/1000000 [12:56:09<2641:06:49, 9.68s/it, lr=1e-5, step_loss=0.00606]
Steps: 2%|▏ | 18091/1000000 [12:56:17<2524:57:08, 9.26s/it, lr=1e-5, step_loss=0.00606][RANK-0]: Step: [18091], local_loss=0.1385454535484314, train_loss=0.04516419768333435, time_cost=3.790627956390381
+
Steps: 2%|▏ | 18091/1000000 [12:56:17<2524:57:08, 9.26s/it, lr=1e-5, step_loss=0.139]
Steps: 2%|▏ | 18092/1000000 [12:56:28<2668:15:18, 9.78s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [18092], local_loss=0.006187988445162773, train_loss=0.031226642429828644, time_cost=3.12539005279541
+
Steps: 2%|▏ | 18092/1000000 [12:56:28<2668:15:18, 9.78s/it, lr=1e-5, step_loss=0.00619]
Steps: 2%|▏ | 18093/1000000 [12:56:39<2762:17:34, 10.13s/it, lr=1e-5, step_loss=0.00619][RANK-0]: Step: [18093], local_loss=0.01813315413892269, train_loss=0.05359969288110733, time_cost=9.65090537071228
+
Steps: 2%|▏ | 18093/1000000 [12:56:39<2762:17:34, 10.13s/it, lr=1e-5, step_loss=0.0181]
Steps: 2%|▏ | 18094/1000000 [12:56:46<2511:19:42, 9.21s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [18094], local_loss=0.08192083984613419, train_loss=0.028456242755055428, time_cost=2.520284652709961
+
Steps: 2%|▏ | 18094/1000000 [12:56:46<2511:19:42, 9.21s/it, lr=1e-5, step_loss=0.0819]
Steps: 2%|▏ | 18095/1000000 [12:56:53<2335:17:33, 8.56s/it, lr=1e-5, step_loss=0.0819][RANK-0]: Step: [18095], local_loss=0.04669946804642677, train_loss=0.02765137515962124, time_cost=1.4435348510742188
+
Steps: 2%|▏ | 18095/1000000 [12:56:53<2335:17:33, 8.56s/it, lr=1e-5, step_loss=0.0467]
Steps: 2%|▏ | 18096/1000000 [12:57:00<2168:04:25, 7.95s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [18096], local_loss=0.09362559020519257, train_loss=0.05006860941648483, time_cost=2.556389808654785
+
Steps: 2%|▏ | 18096/1000000 [12:57:00<2168:04:25, 7.95s/it, lr=1e-5, step_loss=0.0936]
Steps: 2%|▏ | 18097/1000000 [12:57:11<2425:42:03, 8.89s/it, lr=1e-5, step_loss=0.0936][RANK-0]: Step: [18097], local_loss=0.028820641338825226, train_loss=0.02165031060576439, time_cost=1.3316683769226074
+
Steps: 2%|▏ | 18097/1000000 [12:57:11<2425:42:03, 8.89s/it, lr=1e-5, step_loss=0.0288]
Steps: 2%|▏ | 18098/1000000 [12:57:16<2113:21:06, 7.75s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [18098], local_loss=5.116754055023193, train_loss=0.6592747569084167, time_cost=1.4675488471984863
+
Steps: 2%|▏ | 18098/1000000 [12:57:16<2113:21:06, 7.75s/it, lr=1e-5, step_loss=5.12]
Steps: 2%|▏ | 18099/1000000 [12:57:27<2412:49:31, 8.85s/it, lr=1e-5, step_loss=5.12][RANK-0]: Step: [18099], local_loss=0.017234014347195625, train_loss=0.02431296370923519, time_cost=3.704984188079834
+
Steps: 2%|▏ | 18099/1000000 [12:57:27<2412:49:31, 8.85s/it, lr=1e-5, step_loss=0.0172]
Steps: 2%|▏ | 18100/1000000 [12:57:39<2647:29:55, 9.71s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [18100], local_loss=0.06013127788901329, train_loss=0.02027992531657219, time_cost=4.263632774353027
+
Steps: 2%|▏ | 18100/1000000 [12:57:39<2647:29:55, 9.71s/it, lr=1e-5, step_loss=0.0601]
Steps: 2%|▏ | 18101/1000000 [12:57:44<2256:05:25, 8.27s/it, lr=1e-5, step_loss=0.0601][RANK-0]: Step: [18101], local_loss=0.005654092878103256, train_loss=0.14840145409107208, time_cost=1.8219823837280273
+
Steps: 2%|▏ | 18101/1000000 [12:57:44<2256:05:25, 8.27s/it, lr=1e-5, step_loss=0.00565]
Steps: 2%|▏ | 18102/1000000 [12:57:57<2598:15:49, 9.53s/it, lr=1e-5, step_loss=0.00565][RANK-0]: Step: [18102], local_loss=0.08728161454200745, train_loss=0.05548582598567009, time_cost=4.237905979156494
+
Steps: 2%|▏ | 18102/1000000 [12:57:57<2598:15:49, 9.53s/it, lr=1e-5, step_loss=0.0873]
Steps: 2%|▏ | 18103/1000000 [12:58:09<2805:44:29, 10.29s/it, lr=1e-5, step_loss=0.0873][RANK-0]: Step: [18103], local_loss=0.019588900730013847, train_loss=0.03173253312706947, time_cost=1.2105481624603271
+
Steps: 2%|▏ | 18103/1000000 [12:58:09<2805:44:29, 10.29s/it, lr=1e-5, step_loss=0.0196]
Steps: 2%|▏ | 18104/1000000 [12:58:20<2878:51:13, 10.55s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [18104], local_loss=0.03758152201771736, train_loss=0.07525958865880966, time_cost=5.090531826019287
+
Steps: 2%|▏ | 18104/1000000 [12:58:20<2878:51:13, 10.55s/it, lr=1e-5, step_loss=0.0376]
Steps: 2%|▏ | 18105/1000000 [12:58:26<2501:46:39, 9.17s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [18105], local_loss=0.008113610558211803, train_loss=0.0185561403632164, time_cost=4.240069389343262
+
Steps: 2%|▏ | 18105/1000000 [12:58:26<2501:46:39, 9.17s/it, lr=1e-5, step_loss=0.00811]
Steps: 2%|▏ | 18106/1000000 [12:58:34<2464:26:40, 9.04s/it, lr=1e-5, step_loss=0.00811][RANK-0]: Step: [18106], local_loss=0.023303022608160973, train_loss=0.02256731688976288, time_cost=1.965635061264038
+
Steps: 2%|▏ | 18106/1000000 [12:58:34<2464:26:40, 9.04s/it, lr=1e-5, step_loss=0.0233]
Steps: 2%|▏ | 18107/1000000 [12:58:40<2194:50:50, 8.05s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [18107], local_loss=0.006572871468961239, train_loss=0.05097946524620056, time_cost=3.116777181625366
+
Steps: 2%|▏ | 18107/1000000 [12:58:40<2194:50:50, 8.05s/it, lr=1e-5, step_loss=0.00657]
Steps: 2%|▏ | 18108/1000000 [12:58:51<2395:54:25, 8.78s/it, lr=1e-5, step_loss=0.00657][RANK-0]: Step: [18108], local_loss=0.04689225181937218, train_loss=0.04017878323793411, time_cost=2.518204927444458
+
Steps: 2%|▏ | 18108/1000000 [12:58:51<2395:54:25, 8.78s/it, lr=1e-5, step_loss=0.0469]
Steps: 2%|▏ | 18109/1000000 [12:58:55<2066:20:38, 7.58s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [18109], local_loss=0.005057353992015123, train_loss=0.02059006877243519, time_cost=2.0139076709747314
+
Steps: 2%|▏ | 18109/1000000 [12:58:55<2066:20:38, 7.58s/it, lr=1e-5, step_loss=0.00506]
Steps: 2%|▏ | 18110/1000000 [12:59:09<2546:00:57, 9.33s/it, lr=1e-5, step_loss=0.00506][RANK-0]: Step: [18110], local_loss=0.011577485129237175, train_loss=0.01979893445968628, time_cost=4.348998785018921
+
Steps: 2%|▏ | 18110/1000000 [12:59:09<2546:00:57, 9.33s/it, lr=1e-5, step_loss=0.0116]
Steps: 2%|▏ | 18111/1000000 [12:59:20<2697:47:47, 9.89s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [18111], local_loss=0.01122137252241373, train_loss=0.026101941242814064, time_cost=1.4391281604766846
+
Steps: 2%|▏ | 18111/1000000 [12:59:20<2697:47:47, 9.89s/it, lr=1e-5, step_loss=0.0112]
Steps: 2%|▏ | 18112/1000000 [12:59:27<2450:11:15, 8.98s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [18112], local_loss=0.03253685310482979, train_loss=0.03140717372298241, time_cost=1.2107892036437988
+
Steps: 2%|▏ | 18112/1000000 [12:59:27<2450:11:15, 8.98s/it, lr=1e-5, step_loss=0.0325]
Steps: 2%|▏ | 18113/1000000 [12:59:37<2555:59:10, 9.37s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [18113], local_loss=0.006966624408960342, train_loss=0.014903002418577671, time_cost=7.411816120147705
+
Steps: 2%|▏ | 18113/1000000 [12:59:37<2555:59:10, 9.37s/it, lr=1e-5, step_loss=0.00697]
Steps: 2%|▏ | 18114/1000000 [12:59:50<2840:40:00, 10.42s/it, lr=1e-5, step_loss=0.00697][RANK-0]: Step: [18114], local_loss=0.039089396595954895, train_loss=0.02101624384522438, time_cost=6.802067995071411
+
Steps: 2%|▏ | 18114/1000000 [12:59:50<2840:40:00, 10.42s/it, lr=1e-5, step_loss=0.0391]
Steps: 2%|▏ | 18115/1000000 [12:59:56<2499:14:08, 9.16s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [18115], local_loss=0.040558140724897385, train_loss=0.028374025598168373, time_cost=1.3416094779968262
+
Steps: 2%|▏ | 18115/1000000 [12:59:56<2499:14:08, 9.16s/it, lr=1e-5, step_loss=0.0406]
Steps: 2%|▏ | 18116/1000000 [13:00:07<2603:01:10, 9.54s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [18116], local_loss=0.0100690433755517, train_loss=0.05261124297976494, time_cost=2.0302774906158447
+
Steps: 2%|▏ | 18116/1000000 [13:00:07<2603:01:10, 9.54s/it, lr=1e-5, step_loss=0.0101]
Steps: 2%|▏ | 18117/1000000 [13:00:14<2452:53:23, 8.99s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [18117], local_loss=0.015362400561571121, train_loss=0.012979596853256226, time_cost=1.5476126670837402
+
Steps: 2%|▏ | 18117/1000000 [13:00:14<2452:53:23, 8.99s/it, lr=1e-5, step_loss=0.0154]
Steps: 2%|▏ | 18118/1000000 [13:00:26<2655:10:14, 9.73s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [18118], local_loss=0.010634391568601131, train_loss=0.03782660514116287, time_cost=3.4412295818328857
+
Steps: 2%|▏ | 18118/1000000 [13:00:26<2655:10:14, 9.73s/it, lr=1e-5, step_loss=0.0106]
Steps: 2%|▏ | 18119/1000000 [13:00:37<2743:11:51, 10.06s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [18119], local_loss=0.2946016192436218, train_loss=0.096869558095932, time_cost=2.7435145378112793
+
Steps: 2%|▏ | 18119/1000000 [13:00:37<2743:11:51, 10.06s/it, lr=1e-5, step_loss=0.295]
Steps: 2%|▏ | 18120/1000000 [13:00:50<2976:17:20, 10.91s/it, lr=1e-5, step_loss=0.295][RANK-0]: Step: [18120], local_loss=0.022085925564169884, train_loss=0.014567682519555092, time_cost=3.2198166847229004
+
Steps: 2%|▏ | 18120/1000000 [13:00:50<2976:17:20, 10.91s/it, lr=1e-5, step_loss=0.0221]
Steps: 2%|▏ | 18121/1000000 [13:00:54<2472:44:18, 9.07s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [18121], local_loss=0.00841212272644043, train_loss=0.02210244908928871, time_cost=2.3365187644958496
+
Steps: 2%|▏ | 18121/1000000 [13:00:54<2472:44:18, 9.07s/it, lr=1e-5, step_loss=0.00841]
Steps: 2%|▏ | 18122/1000000 [13:01:10<2976:07:07, 10.91s/it, lr=1e-5, step_loss=0.00841][RANK-0]: Step: [18122], local_loss=0.31085315346717834, train_loss=0.08381512016057968, time_cost=7.384209394454956
+
Steps: 2%|▏ | 18122/1000000 [13:01:10<2976:07:07, 10.91s/it, lr=1e-5, step_loss=0.311]
Steps: 2%|▏ | 18123/1000000 [13:01:15<2522:10:10, 9.25s/it, lr=1e-5, step_loss=0.311][RANK-0]: Step: [18123], local_loss=0.18132628500461578, train_loss=0.038041647523641586, time_cost=2.512467622756958
+
Steps: 2%|▏ | 18123/1000000 [13:01:15<2522:10:10, 9.25s/it, lr=1e-5, step_loss=0.181]
Steps: 2%|▏ | 18124/1000000 [13:01:25<2602:38:52, 9.54s/it, lr=1e-5, step_loss=0.181][RANK-0]: Step: [18124], local_loss=0.016477428376674652, train_loss=0.0633167028427124, time_cost=1.328817367553711
+
Steps: 2%|▏ | 18124/1000000 [13:01:25<2602:38:52, 9.54s/it, lr=1e-5, step_loss=0.0165]
Steps: 2%|▏ | 18125/1000000 [13:01:31<2255:39:17, 8.27s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [18125], local_loss=0.026868585497140884, train_loss=0.018144376575946808, time_cost=2.189964532852173
+
Steps: 2%|▏ | 18125/1000000 [13:01:31<2255:39:17, 8.27s/it, lr=1e-5, step_loss=0.0269]
Steps: 2%|▏ | 18126/1000000 [13:01:41<2441:11:57, 8.95s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [18126], local_loss=0.04916054755449295, train_loss=0.03223206102848053, time_cost=1.5038433074951172
+
Steps: 2%|▏ | 18126/1000000 [13:01:41<2441:11:57, 8.95s/it, lr=1e-5, step_loss=0.0492]
Steps: 2%|▏ | 18127/1000000 [13:01:46<2114:01:06, 7.75s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [18127], local_loss=0.004387033171951771, train_loss=0.019444987177848816, time_cost=2.4339489936828613
+
Steps: 2%|▏ | 18127/1000000 [13:01:46<2114:01:06, 7.75s/it, lr=1e-5, step_loss=0.00439]
Steps: 2%|▏ | 18128/1000000 [13:01:52<1974:46:33, 7.24s/it, lr=1e-5, step_loss=0.00439][RANK-0]: Step: [18128], local_loss=0.030601274222135544, train_loss=0.09673301130533218, time_cost=1.7435100078582764
+
Steps: 2%|▏ | 18128/1000000 [13:01:52<1974:46:33, 7.24s/it, lr=1e-5, step_loss=0.0306]
Steps: 2%|▏ | 18129/1000000 [13:02:01<2108:08:00, 7.73s/it, lr=1e-5, step_loss=0.0306][RANK-0]: Step: [18129], local_loss=0.008038010448217392, train_loss=0.025588776916265488, time_cost=2.8953425884246826
+
Steps: 2%|▏ | 18129/1000000 [13:02:01<2108:08:00, 7.73s/it, lr=1e-5, step_loss=0.00804]
Steps: 2%|▏ | 18130/1000000 [13:02:06<1878:36:44, 6.89s/it, lr=1e-5, step_loss=0.00804][RANK-0]: Step: [18130], local_loss=0.04071275517344475, train_loss=0.06746409088373184, time_cost=2.2673428058624268
+
Steps: 2%|▏ | 18130/1000000 [13:02:06<1878:36:44, 6.89s/it, lr=1e-5, step_loss=0.0407]
Steps: 2%|▏ | 18131/1000000 [13:02:14<2011:12:57, 7.37s/it, lr=1e-5, step_loss=0.0407][RANK-0]: Step: [18131], local_loss=0.003097651759162545, train_loss=0.0700126513838768, time_cost=1.2383787631988525
+
Steps: 2%|▏ | 18131/1000000 [13:02:14<2011:12:57, 7.37s/it, lr=1e-5, step_loss=0.0031]
Steps: 2%|▏ | 18132/1000000 [13:02:20<1841:56:06, 6.75s/it, lr=1e-5, step_loss=0.0031][RANK-0]: Step: [18132], local_loss=0.012511065229773521, train_loss=5.529755592346191, time_cost=1.6789278984069824
+
Steps: 2%|▏ | 18132/1000000 [13:02:20<1841:56:06, 6.75s/it, lr=1e-5, step_loss=0.0125]
Steps: 2%|▏ | 18133/1000000 [13:02:31<2234:55:23, 8.19s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [18133], local_loss=0.00695835379883647, train_loss=0.034370001405477524, time_cost=3.5460410118103027
+
Steps: 2%|▏ | 18133/1000000 [13:02:31<2234:55:23, 8.19s/it, lr=1e-5, step_loss=0.00696]
Steps: 2%|▏ | 18134/1000000 [13:02:42<2473:52:02, 9.07s/it, lr=1e-5, step_loss=0.00696][RANK-0]: Step: [18134], local_loss=0.02664591744542122, train_loss=0.04019903764128685, time_cost=4.00756311416626
+
Steps: 2%|▏ | 18134/1000000 [13:02:42<2473:52:02, 9.07s/it, lr=1e-5, step_loss=0.0266]
Steps: 2%|▏ | 18135/1000000 [13:02:53<2633:48:35, 9.66s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [18135], local_loss=0.011651216074824333, train_loss=0.01017991453409195, time_cost=1.685283899307251
+
Steps: 2%|▏ | 18135/1000000 [13:02:53<2633:48:35, 9.66s/it, lr=1e-5, step_loss=0.0117]
Steps: 2%|▏ | 18136/1000000 [13:03:11<3261:02:45, 11.96s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [18136], local_loss=0.030316269025206566, train_loss=0.05436406657099724, time_cost=3.635044574737549
+
Steps: 2%|▏ | 18136/1000000 [13:03:11<3261:02:45, 11.96s/it, lr=1e-5, step_loss=0.0303]
Steps: 2%|▏ | 18137/1000000 [13:03:21<3159:22:22, 11.58s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [18137], local_loss=0.022861165925860405, train_loss=0.020899761468172073, time_cost=7.882483243942261
+
Steps: 2%|▏ | 18137/1000000 [13:03:21<3159:22:22, 11.58s/it, lr=1e-5, step_loss=0.0229]
Steps: 2%|▏ | 18138/1000000 [13:03:29<2860:55:13, 10.49s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [18138], local_loss=0.023832494392991066, train_loss=0.029829148203134537, time_cost=2.5140459537506104
+
Steps: 2%|▏ | 18138/1000000 [13:03:29<2860:55:13, 10.49s/it, lr=1e-5, step_loss=0.0238]
Steps: 2%|▏ | 18139/1000000 [13:03:33<2342:07:17, 8.59s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [18139], local_loss=0.016431184485554695, train_loss=0.03907529637217522, time_cost=3.2516467571258545
+
Steps: 2%|▏ | 18139/1000000 [13:03:33<2342:07:17, 8.59s/it, lr=1e-5, step_loss=0.0164]
Steps: 2%|▏ | 18140/1000000 [13:03:46<2648:51:04, 9.71s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [18140], local_loss=0.050601355731487274, train_loss=0.06363190710544586, time_cost=5.629173994064331
+
Steps: 2%|▏ | 18140/1000000 [13:03:46<2648:51:04, 9.71s/it, lr=1e-5, step_loss=0.0506]
Steps: 2%|▏ | 18141/1000000 [13:03:54<2552:09:19, 9.36s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [18141], local_loss=0.010712921619415283, train_loss=0.013011796399950981, time_cost=2.2770400047302246
+
Steps: 2%|▏ | 18141/1000000 [13:03:54<2552:09:19, 9.36s/it, lr=1e-5, step_loss=0.0107]
Steps: 2%|▏ | 18142/1000000 [13:04:00<2223:59:29, 8.15s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [18142], local_loss=0.019409289583563805, train_loss=19.986385345458984, time_cost=2.9271881580352783
+
Steps: 2%|▏ | 18142/1000000 [13:04:00<2223:59:29, 8.15s/it, lr=1e-5, step_loss=0.0194]
Steps: 2%|▏ | 18143/1000000 [13:04:08<2216:18:58, 8.13s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [18143], local_loss=0.007337430492043495, train_loss=0.1527988612651825, time_cost=3.7527430057525635
+
Steps: 2%|▏ | 18143/1000000 [13:04:08<2216:18:58, 8.13s/it, lr=1e-5, step_loss=0.00734]
Steps: 2%|▏ | 18144/1000000 [13:04:21<2612:17:53, 9.58s/it, lr=1e-5, step_loss=0.00734][RANK-0]: Step: [18144], local_loss=0.017760861665010452, train_loss=0.020179037004709244, time_cost=1.2023816108703613
+
Steps: 2%|▏ | 18144/1000000 [13:04:21<2612:17:53, 9.58s/it, lr=1e-5, step_loss=0.0178]
Steps: 2%|▏ | 18145/1000000 [13:04:35<2959:33:56, 10.85s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [18145], local_loss=0.05009603872895241, train_loss=0.05596742406487465, time_cost=5.850573778152466
+
Steps: 2%|▏ | 18145/1000000 [13:04:35<2959:33:56, 10.85s/it, lr=1e-5, step_loss=0.0501]
Steps: 2%|▏ | 18146/1000000 [13:04:44<2828:00:29, 10.37s/it, lr=1e-5, step_loss=0.0501][RANK-0]: Step: [18146], local_loss=0.008749647065997124, train_loss=0.016400396823883057, time_cost=2.0875699520111084
+
Steps: 2%|▏ | 18146/1000000 [13:04:44<2828:00:29, 10.37s/it, lr=1e-5, step_loss=0.00875]
Steps: 2%|▏ | 18147/1000000 [13:04:52<2624:48:01, 9.62s/it, lr=1e-5, step_loss=0.00875][RANK-0]: Step: [18147], local_loss=0.008040503598749638, train_loss=0.02492631785571575, time_cost=6.555104970932007
+
Steps: 2%|▏ | 18147/1000000 [13:04:52<2624:48:01, 9.62s/it, lr=1e-5, step_loss=0.00804]
Steps: 2%|▏ | 18148/1000000 [13:04:57<2271:37:20, 8.33s/it, lr=1e-5, step_loss=0.00804][RANK-0]: Step: [18148], local_loss=0.07900635898113251, train_loss=0.1481042206287384, time_cost=2.5586469173431396
+
Steps: 2%|▏ | 18148/1000000 [13:04:57<2271:37:20, 8.33s/it, lr=1e-5, step_loss=0.079]
Steps: 2%|▏ | 18149/1000000 [13:05:03<2068:19:05, 7.58s/it, lr=1e-5, step_loss=0.079][RANK-0]: Step: [18149], local_loss=0.008280497044324875, train_loss=0.018200956284999847, time_cost=1.8511793613433838
+
Steps: 2%|▏ | 18149/1000000 [13:05:03<2068:19:05, 7.58s/it, lr=1e-5, step_loss=0.00828]
Steps: 2%|▏ | 18150/1000000 [13:05:08<1857:42:12, 6.81s/it, lr=1e-5, step_loss=0.00828][RANK-0]: Step: [18150], local_loss=0.10861857235431671, train_loss=0.039792753756046295, time_cost=1.4187510013580322
+
Steps: 2%|▏ | 18150/1000000 [13:05:08<1857:42:12, 6.81s/it, lr=1e-5, step_loss=0.109]
Steps: 2%|▏ | 18151/1000000 [13:05:22<2493:36:34, 9.14s/it, lr=1e-5, step_loss=0.109][RANK-0]: Step: [18151], local_loss=0.04451649636030197, train_loss=0.0493919663131237, time_cost=8.097970724105835
+
Steps: 2%|▏ | 18151/1000000 [13:05:22<2493:36:34, 9.14s/it, lr=1e-5, step_loss=0.0445]
Steps: 2%|▏ | 18152/1000000 [13:05:27<2105:42:09, 7.72s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [18152], local_loss=0.0329488180577755, train_loss=0.03343535214662552, time_cost=1.6983904838562012
+
Steps: 2%|▏ | 18152/1000000 [13:05:27<2105:42:09, 7.72s/it, lr=1e-5, step_loss=0.0329]
Steps: 2%|▏ | 18153/1000000 [13:05:34<2065:54:00, 7.57s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [18153], local_loss=0.01509257685393095, train_loss=0.016451383009552956, time_cost=2.4217824935913086
+
Steps: 2%|▏ | 18153/1000000 [13:05:34<2065:54:00, 7.57s/it, lr=1e-5, step_loss=0.0151]
Steps: 2%|▏ | 18154/1000000 [13:05:46<2430:22:04, 8.91s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [18154], local_loss=0.007734485901892185, train_loss=0.03585539013147354, time_cost=5.319468975067139
+
Steps: 2%|▏ | 18154/1000000 [13:05:46<2430:22:04, 8.91s/it, lr=1e-5, step_loss=0.00773]
Steps: 2%|▏ | 18155/1000000 [13:05:57<2598:33:34, 9.53s/it, lr=1e-5, step_loss=0.00773][RANK-0]: Step: [18155], local_loss=0.011285942047834396, train_loss=0.05386850982904434, time_cost=8.719141721725464
+
Steps: 2%|▏ | 18155/1000000 [13:05:57<2598:33:34, 9.53s/it, lr=1e-5, step_loss=0.0113]
Steps: 2%|▏ | 18156/1000000 [13:06:11<2981:21:32, 10.93s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [18156], local_loss=0.014827125705778599, train_loss=0.019555915147066116, time_cost=6.2821269035339355
+
Steps: 2%|▏ | 18156/1000000 [13:06:11<2981:21:32, 10.93s/it, lr=1e-5, step_loss=0.0148]
Steps: 2%|▏ | 18157/1000000 [13:06:20<2810:09:28, 10.30s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [18157], local_loss=0.00808802992105484, train_loss=0.020391400903463364, time_cost=4.256243944168091
+
Steps: 2%|▏ | 18157/1000000 [13:06:20<2810:09:28, 10.30s/it, lr=1e-5, step_loss=0.00809]
Steps: 2%|▏ | 18158/1000000 [13:06:24<2298:24:17, 8.43s/it, lr=1e-5, step_loss=0.00809][RANK-0]: Step: [18158], local_loss=0.010833580046892166, train_loss=0.020672574639320374, time_cost=1.7245559692382812
+
Steps: 2%|▏ | 18158/1000000 [13:06:24<2298:24:17, 8.43s/it, lr=1e-5, step_loss=0.0108]
Steps: 2%|▏ | 18159/1000000 [13:06:30<2067:33:21, 7.58s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [18159], local_loss=0.012967286631464958, train_loss=0.03151210397481918, time_cost=3.2395846843719482
+
Steps: 2%|▏ | 18159/1000000 [13:06:30<2067:33:21, 7.58s/it, lr=1e-5, step_loss=0.013]
Steps: 2%|▏ | 18160/1000000 [13:06:41<2385:50:21, 8.75s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [18160], local_loss=0.08293623477220535, train_loss=0.03551097214221954, time_cost=3.2004027366638184
+
Steps: 2%|▏ | 18160/1000000 [13:06:41<2385:50:21, 8.75s/it, lr=1e-5, step_loss=0.0829]
Steps: 2%|▏ | 18161/1000000 [13:06:57<2999:20:28, 11.00s/it, lr=1e-5, step_loss=0.0829][RANK-0]: Step: [18161], local_loss=0.009556920267641544, train_loss=0.01586160436272621, time_cost=3.529108762741089
+
Steps: 2%|▏ | 18161/1000000 [13:06:57<2999:20:28, 11.00s/it, lr=1e-5, step_loss=0.00956]
Steps: 2%|▏ | 18162/1000000 [13:07:05<2705:35:20, 9.92s/it, lr=1e-5, step_loss=0.00956][RANK-0]: Step: [18162], local_loss=0.008630966767668724, train_loss=0.016263868659734726, time_cost=1.9477179050445557
+
Steps: 2%|▏ | 18162/1000000 [13:07:05<2705:35:20, 9.92s/it, lr=1e-5, step_loss=0.00863]
Steps: 2%|▏ | 18163/1000000 [13:07:16<2831:33:54, 10.38s/it, lr=1e-5, step_loss=0.00863][RANK-0]: Step: [18163], local_loss=0.047604724764823914, train_loss=0.03327178582549095, time_cost=3.90305757522583
+
Steps: 2%|▏ | 18163/1000000 [13:07:16<2831:33:54, 10.38s/it, lr=1e-5, step_loss=0.0476]
Steps: 2%|▏ | 18164/1000000 [13:07:22<2473:53:02, 9.07s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [18164], local_loss=0.009201042354106903, train_loss=0.020530477166175842, time_cost=1.5002079010009766
+
Steps: 2%|▏ | 18164/1000000 [13:07:22<2473:53:02, 9.07s/it, lr=1e-5, step_loss=0.0092]
Steps: 2%|▏ | 18165/1000000 [13:07:35<2770:15:23, 10.16s/it, lr=1e-5, step_loss=0.0092][RANK-0]: Step: [18165], local_loss=0.03302065283060074, train_loss=0.05170507729053497, time_cost=3.798096179962158
+
Steps: 2%|▏ | 18165/1000000 [13:07:35<2770:15:23, 10.16s/it, lr=1e-5, step_loss=0.033]
Steps: 2%|▏ | 18166/1000000 [13:07:44<2694:18:27, 9.88s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [18166], local_loss=0.016665758565068245, train_loss=0.030653491616249084, time_cost=2.9473774433135986
+
Steps: 2%|▏ | 18166/1000000 [13:07:44<2694:18:27, 9.88s/it, lr=1e-5, step_loss=0.0167]
Steps: 2%|▏ | 18167/1000000 [13:07:50<2362:58:43, 8.66s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [18167], local_loss=0.013220681808888912, train_loss=0.024440567940473557, time_cost=1.4648547172546387
+
Steps: 2%|▏ | 18167/1000000 [13:07:50<2362:58:43, 8.66s/it, lr=1e-5, step_loss=0.0132]
Steps: 2%|▏ | 18168/1000000 [13:07:55<2075:00:39, 7.61s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [18168], local_loss=0.03251759707927704, train_loss=0.1598416268825531, time_cost=2.3800737857818604
+
Steps: 2%|▏ | 18168/1000000 [13:07:55<2075:00:39, 7.61s/it, lr=1e-5, step_loss=0.0325]
Steps: 2%|▏ | 18169/1000000 [13:08:07<2425:20:28, 8.89s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [18169], local_loss=0.026191161945462227, train_loss=0.036153070628643036, time_cost=3.1055140495300293
+
Steps: 2%|▏ | 18169/1000000 [13:08:07<2425:20:28, 8.89s/it, lr=1e-5, step_loss=0.0262]
Steps: 2%|▏ | 18170/1000000 [13:08:19<2655:11:08, 9.74s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [18170], local_loss=0.008363633416593075, train_loss=0.04002979397773743, time_cost=9.583123207092285
+
Steps: 2%|▏ | 18170/1000000 [13:08:19<2655:11:08, 9.74s/it, lr=1e-5, step_loss=0.00836]
Steps: 2%|▏ | 18171/1000000 [13:08:26<2440:18:33, 8.95s/it, lr=1e-5, step_loss=0.00836][RANK-0]: Step: [18171], local_loss=0.01913624070584774, train_loss=0.07338858395814896, time_cost=2.9874727725982666
+
Steps: 2%|▏ | 18171/1000000 [13:08:26<2440:18:33, 8.95s/it, lr=1e-5, step_loss=0.0191]
Steps: 2%|▏ | 18172/1000000 [13:08:36<2541:22:30, 9.32s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [18172], local_loss=0.00531058618798852, train_loss=0.0214066281914711, time_cost=3.078632354736328
+
Steps: 2%|▏ | 18172/1000000 [13:08:36<2541:22:30, 9.32s/it, lr=1e-5, step_loss=0.00531]
Steps: 2%|▏ | 18173/1000000 [13:08:44<2396:16:21, 8.79s/it, lr=1e-5, step_loss=0.00531][RANK-0]: Step: [18173], local_loss=0.005499673541635275, train_loss=0.01842917501926422, time_cost=1.6964454650878906
+
Steps: 2%|▏ | 18173/1000000 [13:08:44<2396:16:21, 8.79s/it, lr=1e-5, step_loss=0.0055]
Steps: 2%|▏ | 18174/1000000 [13:08:53<2431:16:57, 8.91s/it, lr=1e-5, step_loss=0.0055][RANK-0]: Step: [18174], local_loss=0.006793018896132708, train_loss=0.03867608681321144, time_cost=1.616814136505127
+
Steps: 2%|▏ | 18174/1000000 [13:08:53<2431:16:57, 8.91s/it, lr=1e-5, step_loss=0.00679]
Steps: 2%|▏ | 18175/1000000 [13:08:58<2143:13:14, 7.86s/it, lr=1e-5, step_loss=0.00679][RANK-0]: Step: [18175], local_loss=0.18611939251422882, train_loss=0.05271674692630768, time_cost=2.6147642135620117
+
Steps: 2%|▏ | 18175/1000000 [13:08:58<2143:13:14, 7.86s/it, lr=1e-5, step_loss=0.186]
Steps: 2%|▏ | 18176/1000000 [13:09:08<2312:05:08, 8.48s/it, lr=1e-5, step_loss=0.186][RANK-0]: Step: [18176], local_loss=0.014075011014938354, train_loss=0.05439840629696846, time_cost=2.9146838188171387
+
Steps: 2%|▏ | 18176/1000000 [13:09:08<2312:05:08, 8.48s/it, lr=1e-5, step_loss=0.0141]
Steps: 2%|▏ | 18177/1000000 [13:09:18<2392:42:48, 8.77s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [18177], local_loss=0.04706624150276184, train_loss=0.0233624167740345, time_cost=4.641847610473633
[… steps 18178–18291: one progress line per step in the same format; local_loss 0.0026–0.87, train_loss 0.008–0.21 (single outlier of 39.08 at step 18260), time_cost 1.2–14.5 s, lr=1e-5 throughout …]
Steps: 2%|▏ | 18292/1000000 [13:26:23<1575:04:27, 5.78s/it, lr=1e-5, step_loss=0.0592][RANK-0]: Step: [18292], local_loss=0.006395383737981319, train_loss=0.020594626665115356, time_cost=5.363509654998779
scripts/text_condition/npu/train_inpaint_sparse1d_newmodel_motion.sh: line 81: 212 Killed accelerate launch \
  --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
  --machine_rank=${MACHINE_RANK} --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
  opensora/train/train_inpaint.py \
  --model OpenSoraInpaint-L/122 --text_encoder_name google/mt5-xxl --cache_dir "../../cache_dir/" \
  --dataset inpaint --data "scripts/train_data/video_data_debug.txt" \
  --ae WFVAEModel_D8_4x8x8 --ae_path "/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL" \
  --sample_rate 1 --num_frames 93 --max_height 320 --max_width 320 \
  --interpolation_scale_t 1.0 --interpolation_scale_h 1.0 --interpolation_scale_w 1.0 \
  --attention_mode xformers --gradient_checkpointing \
  --train_batch_size=1 --dataloader_num_workers 0 --gradient_accumulation_steps=1 \
  --max_train_steps=1000000 --learning_rate=1e-5 --lr_scheduler="constant" --lr_warmup_steps=0 \
  --mixed_precision="bf16" --report_to="wandb" --checkpointing_steps=1000 --allow_tf32 \
  --model_max_length 512 --use_image_num 0 --use_ema --ema_start_step 0 --cfg 0.1 --noise_offset 0.0 \
  --use_rope --skip_low_resolution --speed_factor 1.0 --ema_decay 0.9999 --drop_short_ratio 0.0 \
  --hw_stride 32 --sparse1d --sparse_n 4 --use_motion --train_fps 16 --seed 1234 \
  --trained_data_global_step 0 --group_data --use_decord \
  --prediction_type "v_prediction" --rescale_betas_zero_snr \
  --t2v_ratio 0.0 --i2v_ratio 0.0 --transition_ratio 0.0 --v2v_ratio 0.0 \
  --Semantic_ratio 0.2 --bbox_ratio 0.2 --background_ratio 0.2 --fixed_ratio 0.1 \
  --Semantic_expansion_ratio 0.1 --fixed_bg_ratio 0.1 --clear_video_ratio 0.0 \
  --min_clear_ratio 0.25 --default_text_ratio 0.0 \
  --output_dir /home/save_dir/runs/$PROJECT \
  --pretrained_transformer_model_path "/home/image_data/captions/vpre_latest_134k/model_ema" \
  --yolomodel_pathorname "/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt" \
  --resume_from_checkpoint="/home/save_dir/runs/allinpaint_stage1/checkpoint-13000"
diff --git a/log_allinpaint_stage1_2.txt b/log_allinpaint_stage1_2.txt
new file mode 100644
index 000000000..eac5ba4f4
--- /dev/null
+++ b/log_allinpaint_stage1_2.txt
@@ -0,0 +1,10077 @@
+[2024-09-18 23:05:25,467] torch.distributed.run: [WARNING]
+[2024-09-18 23:05:25,467] torch.distributed.run: [WARNING] *****************************************
+[2024-09-18 23:05:25,467] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+[2024-09-18 23:05:25,467] torch.distributed.run: [WARNING] *****************************************
+[2024-09-18 23:05:31,502] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:209: ImportWarning:
+ *************************************************************************************************************
+ The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
+ The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
+ The backend in torch.distributed.init_process_group set to hccl now..
+ The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
+ The device parameters have been replaced with npu in the function below:
+ torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty
+ *************************************************************************************************************
+
+ warnings.warn(msg, ImportWarning)
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 411's current affinity list: 0-191
+pid 411's new affinity list: 24-47
+pid 414's current affinity list: 0-191
+pid 414's new affinity list: 96-119
+pid 413's current affinity list: 0-191
+pid 413's new affinity list: 72-95
+The npu_config.on_npu is True
+pid 410's current affinity list: 0-191
+pid 410's new affinity list: 0-23
+pid 415's current affinity list: 0-191
+pid 415's new affinity list: 120-143
+pid 412's current affinity list: 0-191
+pid 412's new affinity list: 48-71
+pid 417's current affinity list: 0-191
+pid 417's new affinity list: 168-191
+pid 416's current affinity list: 0-191
+pid 416's new affinity list: 144-167
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-3]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=None, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/allinpaint_stage1_2', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt', max_sequence_length=512)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-18 23:05:45,334] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-18 23:05:45,335] [INFO] [comm.py:637:init_distributed] cdb=None
+09/18/2024 23:05:45 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 3
+Local process index: 3
+Device: npu:3
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+[RANK-1]: Namespace(… identical to the [RANK-3] arguments above …)
+[2024-09-18 23:05:45,364] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-18 23:05:45,364] [INFO] [comm.py:637:init_distributed] cdb=None
+09/18/2024 23:05:45 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 1
+Local process index: 1
+Device: npu:1
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-4]: Namespace(...) identical to RANK-1 above; the same warnings, distributed-environment summary (Process index: 4, Device: npu:4) and ds_config follow (omitted).
+[RANK-2]: Namespace(...) identical to RANK-1 above; the same warnings, distributed-environment summary (Process index: 2, Device: npu:2) and ds_config follow (omitted).
+[RANK-7]: Namespace(...) identical to RANK-1 above; the same warnings, distributed-environment summary (Process index: 7, Device: npu:7) and ds_config follow (omitted).
+[RANK-0]: Namespace(...) identical to RANK-1 above, plus the same 'warn' DeprecationWarning (both omitted).
+[2024-09-18 23:05:46,020] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-18 23:05:46,020] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-09-18 23:05:46,020] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend hccl
+Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+09/18/2024 23:05:46 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 0
+Local process index: 0
+Device: npu:0
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+[RANK-5]: Namespace(...) identical to RANK-1 above; the same warnings, distributed-environment summary (Process index: 5, Device: npu:5) and ds_config follow (omitted).
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+(The same "init from .../wfvae.ckpt", "Load from ema model!", wavelet-filter key list and TypedStorage warning are printed once per rank, 8 times in total; the remaining duplicates are omitted.)
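+Editor's note: a minimal sketch of the "Load from ema model!" step under stated assumptions; `vae` and the checkpoint key names ("ema_state_dict", "state_dict") are hypothetical stand-ins, not confirmed from the project code:
+
+    import torch
+
+    ckpt = torch.load("WFVAE_DISTILL_FORMAL/wfvae.ckpt", map_location="cpu")
+    # Prefer EMA weights when the checkpoint carries them ("Load from ema model!").
+    state = ckpt.get("ema_state_dict", ckpt.get("state_dict", ckpt))  # assumed key names
+    # strict=False tolerates the fixed wavelet filter buffers listed above,
+    # which are constructed in code rather than loaded from the checkpoint.
+    missing, unexpected = vae.load_state_dict(state, strict=False)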
+[RANK-6]: Namespace(...) identical to RANK-1 above; the same warnings, distributed-environment summary (Process index: 6, Device: npu:6) and ds_config follow (omitted).
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+(The same load report is printed once per rank, 8 times in total; duplicates omitted. The 4 missing keys are the inpainting mask-embedding layers absent from the t2v checkpoint, presumably initialized fresh.)
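+Editor's note: a sketch of how a "missing_keys ... unexpected_keys ..." report like the one above is typically produced; `model` is an assumed stand-in for the OpenSoraInpaint transformer:
+
+    from safetensors.torch import load_file
+
+    state = load_file("model_ema/diffusion_pytorch_model.safetensors")
+    result = model.load_state_dict(state, strict=False)  # returns (missing_keys, unexpected_keys)
+    print(f"missing_keys {len(result.missing_keys)} {result.missing_keys}, "
+          f"unexpected_keys {len(result.unexpected_keys)}")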
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+09/18/2024 23:07:48 - INFO - __main__ - optimizer: AdamW (
+Parameter Group 0
+ amsgrad: False
+ betas: (0.9, 0.999)
+ capturable: False
+ differentiable: False
+ eps: 1e-08
+ foreach: False
+ fused: None
+ lr: 1e-05
+ maximize: False
+ weight_decay: 0.01
+)
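+Editor's note: the dump above corresponds one-to-one with a standard torch.optim.AdamW construction; `params` is an assumed stand-in for the trainable parameters, and the remaining fields (amsgrad, capturable, foreach, fused, maximize) are torch defaults:
+
+    import torch
+
+    optimizer = torch.optim.AdamW(
+        params,
+        lr=1e-5,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0.01,
+    )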
+
+You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
+
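+Editor's note: the `legacy=False` opt-in named in the notice above would look like the following; "google/mt5-xxl" matches text_encoder_name in the run config:
+
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("google/mt5-xxl", legacy=False)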
+09/18/2024 23:07:56 - INFO - opensora.dataset.t2v_datasets - Building /home/image_data/captions/TV01_clips_final_478625_llavanext_217405_aes478625.json...
+(Interleaved tqdm progress bars from the per-rank dataset index build omitted; each rank scans the 478,625 clips at roughly 31,000 items/s.)
+
4%|▍ | 18723/478625 [00:00<00:15, 30222.22it/s][A
+
55%|█████▌ | 263870/478625 [00:08<00:06, 30862.65it/s][A
+
57%|█████▋ | 274165/478625 [00:08<00:06, 31708.02it/s][A
+
5%|▍ | 21764/478625 [00:00<00:15, 30279.48it/s][A
+
56%|█████▌ | 267122/478625 [00:08<00:06, 31348.72it/s][A
+
58%|█████▊ | 277340/478625 [00:08<00:06, 31209.07it/s][A
+
5%|▌ | 24794/478625 [00:00<00:15, 29513.99it/s][A
+
56%|█████▋ | 270262/478625 [00:08<00:06, 30911.46it/s][A
+
59%|█████▊ | 280588/478625 [00:08<00:06, 31579.25it/s][A
+
6%|▌ | 27750/478625 [00:00<00:15, 29178.31it/s][A
+
57%|█████▋ | 273431/478625 [00:08<00:06, 31137.79it/s][A
+
59%|█████▉ | 283868/478625 [00:08<00:06, 31938.36it/s][A
+
6%|▋ | 30671/478625 [00:01<00:15, 28910.61it/s][A
+
58%|█████▊ | 276549/478625 [00:08<00:06, 29813.52it/s][A
+
60%|█████▉ | 287065/478625 [00:09<00:06, 30911.31it/s][A
+
7%|▋ | 33766/478625 [00:01<00:15, 29518.55it/s][A
+
58%|█████▊ | 279816/478625 [00:09<00:06, 30635.89it/s][A
+
61%|██████ | 290304/478625 [00:09<00:06, 31340.95it/s][A
+
8%|▊ | 36919/478625 [00:01<00:14, 30116.97it/s][A
+
59%|█████▉ | 283076/478625 [00:09<00:06, 31206.29it/s][A
+
61%|██████▏ | 293446/478625 [00:09<00:06, 30753.82it/s][A
+
8%|▊ | 39934/478625 [00:01<00:14, 29613.66it/s][A
+
60%|█████▉ | 286208/478625 [00:09<00:06, 30822.10it/s][A
+
62%|██████▏ | 296705/478625 [00:09<00:05, 31285.39it/s][A
+
9%|▉ | 42927/478625 [00:01<00:14, 29706.16it/s][A
+
60%|██████ | 289298/478625 [00:09<00:06, 30676.34it/s][A
+
63%|██████▎ | 299983/478625 [00:09<00:05, 31721.12it/s][A
+
10%|▉ | 46007/478625 [00:01<00:14, 30028.71it/s][A
+
61%|██████ | 292465/478625 [00:09<00:06, 30966.57it/s][A
+
63%|██████▎ | 303161/478625 [00:09<00:05, 31248.83it/s][A
+
10%|█ | 49013/478625 [00:01<00:14, 29519.48it/s][A
+
62%|██████▏ | 295566/478625 [00:09<00:05, 30589.13it/s][A
+
64%|██████▍ | 306414/478625 [00:09<00:05, 31624.21it/s][A
+
11%|█ | 52164/478625 [00:01<00:14, 30105.72it/s][A
+
62%|██████▏ | 298756/478625 [00:09<00:05, 30973.93it/s][A
+
65%|██████▍ | 309672/478625 [00:09<00:05, 31905.09it/s][A
+
12%|█▏ | 55351/478625 [00:01<00:13, 30627.60it/s][A
+
63%|██████▎ | 301857/478625 [00:09<00:05, 30366.12it/s][A
+
65%|██████▌ | 312866/478625 [00:09<00:05, 31373.98it/s][A
+
12%|█▏ | 58417/478625 [00:01<00:13, 30255.77it/s][A
+
64%|██████▎ | 305019/478625 [00:09<00:05, 30730.98it/s][A
+
66%|██████▌ | 316105/478625 [00:10<00:05, 31671.33it/s][A
+
13%|█▎ | 61446/478625 [00:02<00:13, 29874.62it/s][A
+
64%|██████▍ | 308290/478625 [00:09<00:05, 31312.56it/s][A
+
67%|██████▋ | 319276/478625 [00:10<00:05, 31178.76it/s][A
+
13%|█▎ | 64436/478625 [00:02<00:14, 29427.54it/s][A
+
65%|██████▌ | 311425/478625 [00:10<00:05, 30953.26it/s][A
+
67%|██████▋ | 322398/478625 [00:10<00:05, 31117.65it/s][A
+
14%|█▍ | 67460/478625 [00:02<00:13, 29664.27it/s][A
+
66%|██████▌ | 314711/478625 [00:10<00:05, 31513.21it/s][A
+
68%|██████▊ | 325623/478625 [00:10<00:04, 31449.66it/s][A
+
15%|█▍ | 70659/478625 [00:02<00:13, 30349.32it/s][A
+
66%|██████▋ | 317903/478625 [00:10<00:05, 31630.60it/s][A
+
69%|██████▊ | 328771/478625 [00:10<00:04, 30811.84it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
15%|█▌ | 73697/478625 [00:02<00:13, 29868.03it/s][A
+
67%|██████▋ | 321069/478625 [00:10<00:05, 30218.69it/s][A
+
69%|██████▉ | 332009/478625 [00:10<00:04, 31267.81it/s][A
+
16%|█▌ | 76867/478625 [00:02<00:13, 30405.27it/s][A
+
68%|██████▊ | 324313/478625 [00:10<00:05, 30859.53it/s][A
+
70%|███████ | 335222/478625 [00:10<00:04, 30879.02it/s][A
+
17%|█▋ | 79911/478625 [00:02<00:13, 29969.90it/s][A
+
68%|██████▊ | 327412/478625 [00:10<00:04, 30567.95it/s][A
+
71%|███████ | 338456/478625 [00:10<00:04, 31303.09it/s][A
+
17%|█▋ | 82912/478625 [00:02<00:13, 29617.66it/s][A
+
69%|██████▉ | 330596/478625 [00:10<00:04, 30937.60it/s][A
+
71%|███████▏ | 341723/478625 [00:10<00:04, 31703.75it/s][A
+
18%|█▊ | 86053/478625 [00:02<00:13, 30141.52it/s][A
+
70%|██████▉ | 333845/478625 [00:10<00:04, 31394.27it/s][A
+
72%|███████▏ | 344897/478625 [00:10<00:04, 31234.90it/s][A
+
19%|█▊ | 89252/478625 [00:02<00:12, 30684.40it/s][A
+
70%|███████ | 336991/478625 [00:10<00:04, 30561.87it/s][A
+
73%|███████▎ | 348138/478625 [00:11<00:04, 31579.87it/s][A
+
19%|█▉ | 92324/478625 [00:03<00:12, 30162.62it/s][A
+
71%|███████ | 340280/478625 [00:11<00:04, 31238.45it/s][A
+
73%|███████▎ | 351394/478625 [00:11<00:03, 31845.45it/s][A
+
20%|█▉ | 95413/478625 [00:03<00:12, 30374.70it/s][A
+
72%|███████▏ | 343539/478625 [00:11<00:04, 31633.26it/s][A
+
74%|███████▍ | 354582/478625 [00:11<00:03, 31342.20it/s][A
+
21%|██ | 98454/478625 [00:03<00:13, 29145.37it/s][A
+
72%|███████▏ | 346709/478625 [00:11<00:04, 31159.81it/s][A
+
75%|███████▍ | 357720/478625 [00:11<00:03, 31231.13it/s][A
+
21%|██ | 101539/478625 [00:03<00:12, 29635.40it/s][A
+
73%|███████▎ | 349997/478625 [00:11<00:04, 31629.65it/s][A
+
75%|███████▌ | 360846/478625 [00:11<00:03, 30749.80it/s][A
+
22%|██▏ | 104750/478625 [00:03<00:12, 30354.90it/s][A
+
74%|███████▍ | 353165/478625 [00:11<00:04, 31230.86it/s][A
+
76%|███████▌ | 364080/478625 [00:11<00:03, 31215.02it/s][A
+
23%|██▎ | 107795/478625 [00:03<00:12, 30066.45it/s][A
+
74%|███████▍ | 356410/478625 [00:11<00:03, 31588.70it/s][A
+
77%|███████▋ | 367317/478625 [00:11<00:03, 31553.49it/s][A
+
23%|██▎ | 110978/478625 [00:03<00:12, 30583.37it/s][A
+
75%|███████▌ | 359691/478625 [00:11<00:03, 31947.75it/s][A
+
77%|███████▋ | 370475/478625 [00:11<00:03, 30821.80it/s][A
+
24%|██▍ | 114180/478625 [00:03<00:11, 31007.13it/s][A
+
76%|███████▌ | 362889/478625 [00:11<00:03, 31466.48it/s][A
+
78%|███████▊ | 373753/478625 [00:11<00:03, 31392.71it/s][A
+
25%|██▍ | 117286/478625 [00:03<00:11, 30437.52it/s][A
+
79%|███████▉ | 377009/478625 [00:11<00:03, 31734.94it/s][A
+
76%|███████▋ | 366039/478625 [00:11<00:03, 30752.19it/s][A
0%| | 0/1 [00:00, ?it/s]
+
25%|██▌ | 120474/478625 [00:04<00:11, 30859.23it/s][A
+
79%|███████▉ | 380187/478625 [00:12<00:03, 31207.63it/s][A
+
77%|███████▋ | 369119/478625 [00:11<00:03, 30229.08it/s][A
+
26%|██▌ | 123565/478625 [00:04<00:11, 30232.16it/s][A
+
80%|████████ | 383429/478625 [00:12<00:03, 31562.27it/s][A
+
78%|███████▊ | 372336/478625 [00:12<00:03, 30791.54it/s][A
+
26%|██▋ | 126766/478625 [00:04<00:11, 30751.80it/s][A
+
78%|███████▊ | 375595/478625 [00:12<00:03, 31316.56it/s][A
+
81%|████████ | 386589/478625 [00:12<00:02, 31112.82it/s][A
+
27%|██▋ | 129847/478625 [00:04<00:11, 30611.96it/s][A
+
81%|████████▏ | 389862/478625 [00:12<00:02, 31585.18it/s][A
+
79%|███████▉ | 378732/478625 [00:12<00:03, 30663.54it/s][A
+
28%|██▊ | 132912/478625 [00:04<00:11, 29057.50it/s][A
+
82%|████████▏ | 393024/478625 [00:12<00:02, 31437.42it/s][A
+
80%|███████▉ | 382006/478625 [00:12<00:03, 31269.36it/s][A
+
28%|██▊ | 136041/478625 [00:04<00:11, 29694.76it/s][A
+
80%|████████ | 385269/478625 [00:12<00:02, 31668.15it/s][A
+
83%|████████▎ | 396171/478625 [00:12<00:02, 30955.04it/s][A
+
29%|██▉ | 139215/478625 [00:04<00:11, 30285.75it/s][A
+
83%|████████▎ | 399428/478625 [00:12<00:02, 31428.53it/s][A
+
81%|████████ | 388441/478625 [00:12<00:02, 31094.17it/s][A
+
30%|██▉ | 142257/478625 [00:04<00:11, 30007.21it/s][A
+
84%|████████▍ | 402682/478625 [00:12<00:02, 31755.54it/s][A
+
82%|████████▏ | 391727/478625 [00:12<00:02, 31610.00it/s][A
+
30%|███ | 145434/478625 [00:04<00:10, 30521.09it/s][A
+
85%|████████▍ | 405861/478625 [00:12<00:02, 31192.49it/s][A
+
83%|████████▎ | 394893/478625 [00:12<00:02, 30721.28it/s][A
+
31%|███ | 148655/478625 [00:04<00:10, 31016.76it/s][A
+
85%|████████▌ | 409107/478625 [00:12<00:02, 31562.69it/s][A
+
83%|████████▎ | 398092/478625 [00:12<00:02, 31087.25it/s][A
+
32%|███▏ | 151763/478625 [00:05<00:10, 30456.41it/s][A
+
86%|████████▌ | 412267/478625 [00:13<00:02, 31043.92it/s][A
+
84%|████████▍ | 401332/478625 [00:12<00:02, 31470.58it/s][A
+
32%|███▏ | 154960/478625 [00:05<00:10, 30897.05it/s][A
+
87%|████████▋ | 415417/478625 [00:13<00:02, 31177.25it/s][A
+
85%|████████▍ | 404485/478625 [00:13<00:02, 30871.77it/s][A
+
33%|███▎ | 158055/478625 [00:05<00:10, 30373.22it/s][A
+
87%|████████▋ | 418689/478625 [00:13<00:01, 31630.21it/s][A
+
85%|████████▌ | 407627/478625 [00:13<00:02, 31030.14it/s][A
+
34%|███▎ | 161264/478625 [00:05<00:10, 30875.62it/s][A
+
88%|████████▊ | 421855/478625 [00:13<00:01, 31148.05it/s][A
+
86%|████████▌ | 410735/478625 [00:13<00:02, 30525.30it/s][A
+
34%|███▍ | 164456/478625 [00:05<00:10, 31181.11it/s][A
+
89%|████████▉ | 425125/478625 [00:13<00:01, 31602.41it/s][A
+
86%|████████▋ | 413792/478625 [00:13<00:02, 30189.56it/s][A
+
35%|███▌ | 167578/478625 [00:05<00:10, 29750.32it/s][A
+
89%|████████▉ | 428289/478625 [00:13<00:01, 30533.00it/s][A
+
87%|████████▋ | 416970/478625 [00:13<00:02, 30651.73it/s][A
+
36%|███▌ | 170791/478625 [00:05<00:10, 30433.92it/s][A
+
90%|█████████ | 431551/478625 [00:13<00:01, 31135.59it/s][A
+
88%|████████▊ | 420039/478625 [00:13<00:01, 30513.85it/s][A
+
36%|███▋ | 173896/478625 [00:05<00:09, 30610.02it/s][A
+
91%|█████████ | 434840/478625 [00:13<00:01, 31647.97it/s][A
+
88%|████████▊ | 423312/478625 [00:13<00:01, 31167.90it/s][A
+
37%|███▋ | 176968/478625 [00:05<00:09, 30411.21it/s][A
+
89%|████████▉ | 426578/478625 [00:13<00:01, 31609.81it/s][A
+
92%|█████████▏| 438012/478625 [00:13<00:01, 31220.62it/s][A
+
38%|███▊ | 180066/478625 [00:05<00:09, 30576.70it/s][A
+
92%|█████████▏| 441300/478625 [00:14<00:01, 31707.05it/s][A
+
90%|████████▉ | 429742/478625 [00:13<00:01, 31203.78it/s][A
+
38%|███▊ | 183129/478625 [00:06<00:09, 29933.58it/s][A
+
93%|█████████▎| 444562/478625 [00:14<00:01, 31975.96it/s][A
+
90%|█████████ | 433024/478625 [00:13<00:01, 31680.24it/s][A
+
39%|███▉ | 186366/478625 [00:06<00:09, 30644.95it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
91%|█████████ | 436311/478625 [00:14<00:01, 32030.56it/s][A
+
94%|█████████▎| 447764/478625 [00:14<00:00, 31474.08it/s][A
+
40%|███▉ | 189632/478625 [00:06<00:09, 31238.24it/s][A
+
94%|█████████▍| 450991/478625 [00:14<00:00, 31705.97it/s][A
+
92%|█████████▏| 439517/478625 [00:14<00:01, 31432.83it/s][A
+
40%|████ | 192762/478625 [00:06<00:09, 30571.78it/s][A
+
93%|█████████▎| 442805/478625 [00:14<00:01, 31856.95it/s][A
+
95%|█████████▍| 454165/478625 [00:14<00:00, 31254.71it/s][A
+
41%|████ | 195990/478625 [00:06<00:09, 31069.09it/s][A
+
96%|█████████▌| 457460/478625 [00:14<00:00, 31750.51it/s][A
+
93%|█████████▎| 445994/478625 [00:14<00:01, 31158.79it/s][A
+
42%|████▏ | 199226/478625 [00:06<00:08, 31447.15it/s][A
+
96%|█████████▋| 460738/478625 [00:14<00:00, 32054.18it/s][A
+
94%|█████████▍| 449233/478625 [00:14<00:00, 31518.57it/s][A
+
42%|████▏ | 202376/478625 [00:06<00:09, 30034.98it/s][A
+
95%|█████████▍| 452513/478625 [00:14<00:00, 31895.00it/s][A
+
97%|█████████▋| 463947/478625 [00:14<00:00, 30849.45it/s][A
+
43%|████▎ | 205567/478625 [00:06<00:08, 30573.14it/s][A
+
98%|█████████▊| 467124/478625 [00:14<00:00, 31114.03it/s][A
+
95%|█████████▌| 455707/478625 [00:14<00:00, 30543.39it/s][A
+
44%|████▎ | 208638/478625 [00:06<00:08, 30292.98it/s][A
+
98%|█████████▊| 470255/478625 [00:14<00:00, 30837.64it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
96%|█████████▌| 458994/478625 [00:14<00:00, 31211.41it/s][A
+
44%|████▍ | 211890/478625 [00:06<00:08, 30940.18it/s][A
+
99%|█████████▉| 473520/478625 [00:15<00:00, 31365.32it/s][A
+
97%|█████████▋| 462128/478625 [00:14<00:00, 30903.70it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
45%|████▍ | 215080/478625 [00:07<00:08, 31220.13it/s][A
+
100%|█████████▉| 476792/478625 [00:15<00:00, 31763.16it/s][A
+
97%|█████████▋| 465356/478625 [00:15<00:00, 31302.27it/s][A
100%|██████████| 478625/478625 [00:15<00:00, 31465.81it/s]
+
100%|██████████| 1/1 [00:21<00:00, 21.57s/it]
100%|██████████| 1/1 [00:21<00:00, 21.57s/it]
+
+
46%|████▌ | 218209/478625 [00:07<00:08, 30784.55it/s][A
+
98%|█████████▊| 468544/478625 [00:15<00:00, 31469.55it/s][A
+
46%|████▋ | 221393/478625 [00:07<00:08, 31093.72it/s][A
+
99%|█████████▊| 471697/478625 [00:15<00:00, 30957.87it/s][A
+
47%|████▋ | 224657/478625 [00:07<00:08, 31549.27it/s][A
+
99%|█████████▉| 474947/478625 [00:15<00:00, 31409.88it/s][A
+
48%|████▊ | 227816/478625 [00:07<00:08, 30974.33it/s][A
+
100%|█████████▉| 478161/478625 [00:15<00:00, 31623.23it/s][A
100%|██████████| 478625/478625 [00:15<00:00, 31009.82it/s]
+
100%|██████████| 1/1 [00:21<00:00, 21.77s/it]
100%|██████████| 1/1 [00:21<00:00, 21.77s/it]
+
+
48%|████▊ | 230987/478625 [00:07<00:07, 31188.57it/s][A
0%| | 0/1 [00:00, ?it/s]
+
49%|████▉ | 234110/478625 [00:07<00:08, 30559.09it/s][A
+
50%|████▉ | 237356/478625 [00:07<00:07, 31114.78it/s][Atime 22.3045437335968
+n_elements: 474899
+data length: 474899
+
+
50%|█████ | 240473/478625 [00:07<00:07, 30643.52it/s][A
+
51%|█████ | 243542/478625 [00:08<00:07, 30390.27it/s][A
+
52%|█████▏ | 246738/478625 [00:08<00:07, 30849.54it/s][Atime 22.441800355911255
+
+
52%|█████▏ | 249989/478625 [00:08<00:07, 31339.22it/s][An_elements: 474899
+data length: 474899
+
+
53%|█████▎ | 253126/478625 [00:08<00:07, 30829.92it/s][A
0%| | 0/1 [00:00, ?it/s]
+
54%|█████▎ | 256384/478625 [00:08<00:07, 31341.60it/s][A
+
54%|█████▍ | 259522/478625 [00:08<00:07, 30434.41it/s][A
0%| | 0/1 [00:00, ?it/s]
+
55%|█████▍ | 262769/478625 [00:08<00:06, 30983.99it/s][A
+
56%|█████▌ | 266044/478625 [00:08<00:06, 31499.00it/s][A
+
56%|█████▌ | 269200/478625 [00:08<00:06, 30878.50it/s][A
+
57%|█████▋ | 272431/478625 [00:08<00:06, 31294.40it/s][A
+
58%|█████▊ | 275566/478625 [00:09<00:06, 29862.75it/s][A
+
58%|█████▊ | 278797/478625 [00:09<00:06, 30561.50it/s][A
+
59%|█████▉ | 282010/478625 [00:09<00:06, 31014.85it/s][A
+
60%|█████▉ | 285123/478625 [00:09<00:06, 30578.01it/s][A
+
60%|██████ | 288190/478625 [00:09<00:06, 30527.75it/s][A
+
61%|██████ | 291358/478625 [00:09<00:06, 30830.66it/s][A
+
62%|██████▏ | 294446/478625 [00:09<00:06, 30484.48it/s][A
+
62%|██████▏ | 297655/478625 [00:09<00:05, 30954.48it/s][A
+
63%|██████▎ | 300762/478625 [00:09<00:05, 30610.65it/s][A
+
64%|██████▎ | 304013/478625 [00:09<00:05, 31168.05it/s][A
+
64%|██████▍ | 307222/478625 [00:10<00:05, 31438.74it/s][A
+
65%|██████▍ | 310369/478625 [00:10<00:05, 30008.32it/s][A
+
66%|██████▌ | 313546/478625 [00:10<00:05, 30513.49it/s][A
+
66%|██████▌ | 316766/478625 [00:10<00:05, 31003.45it/s][A
+
67%|██████▋ | 319877/478625 [00:10<00:05, 30565.74it/s][A
+
68%|██████▊ | 323122/478625 [00:10<00:04, 31114.80it/s][A
+
68%|██████▊ | 326241/478625 [00:10<00:04, 30576.01it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
69%|██████▉ | 329396/478625 [00:10<00:04, 30859.38it/s][A
+
1%| | 3134/478625 [00:00<00:15, 31304.06it/s][A
+
69%|██████▉ | 332636/478625 [00:10<00:04, 31311.39it/s][A
+
1%|▏ | 6265/478625 [00:00<00:15, 30731.04it/s][A
+
70%|███████ | 335772/478625 [00:11<00:04, 30518.13it/s][A
+
2%|▏ | 9514/478625 [00:00<00:14, 31524.16it/s][A
+
71%|███████ | 339001/478625 [00:11<00:04, 31033.87it/s][A
+
3%|▎ | 12744/478625 [00:00<00:14, 31826.47it/s][A
+
72%|███████▏ | 342242/478625 [00:11<00:04, 31438.34it/s][A
+
3%|▎ | 15928/478625 [00:00<00:15, 30641.07it/s][A
+
72%|███████▏ | 345391/478625 [00:11<00:04, 30883.26it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
4%|▍ | 19151/478625 [00:00<00:14, 31164.67it/s][A
+
73%|███████▎ | 348485/478625 [00:11<00:04, 30243.38it/s][A
+
5%|▍ | 22274/478625 [00:00<00:14, 30575.15it/s][A
+
73%|███████▎ | 351515/478625 [00:11<00:04, 29723.54it/s][A
+
5%|▌ | 25496/478625 [00:00<00:14, 31083.84it/s][A
+
74%|███████▍ | 354747/478625 [00:11<00:04, 30475.25it/s][A
+
6%|▌ | 28711/478625 [00:00<00:14, 31409.56it/s][A
+
75%|███████▍ | 357927/478625 [00:11<00:03, 30860.34it/s][A
+
7%|▋ | 31857/478625 [00:01<00:14, 30727.39it/s][A
+
75%|███████▌ | 361018/478625 [00:11<00:03, 30275.10it/s][A
+
7%|▋ | 35073/478625 [00:01<00:14, 31154.41it/s][A
+
76%|███████▌ | 364213/478625 [00:11<00:03, 30762.39it/s][A
+
8%|▊ | 38306/478625 [00:01<00:13, 31504.94it/s][A
+
77%|███████▋ | 367392/478625 [00:12<00:03, 31062.68it/s][A
+
9%|▊ | 41461/478625 [00:01<00:14, 30849.82it/s][A
+
77%|███████▋ | 370503/478625 [00:12<00:03, 30107.04it/s][A
+
9%|▉ | 44688/478625 [00:01<00:13, 31266.70it/s][A
+
78%|███████▊ | 373747/478625 [00:12<00:03, 30783.78it/s][A
+
10%|▉ | 47820/478625 [00:01<00:14, 30652.62it/s][A
+
79%|███████▊ | 376834/478625 [00:12<00:03, 30360.88it/s][A
+
11%|█ | 51030/478625 [00:01<00:13, 31074.93it/s][A
+
79%|███████▉ | 380078/478625 [00:12<00:03, 30967.25it/s][A
+
11%|█▏ | 54252/478625 [00:01<00:13, 31410.92it/s][A
+
80%|████████ | 383181/478625 [00:12<00:03, 30321.45it/s][A
+
12%|█▏ | 57397/478625 [00:01<00:13, 30866.65it/s][A
+
81%|████████ | 386220/478625 [00:12<00:03, 30147.71it/s][A
+
13%|█▎ | 60631/478625 [00:01<00:13, 31298.87it/s][A
+
81%|████████▏ | 389447/478625 [00:12<00:02, 30766.60it/s][A
+
13%|█▎ | 63853/478625 [00:02<00:13, 31569.43it/s][A
0%| | 0/1 [00:00, ?it/s]
+
82%|████████▏ | 392712/478625 [00:12<00:02, 31319.82it/s][A
+
14%|█▍ | 67014/478625 [00:02<00:13, 30954.15it/s][A
+
83%|████████▎ | 395848/478625 [00:12<00:02, 30419.72it/s][A
+
15%|█▍ | 70236/478625 [00:02<00:13, 31324.27it/s][A
+
83%|████████▎ | 399091/478625 [00:13<00:02, 31004.89it/s][A
+
15%|█▌ | 73373/478625 [00:02<00:13, 30790.44it/s][A
+
84%|████████▍ | 402199/478625 [00:13<00:02, 30413.17it/s][A
+
16%|█▌ | 76601/478625 [00:02<00:12, 31225.84it/s][A
+
85%|████████▍ | 405372/478625 [00:13<00:02, 30794.29it/s][A
+
17%|█▋ | 79822/478625 [00:02<00:12, 31513.27it/s][A
+
85%|████████▌ | 408575/478625 [00:13<00:02, 31155.32it/s][A
+
17%|█▋ | 82977/478625 [00:02<00:12, 30621.03it/s][A
+
86%|████████▌ | 411696/478625 [00:13<00:02, 30435.80it/s][A
+
18%|█▊ | 86188/478625 [00:02<00:12, 31051.75it/s][A
+
87%|████████▋ | 414780/478625 [00:13<00:02, 30553.47it/s][A
+
19%|█▊ | 89396/478625 [00:02<00:12, 31353.00it/s][A
+
87%|████████▋ | 417840/478625 [00:13<00:02, 29664.73it/s][A
+
19%|█▉ | 92537/478625 [00:02<00:12, 30739.49it/s][A
+
88%|████████▊ | 420814/478625 [00:13<00:01, 29662.37it/s][A
+
20%|██ | 95768/478625 [00:03<00:12, 31198.68it/s][A
+
89%|████████▊ | 423915/478625 [00:13<00:01, 30056.11it/s][A
+
21%|██ | 98893/478625 [00:03<00:12, 30316.62it/s][A
+
89%|████████▉ | 427041/478625 [00:14<00:01, 30410.04it/s][A
+
21%|██▏ | 102108/478625 [00:03<00:12, 30846.91it/s][A
+
90%|████████▉ | 430086/478625 [00:14<00:01, 30106.12it/s][A
+
22%|██▏ | 105349/478625 [00:03<00:11, 31302.36it/s][A
+
91%|█████████ | 433318/478625 [00:14<00:01, 30759.25it/s][A
+
23%|██▎ | 108486/478625 [00:03<00:11, 30936.87it/s][A
+
91%|█████████ | 436398/478625 [00:14<00:01, 29986.79it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
23%|██▎ | 111759/478625 [00:03<00:11, 31460.86it/s][A
+
92%|█████████▏| 439606/478625 [00:14<00:01, 30596.75it/s][A
+
1%| | 3256/478625 [00:00<00:14, 32549.48it/s][A
+
24%|██▍ | 115016/478625 [00:03<00:11, 31786.64it/s][A
+
93%|█████████▎| 442838/478625 [00:14<00:01, 31102.80it/s][A
+
1%|▏ | 6511/478625 [00:00<00:14, 31944.27it/s][A
+
25%|██▍ | 118199/478625 [00:03<00:11, 31172.87it/s][A
+
93%|█████████▎| 445954/478625 [00:14<00:01, 30454.07it/s][A
+
2%|▏ | 9835/478625 [00:00<00:14, 32526.50it/s][A
+
25%|██▌ | 121321/478625 [00:03<00:11, 31126.97it/s][A
+
94%|█████████▍| 449187/478625 [00:14<00:00, 31002.71it/s][A
+
3%|▎ | 13150/478625 [00:00<00:14, 32768.35it/s][A
+
26%|██▌ | 124437/478625 [00:04<00:11, 30713.45it/s][A
+
94%|█████████▍| 452293/478625 [00:14<00:00, 30499.80it/s][A
+
3%|▎ | 16428/478625 [00:00<00:14, 32063.95it/s][A
+
27%|██▋ | 127719/478625 [00:04<00:11, 31287.28it/s][A
+
95%|█████████▌| 455348/478625 [00:14<00:00, 30242.07it/s][A
+
4%|▍ | 19772/478625 [00:00<00:14, 32522.72it/s][A
+
27%|██▋ | 130990/478625 [00:04<00:10, 31704.72it/s][A
+
96%|█████████▌| 458577/478625 [00:15<00:00, 30839.89it/s][A
+
5%|▍ | 23027/478625 [00:00<00:14, 31914.73it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
28%|██▊ | 134164/478625 [00:04<00:11, 31106.99it/s][A
+
96%|█████████▋| 461665/478625 [00:15<00:00, 30528.06it/s][A
+
6%|▌ | 26377/478625 [00:00<00:13, 32407.27it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
1%| | 3319/478625 [00:00<00:14, 33186.31it/s][A
+
29%|██▊ | 137425/478625 [00:04<00:10, 31547.75it/s][A
+
97%|█████████▋| 464766/478625 [00:15<00:00, 30667.15it/s][A
+
6%|▌ | 29699/478625 [00:00<00:13, 32656.14it/s][A
+
1%| | 3086/478625 [00:00<00:15, 30849.95it/s][A
+
1%|▏ | 6638/478625 [00:00<00:14, 32444.22it/s][A
+
29%|██▉ | 140584/478625 [00:04<00:11, 30524.98it/s][A
+
98%|█████████▊| 467840/478625 [00:15<00:00, 30687.25it/s][A
+
7%|▋ | 32968/478625 [00:01<00:13, 32074.78it/s][A
+
1%|▏ | 6171/478625 [00:00<00:15, 30737.97it/s][A
+
2%|▏ | 10001/478625 [00:00<00:14, 32976.70it/s][A
+
30%|███ | 143813/478625 [00:04<00:10, 31033.88it/s][A
+
98%|█████████▊| 470911/478625 [00:15<00:00, 30192.80it/s][A
+
8%|▊ | 36281/478625 [00:01<00:13, 32388.81it/s][A
+
2%|▏ | 9305/478625 [00:00<00:15, 31008.85it/s][A
+
3%|▎ | 13366/478625 [00:00<00:13, 33236.17it/s][A
+
31%|███ | 147071/478625 [00:04<00:10, 31483.74it/s][A
+
99%|█████████▉| 474109/478625 [00:15<00:00, 30716.93it/s][A
+
8%|▊ | 39575/478625 [00:01<00:13, 32552.16it/s][A
+
3%|▎ | 12530/478625 [00:00<00:14, 31494.84it/s][A
+
3%|▎ | 16691/478625 [00:00<00:14, 32328.44it/s][A
+
31%|███▏ | 150226/478625 [00:04<00:10, 30920.54it/s][A
+
100%|█████████▉| 477324/478625 [00:15<00:00, 31138.80it/s][A
+
9%|▉ | 42833/478625 [00:01<00:13, 31932.81it/s][A
+
3%|▎ | 15680/478625 [00:00<00:14, 30872.48it/s][A
+
4%|▍ | 20071/478625 [00:00<00:13, 32814.76it/s][A
100%|██████████| 478625/478625 [00:15<00:00, 30497.14it/s]
+
100%|██████████| 1/1 [00:22<00:00, 22.41s/it]
100%|██████████| 1/1 [00:22<00:00, 22.41s/it]
+
+
32%|███▏ | 153487/478625 [00:04<00:10, 31412.18it/s][A
+
10%|▉ | 46147/478625 [00:01<00:13, 32288.72it/s][A
+
4%|▍ | 18908/478625 [00:00<00:14, 31343.29it/s][A
+
5%|▍ | 23357/478625 [00:00<00:14, 32195.42it/s][A
+
33%|███▎ | 156763/478625 [00:05<00:10, 31806.27it/s][A09/18/2024 23:08:19 - INFO - opensora.dataset.t2v_datasets - no_cap: 0, too_long: 3711, too_short: 2, no_resolution: 0, resolution_mismatch: 0, Counter(sample_size): Counter({'93x160x320': 84930, '29x160x320': 73201, '45x160x320': 68295, '61x160x320': 44578, '77x160x320': 38630, '93x128x320': 17805, '29x128x320': 16948, '93x224x320': 16403, '93x192x320': 15259, '45x128x320': 14788, '61x128x320': 9795, '29x224x320': 8615, '29x192x320': 8528, '45x224x320': 8477, '45x192x320': 8309, '77x128x320': 7730, '61x224x320': 6211, '61x192x320': 5983, '77x224x320': 5788, '77x192x320': 5268, '93x256x320': 3164, '45x256x320': 1510, '29x256x320': 1480, '61x256x320': 1152, '77x256x320': 1090, '93x96x320': 282, '45x96x320': 200, '29x96x320': 169, '61x96x320': 163, '77x96x320': 148}), cnt_movie: 0, cnt_img: 0, before filter: 478625, after filter: 474899
+
+
5%|▍ | 22146/478625 [00:00<00:14, 31676.44it/s][A
+
10%|█ | 49380/478625 [00:01<00:13, 31850.60it/s][A
+
6%|▌ | 26716/478625 [00:00<00:13, 32626.31it/s][A
+
33%|███▎ | 159949/478625 [00:05<00:10, 31211.81it/s][A
+
11%|█ | 52696/478625 [00:01<00:13, 32235.51it/s][A
+
5%|▌ | 25316/478625 [00:00<00:14, 30792.73it/s][A
+
6%|▋ | 30077/478625 [00:00<00:13, 32925.93it/s][A
+
34%|███▍ | 163076/478625 [00:05<00:10, 30966.07it/s][A
+
12%|█▏ | 56008/478625 [00:01<00:13, 32496.72it/s][A
+
6%|▌ | 28539/478625 [00:00<00:14, 31218.57it/s][A
+
35%|███▍ | 166177/478625 [00:05<00:10, 30631.98it/s][A
+
7%|▋ | 33373/478625 [00:01<00:14, 31459.77it/s][A09/18/2024 23:08:19 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | motion_score: 474899, cnt_no_motion: 13 | 192077 > 0.95, 0.7 > 65730 Mean: 0.8593367888417824, Var: 0.03075349223473551, Std: 0.17536673639757203, Min: -0.0717548280954361, Max: 1.0
+
+
12%|█▏ | 59261/478625 [00:01<00:13, 31960.43it/s][A
+
7%|▋ | 31666/478625 [00:01<00:14, 30786.80it/s][A
+
35%|███▌ | 169428/478625 [00:05<00:09, 31180.32it/s][A
+
8%|▊ | 36718/478625 [00:01<00:13, 32043.17it/s][A
+
13%|█▎ | 62573/478625 [00:01<00:12, 32300.33it/s][A
+
7%|▋ | 34890/478625 [00:01<00:14, 31221.03it/s][A
+
36%|███▌ | 172690/478625 [00:05<00:09, 31602.06it/s][A
+
8%|▊ | 39935/478625 [00:01<00:13, 31639.28it/s][A
+
14%|█▎ | 65806/478625 [00:02<00:12, 31800.45it/s][A
+
8%|▊ | 38111/478625 [00:01<00:13, 31515.33it/s][A09/18/2024 23:08:19 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | aesthetic_score: 478625, cnt_no_aesthetic: 0 | 14374 > 5.75, 4.5 > 113830 Mean: 4.846693657797633, Var: 0.24147353645946146, Std: 0.4913995690468821, Min: 2.685077953338623, Max: 6.742257436116536
+
+
9%|▉ | 43294/478625 [00:01<00:13, 32184.17it/s][A
+
37%|███▋ | 175854/478625 [00:05<00:09, 31042.22it/s][Atime 23.148473262786865
+n_elements: 474899
+data length: 474899
+
+
14%|█▍ | 69165/478625 [00:02<00:12, 32323.43it/s][A
+
9%|▊ | 41266/478625 [00:01<00:14, 30847.20it/s][A
+
10%|▉ | 46661/478625 [00:01<00:13, 32621.70it/s][A
+
37%|███▋ | 179105/478625 [00:05<00:09, 31471.66it/s][A
+
15%|█▌ | 72464/478625 [00:02<00:12, 32518.54it/s][A
+
9%|▉ | 44472/478625 [00:01<00:13, 31204.24it/s][A
+
38%|███▊ | 182256/478625 [00:05<00:09, 31129.07it/s][A
+
10%|█ | 49930/478625 [00:01<00:13, 32038.71it/s][A
+
16%|█▌ | 75719/478625 [00:02<00:12, 31974.57it/s][A
+
10%|▉ | 47703/478625 [00:01<00:13, 31530.12it/s][A
+
11%|█ | 53274/478625 [00:01<00:13, 32449.29it/s][A
+
39%|███▊ | 185372/478625 [00:05<00:09, 30763.46it/s][A
+
17%|█▋ | 79052/478625 [00:02<00:12, 32372.89it/s][A
+
11%|█ | 50860/478625 [00:01<00:13, 31007.92it/s][A
+
12%|█▏ | 56608/478625 [00:01<00:12, 32709.91it/s][A
+
39%|███▉ | 188633/478625 [00:06<00:09, 31305.18it/s][A
+
17%|█▋ | 82293/478625 [00:02<00:12, 31849.69it/s][A
+
11%|█▏ | 54086/478625 [00:01<00:13, 31374.47it/s][A
+
13%|█▎ | 59884/478625 [00:01<00:13, 32120.68it/s][A
+
40%|████ | 191767/478625 [00:06<00:09, 30865.20it/s][A
+
18%|█▊ | 85605/478625 [00:02<00:12, 32221.89it/s][A
+
12%|█▏ | 57227/478625 [00:01<00:13, 30810.41it/s][A
+
13%|█▎ | 63236/478625 [00:01<00:12, 32531.53it/s][A
+
41%|████ | 195038/478625 [00:06<00:09, 31404.63it/s][A
+
19%|█▊ | 88929/478625 [00:02<00:11, 32522.28it/s][A
+
13%|█▎ | 60444/478625 [00:01<00:13, 31208.86it/s][A
+
41%|████▏ | 198294/478625 [00:06<00:08, 31744.56it/s][A
+
14%|█▍ | 66494/478625 [00:02<00:13, 31197.09it/s][A
+
19%|█▉ | 92184/478625 [00:02<00:12, 31870.17it/s][A
+
13%|█▎ | 63678/478625 [00:02<00:13, 31541.89it/s][A09/18/2024 23:08:20 - INFO - __main__ - after train_dataloader
+09/18/2024 23:08:20 - INFO - __main__ - before accelerator.prepare
+
+
42%|████▏ | 201472/478625 [00:06<00:08, 31225.97it/s][A[2024-09-18 23:08:20,602] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.6, git-hash=unknown, git-branch=unknown
+
+
15%|█▍ | 69857/478625 [00:02<00:12, 31896.77it/s][A
+
20%|█▉ | 95492/478625 [00:02<00:11, 32224.46it/s][A
+
14%|█▍ | 66836/478625 [00:02<00:13, 30979.07it/s][A
+
43%|████▎ | 204598/478625 [00:06<00:08, 31108.72it/s][A
+
15%|█▌ | 73182/478625 [00:02<00:12, 32291.02it/s][A
+
21%|██ | 98837/478625 [00:03<00:11, 32583.11it/s][A
+
15%|█▍ | 70065/478625 [00:02<00:13, 31362.99it/s][A
+
43%|████▎ | 207817/478625 [00:06<00:08, 31426.07it/s][A
+
16%|█▌ | 76421/478625 [00:02<00:12, 31880.48it/s][A
+
21%|██▏ | 102099/478625 [00:03<00:11, 31943.57it/s][A
+
15%|█▌ | 73283/478625 [00:02<00:12, 31600.54it/s][A
+
44%|████▍ | 210962/478625 [00:06<00:08, 30974.04it/s][A
+
17%|█▋ | 79764/478625 [00:02<00:12, 32332.77it/s][A
+
22%|██▏ | 105408/478625 [00:03<00:11, 32277.93it/s][A
+
16%|█▌ | 76446/478625 [00:02<00:12, 30983.09it/s][A
+
45%|████▍ | 214234/478625 [00:06<00:08, 31486.33it/s][A
+
17%|█▋ | 83004/478625 [00:02<00:12, 31951.61it/s][A
+
23%|██▎ | 108640/478625 [00:03<00:11, 31862.12it/s][A
+
17%|█▋ | 79682/478625 [00:02<00:12, 31384.55it/s][A
+
45%|████▌ | 217386/478625 [00:06<00:08, 31035.29it/s][A
+
18%|█▊ | 86335/478625 [00:02<00:12, 32349.76it/s][A
+
23%|██▎ | 111969/478625 [00:03<00:11, 32280.11it/s][A
+
17%|█▋ | 82825/478625 [00:02<00:12, 30847.36it/s][A
+
46%|████▌ | 220670/478625 [00:07<00:08, 31563.99it/s][A
+
19%|█▊ | 89705/478625 [00:02<00:11, 32747.94it/s][A
+
24%|██▍ | 115214/478625 [00:03<00:11, 32328.95it/s][A
+
18%|█▊ | 86053/478625 [00:02<00:12, 31264.36it/s][A
+
47%|████▋ | 223830/478625 [00:07<00:08, 31498.32it/s][A
+
19%|█▉ | 92984/478625 [00:02<00:12, 32073.10it/s][A
+
25%|██▍ | 118450/478625 [00:03<00:11, 31769.21it/s][A
+
19%|█▊ | 89278/478625 [00:02<00:12, 31554.75it/s][A
+
47%|████▋ | 226982/478625 [00:07<00:08, 31069.16it/s][A
+
20%|██ | 96321/478625 [00:02<00:11, 32451.03it/s][A
+
25%|██▌ | 121768/478625 [00:03<00:11, 32182.79it/s][A
+
19%|█▉ | 92437/478625 [00:02<00:12, 30878.69it/s][A
+
48%|████▊ | 230186/478625 [00:07<00:07, 31352.50it/s][A
+
21%|██ | 99571/478625 [00:03<00:12, 31111.65it/s][A
+
26%|██▌ | 124990/478625 [00:03<00:11, 31633.37it/s][A
+
20%|█▉ | 95631/478625 [00:03<00:12, 31186.87it/s][A
+
49%|████▊ | 233324/478625 [00:07<00:07, 30948.17it/s][A
+
21%|██▏ | 102896/478625 [00:03<00:11, 31725.13it/s][A
+
27%|██▋ | 128271/478625 [00:03<00:10, 31977.32it/s][A
+
21%|██ | 98872/478625 [00:03<00:12, 30781.23it/s][A
+
49%|████▉ | 236622/478625 [00:07<00:07, 31545.20it/s][A
+
22%|██▏ | 106229/478625 [00:03<00:11, 32190.65it/s][A
+
27%|██▋ | 131595/478625 [00:04<00:10, 32347.45it/s][A
+
21%|██▏ | 102068/478625 [00:03<00:12, 31122.58it/s][A
+
50%|█████ | 239890/478625 [00:07<00:07, 31879.41it/s][A
+
23%|██▎ | 109458/478625 [00:03<00:11, 31763.37it/s][A
+
28%|██▊ | 134833/478625 [00:04<00:10, 31818.12it/s][A
+
22%|██▏ | 105266/478625 [00:03<00:11, 31373.93it/s][A
+
51%|█████ | 243081/478625 [00:07<00:07, 31328.54it/s][A
+
24%|██▎ | 112795/478625 [00:03<00:11, 32232.65it/s][A
+
29%|██▉ | 138146/478625 [00:04<00:10, 32202.49it/s][A
+
23%|██▎ | 108407/478625 [00:03<00:12, 30826.23it/s][A
+
51%|█████▏ | 246218/478625 [00:07<00:07, 31251.04it/s][A
+
24%|██▍ | 116025/478625 [00:03<00:11, 31742.94it/s][A
+
30%|██▉ | 141370/478625 [00:04<00:10, 31726.82it/s][A
+
23%|██▎ | 111647/478625 [00:03<00:11, 31286.84it/s][A
+
52%|█████▏ | 249477/478625 [00:07<00:07, 31645.43it/s][A
+
25%|██▍ | 119357/478625 [00:03<00:11, 32202.30it/s][A
+
30%|███ | 144599/478625 [00:04<00:10, 31892.00it/s][A
+
24%|██▍ | 114859/478625 [00:03<00:11, 31531.22it/s][A
+
53%|█████▎ | 252644/478625 [00:08<00:07, 31234.87it/s][A
+
26%|██▌ | 122710/478625 [00:03<00:10, 32593.26it/s][A
+
31%|███ | 147918/478625 [00:04<00:10, 32275.25it/s][A
+
25%|██▍ | 118016/478625 [00:03<00:11, 30986.43it/s][A
+
53%|█████▎ | 255919/478625 [00:08<00:07, 31681.27it/s][A
+
26%|██▋ | 125974/478625 [00:03<00:10, 32087.09it/s][A
+
32%|███▏ | 151148/478625 [00:04<00:10, 31717.95it/s][A
+
25%|██▌ | 121257/478625 [00:03<00:11, 31402.02it/s][A
+
54%|█████▍ | 259090/478625 [00:08<00:07, 30982.15it/s][A
+
27%|██▋ | 129297/478625 [00:04<00:11, 31498.54it/s][A
+
32%|███▏ | 154447/478625 [00:04<00:10, 32089.76it/s][A
+
26%|██▌ | 124401/478625 [00:03<00:11, 30814.92it/s][A
+
55%|█████▍ | 262364/478625 [00:08<00:06, 31495.14it/s][A
+
28%|██▊ | 132600/478625 [00:04<00:11, 31378.33it/s][A
+
33%|███▎ | 157791/478625 [00:04<00:09, 32487.79it/s][A
+
27%|██▋ | 127633/478625 [00:04<00:11, 31255.24it/s][A
+
55%|█████▌ | 265518/478625 [00:08<00:06, 31399.44it/s][A
+
28%|██▊ | 135971/478625 [00:04<00:10, 32049.71it/s][A
+
34%|███▎ | 161043/478625 [00:05<00:09, 31893.77it/s][A
+
27%|██▋ | 130842/478625 [00:04<00:11, 31500.06it/s][A
+
56%|█████▌ | 268661/478625 [00:08<00:06, 30926.64it/s][A
+
29%|██▉ | 139311/478625 [00:04<00:10, 32443.75it/s][A
+
34%|███▍ | 164260/478625 [00:05<00:09, 31972.70it/s][A
+
28%|██▊ | 133996/478625 [00:04<00:11, 31050.01it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
57%|█████▋ | 271936/478625 [00:08<00:06, 31460.68it/s][A
+
30%|██▉ | 142561/478625 [00:04<00:10, 31995.14it/s][A
+
35%|███▍ | 167461/478625 [00:05<00:09, 31603.26it/s][A
+
29%|██▊ | 137224/478625 [00:04<00:10, 31411.40it/s][A
+
1%| | 3110/478625 [00:00<00:15, 31091.50it/s][A
+
58%|█████▊ | 275216/478625 [00:08<00:06, 31855.51it/s][A
+
30%|███ | 145878/478625 [00:04<00:10, 32336.79it/s][A
+
36%|███▌ | 170740/478625 [00:05<00:09, 31950.30it/s][A
+
29%|██▉ | 140451/478625 [00:04<00:10, 31664.37it/s][A
+
1%|▏ | 6220/478625 [00:00<00:15, 30972.00it/s][A
+
58%|█████▊ | 278405/478625 [00:08<00:06, 31291.70it/s][A
+
31%|███ | 149228/478625 [00:04<00:10, 32679.27it/s][A
+
36%|███▋ | 174055/478625 [00:05<00:09, 32303.23it/s][A
+
30%|███ | 143620/478625 [00:04<00:10, 31017.04it/s][A
+
2%|▏ | 9426/478625 [00:00<00:14, 31465.14it/s][A
+
59%|█████▉ | 281686/478625 [00:09<00:06, 31735.65it/s][A
+
32%|███▏ | 152499/478625 [00:04<00:10, 32096.73it/s][A
+
37%|███▋ | 177288/478625 [00:05<00:09, 31816.97it/s][A
+
31%|███ | 146810/478625 [00:04<00:10, 31275.65it/s][A
+
3%|▎ | 12705/478625 [00:00<00:14, 31980.78it/s][A
+
60%|█████▉ | 284864/478625 [00:09<00:06, 31253.97it/s][A
+
33%|███▎ | 155850/478625 [00:04<00:09, 32510.97it/s][A
+
38%|███▊ | 180554/478625 [00:05<00:09, 32063.38it/s][A
+
31%|███▏ | 149941/478625 [00:04<00:10, 30831.25it/s][A
+
3%|▎ | 15904/478625 [00:00<00:14, 31325.94it/s][A
+
60%|██████ | 287993/478625 [00:09<00:06, 31209.91it/s][A
+
33%|███▎ | 159105/478625 [00:04<00:09, 32097.39it/s][A
+
38%|███▊ | 183763/478625 [00:05<00:09, 31566.06it/s][A
+
32%|███▏ | 153153/478625 [00:04<00:10, 31207.23it/s][A
+
4%|▍ | 19201/478625 [00:00<00:14, 31873.17it/s][A
+
61%|██████ | 291255/478625 [00:09<00:05, 31623.82it/s][A
+
34%|███▍ | 162318/478625 [00:05<00:09, 31637.90it/s][A
+
39%|███▉ | 187052/478625 [00:05<00:09, 31954.52it/s][A
+
33%|███▎ | 156423/478625 [00:05<00:10, 31647.70it/s][A
+
5%|▍ | 22487/478625 [00:00<00:14, 32189.48it/s][A
+
62%|██████▏ | 294420/478625 [00:09<00:05, 30930.87it/s][A
+
35%|███▍ | 165666/478625 [00:05<00:09, 32167.04it/s][A
+
40%|███▉ | 190365/478625 [00:05<00:08, 32301.19it/s][A
+
33%|███▎ | 159591/478625 [00:05<00:10, 31168.09it/s][A
+
5%|▌ | 25708/478625 [00:00<00:14, 31614.02it/s][A
+
62%|██████▏ | 297674/478625 [00:09<00:05, 31400.43it/s][A
+
35%|███▌ | 168886/478625 [00:05<00:09, 31804.54it/s][A
+
40%|████ | 193598/478625 [00:06<00:08, 31806.54it/s][A
+
34%|███▍ | 162860/478625 [00:05<00:09, 31609.86it/s][A
+
6%|▌ | 28873/478625 [00:00<00:14, 31477.71it/s][A
+
63%|██████▎ | 300819/478625 [00:09<00:05, 30954.39it/s][A
+
36%|███▌ | 172225/478625 [00:05<00:09, 32270.30it/s][A
+
41%|████ | 196906/478625 [00:06<00:08, 32179.72it/s][A
+
35%|███▍ | 166172/478625 [00:05<00:09, 32056.58it/s][A
+
7%|▋ | 32023/478625 [00:01<00:14, 31128.63it/s][A
+
64%|██████▎ | 304088/478625 [00:09<00:05, 31462.57it/s][A
+
37%|███▋ | 175455/478625 [00:05<00:09, 31872.78it/s][A
+
42%|████▏ | 200135/478625 [00:06<00:08, 31683.90it/s][A
+
35%|███▌ | 169381/478625 [00:05<00:09, 31528.05it/s][A
+
7%|▋ | 35285/478625 [00:01<00:14, 31575.30it/s][A
+
64%|██████▍ | 307238/478625 [00:09<00:05, 31290.54it/s][A
+
37%|███▋ | 178797/478625 [00:05<00:09, 32327.59it/s][A
+
43%|████▎ | 203444/478625 [00:06<00:08, 32095.28it/s][A
+
36%|███▌ | 172675/478625 [00:05<00:09, 31941.28it/s][A
+
8%|▊ | 38551/478625 [00:01<00:13, 31900.42it/s][A
+
65%|██████▍ | 310370/478625 [00:09<00:05, 30879.54it/s][A
+
38%|███▊ | 182073/478625 [00:05<00:09, 32452.84it/s][A
+
43%|████▎ | 206721/478625 [00:06<00:08, 32293.03it/s][A
+
37%|███▋ | 175873/478625 [00:05<00:09, 31409.28it/s][A
+
9%|▊ | 41743/478625 [00:01<00:13, 31381.55it/s][A
+
66%|██████▌ | 313618/478625 [00:10<00:05, 31347.76it/s][A
+
39%|███▊ | 185321/478625 [00:05<00:09, 32028.29it/s][A
+
44%|████▍ | 209953/478625 [00:06<00:08, 31836.39it/s][A
+
37%|███▋ | 179166/478625 [00:05<00:09, 31853.36it/s][A
+
9%|▉ | 45004/478625 [00:01<00:13, 31743.06it/s][A
+
66%|██████▌ | 316881/478625 [00:10<00:05, 31725.50it/s][A
+
39%|███▉ | 188675/478625 [00:05<00:08, 32472.76it/s][A
+
45%|████▍ | 213250/478625 [00:06<00:08, 32169.94it/s][A
+
38%|███▊ | 182417/478625 [00:05<00:09, 32044.79it/s][A
+
10%|█ | 48181/478625 [00:01<00:13, 31566.82it/s][A
+
67%|██████▋ | 320056/478625 [00:10<00:05, 31103.89it/s][A
+
40%|████ | 191925/478625 [00:05<00:09, 31509.42it/s][A
+
45%|████▌ | 216572/478625 [00:06<00:08, 32480.26it/s][A
+
39%|███▉ | 185625/478625 [00:05<00:09, 31530.23it/s][A
+
11%|█ | 51340/478625 [00:01<00:13, 31078.29it/s][A
+
68%|██████▊ | 323312/478625 [00:10<00:04, 31530.23it/s][A
+
41%|████ | 195264/478625 [00:06<00:08, 32054.49it/s][A
+
46%|████▌ | 219823/478625 [00:06<00:08, 31983.92it/s][A
+
39%|███▉ | 188942/478625 [00:06<00:09, 32010.72it/s][A
+
11%|█▏ | 54607/478625 [00:01<00:13, 31545.22it/s][A
+
68%|██████▊ | 326469/478625 [00:10<00:04, 30976.98it/s][A
+
41%|████▏ | 198609/478625 [00:06<00:08, 32463.38it/s][A
+
47%|████▋ | 223157/478625 [00:06<00:07, 32382.05it/s][A
+
40%|████ | 192147/478625 [00:06<00:09, 31497.36it/s][A
+
12%|█▏ | 57765/478625 [00:01<00:13, 31140.28it/s][A
+
69%|██████▉ | 329571/478625 [00:10<00:04, 30742.88it/s][A
+
42%|████▏ | 201861/478625 [00:06<00:08, 31990.39it/s][A
+
47%|████▋ | 226398/478625 [00:07<00:07, 31930.93it/s][A
+
41%|████ | 195422/478625 [00:06<00:08, 31863.67it/s][A
+
13%|█▎ | 61039/478625 [00:01<00:13, 31610.10it/s][A
+
70%|██████▉ | 332808/478625 [00:10<00:04, 31217.09it/s][A
+
43%|████▎ | 205130/478625 [00:06<00:08, 32193.30it/s][A
+
48%|████▊ | 229692/478625 [00:07<00:07, 32225.32it/s][A
+
42%|████▏ | 198724/478625 [00:06<00:08, 32203.46it/s][A
+
13%|█▎ | 64333/478625 [00:02<00:12, 32003.56it/s][A
+
70%|███████ | 335933/478625 [00:10<00:04, 30761.43it/s][A
+
44%|████▎ | 208502/478625 [00:06<00:08, 32643.38it/s][A
+
49%|████▊ | 232940/478625 [00:07<00:07, 32299.31it/s][A
+
42%|████▏ | 201948/478625 [00:06<00:08, 31627.11it/s][A
+
14%|█▍ | 67536/478625 [00:02<00:13, 31478.07it/s][A
+
71%|███████ | 339184/478625 [00:10<00:04, 31274.13it/s][A
+
44%|████▍ | 211770/478625 [00:06<00:08, 32119.07it/s][A
+
49%|████▉ | 236172/478625 [00:07<00:07, 31789.81it/s][A
+
43%|████▎ | 205158/478625 [00:06<00:08, 31764.49it/s][A
+
15%|█▍ | 70687/478625 [00:02<00:13, 31363.32it/s][A
+
72%|███████▏ | 342429/478625 [00:10<00:04, 31620.64it/s][A
+
45%|████▍ | 215100/478625 [00:06<00:08, 32465.31it/s][A
+
50%|█████ | 239457/478625 [00:07<00:07, 32101.11it/s][A
+
44%|████▎ | 208458/478625 [00:06<00:08, 32126.98it/s][A
+
15%|█▌ | 73826/478625 [00:02<00:13, 30966.42it/s][A
+
72%|███████▏ | 345594/478625 [00:11<00:04, 31091.22it/s][A
+
46%|████▌ | 218350/478625 [00:06<00:08, 32041.22it/s][A
+
51%|█████ | 242670/478625 [00:07<00:07, 31600.55it/s][A
+
44%|████▍ | 211674/478625 [00:06<00:08, 31261.50it/s][A
+
16%|█▌ | 77091/478625 [00:02<00:12, 31459.82it/s][A
+
73%|███████▎ | 348707/478625 [00:11<00:04, 30963.18it/s][A
+
46%|████▋ | 221573/478625 [00:06<00:08, 32094.02it/s][A
+
51%|█████▏ | 245976/478625 [00:07<00:07, 32027.55it/s][A
+
45%|████▍ | 214938/478625 [00:06<00:08, 31662.29it/s][A
+
17%|█▋ | 80401/478625 [00:02<00:12, 31942.77it/s][A
+
74%|███████▎ | 351806/478625 [00:11<00:04, 30607.46it/s][A
+
47%|████▋ | 224927/478625 [00:06<00:07, 32521.19it/s][A
+
52%|█████▏ | 249286/478625 [00:07<00:07, 32343.12it/s][A
+
46%|████▌ | 218110/478625 [00:06<00:08, 31234.61it/s][A
+
17%|█▋ | 83598/478625 [00:02<00:12, 31569.32it/s][A
+
74%|███████▍ | 355058/478625 [00:11<00:03, 31167.42it/s][A
+
48%|████▊ | 228182/478625 [00:07<00:07, 32073.90it/s][A
+
53%|█████▎ | 252523/478625 [00:07<00:07, 31737.05it/s][A
+
46%|████▋ | 221369/478625 [00:07<00:08, 31631.03it/s][A
+
18%|█▊ | 86921/478625 [00:02<00:12, 32057.56it/s][A
+
75%|███████▍ | 358306/478625 [00:11<00:03, 31552.49it/s][A
+
48%|████▊ | 231444/478625 [00:07<00:07, 32234.29it/s][A
+
53%|█████▎ | 255846/478625 [00:07<00:06, 32173.62it/s][A
+
47%|████▋ | 224673/478625 [00:07<00:07, 32045.34it/s][A
+
19%|█▉ | 90130/478625 [00:02<00:12, 31869.14it/s][A
+
76%|███████▌ | 361464/478625 [00:11<00:03, 30999.23it/s][A
+
49%|████▉ | 234670/478625 [00:07<00:07, 31834.53it/s][A
+
54%|█████▍ | 259079/478625 [00:08<00:06, 32218.03it/s][A
+
48%|████▊ | 227882/478625 [00:07<00:08, 30940.37it/s][A
+
19%|█▉ | 93319/478625 [00:02<00:12, 31445.94it/s][A
+
76%|███████▌ | 364728/478625 [00:11<00:03, 31479.34it/s][A
+
50%|████▉ | 238042/478625 [00:07<00:07, 32389.11it/s][A
+
55%|█████▍ | 262304/478625 [00:08<00:06, 31667.55it/s][A
+
48%|████▊ | 231080/478625 [00:07<00:07, 31227.21it/s][A
+
20%|██ | 96633/478625 [00:03<00:11, 31943.78it/s][A
+
77%|███████▋ | 367945/478625 [00:11<00:03, 31681.53it/s][A
+
50%|█████ | 241378/478625 [00:07<00:07, 32675.55it/s][A
+
55%|█████▌ | 265609/478625 [00:08<00:06, 32071.58it/s][A
+
49%|████▉ | 234211/478625 [00:07<00:07, 30940.62it/s][A
+
21%|██ | 99830/478625 [00:03<00:11, 31567.03it/s][A
+
78%|███████▊ | 371116/478625 [00:11<00:03, 30406.32it/s][A
+
51%|█████ | 244648/478625 [00:07<00:07, 31597.66it/s][A
+
56%|█████▌ | 268820/478625 [00:08<00:06, 31617.53it/s][A
+
50%|████▉ | 237486/478625 [00:07<00:07, 31468.94it/s][A
+
22%|██▏ | 103135/478625 [00:03<00:11, 32001.16it/s][A
+
78%|███████▊ | 374382/478625 [00:11<00:03, 31057.97it/s][A
+
52%|█████▏ | 247986/478625 [00:07<00:07, 32114.43it/s][A
+
57%|█████▋ | 272141/478625 [00:08<00:06, 32083.93it/s][A
+
50%|█████ | 240776/478625 [00:07<00:07, 31888.08it/s][A
+
22%|██▏ | 106466/478625 [00:03<00:11, 32386.29it/s][A
+
79%|███████▉ | 377499/478625 [00:12<00:03, 30665.59it/s][A
+
52%|█████▏ | 251205/478625 [00:07<00:07, 31752.81it/s][A
+
58%|█████▊ | 275446/478625 [00:08<00:06, 32369.02it/s][A
+
51%|█████ | 243970/478625 [00:07<00:07, 31347.96it/s][A
+
23%|██▎ | 109707/478625 [00:03<00:11, 31847.50it/s][A
+
80%|███████▉ | 380761/478625 [00:12<00:03, 31235.44it/s][A
+
53%|█████▎ | 254548/478625 [00:07<00:06, 32242.73it/s][A
+
58%|█████▊ | 278686/478625 [00:08<00:06, 31791.15it/s][A
+
52%|█████▏ | 247110/478625 [00:07<00:07, 31276.63it/s][A
+
24%|██▎ | 112895/478625 [00:03<00:11, 31822.46it/s][A
+
80%|████████ | 384011/478625 [00:12<00:02, 31606.40it/s][A
+
54%|█████▍ | 257879/478625 [00:08<00:06, 32557.14it/s][A
+
59%|█████▉ | 282001/478625 [00:08<00:06, 32188.82it/s][A
+
52%|█████▏ | 250362/478625 [00:07<00:07, 31640.93it/s][A
+
24%|██▍ | 116080/478625 [00:03<00:11, 31470.56it/s][A
+
81%|████████ | 387178/478625 [00:12<00:02, 31105.35it/s][A
+
55%|█████▍ | 261139/478625 [00:08<00:06, 31908.36it/s][A
+
60%|█████▉ | 285224/478625 [00:08<00:06, 31774.02it/s][A
+
53%|█████▎ | 253529/478625 [00:08<00:07, 31245.82it/s][A
+
25%|██▍ | 119402/478625 [00:03<00:11, 31985.33it/s][A
+
82%|████████▏ | 390294/478625 [00:12<00:02, 30938.54it/s][A
+
55%|█████▌ | 264477/478625 [00:08<00:06, 32338.36it/s][A
+
60%|██████ | 288516/478625 [00:08<00:05, 32108.27it/s][A
+
54%|█████▎ | 256809/478625 [00:08<00:06, 31702.86it/s][A
+
26%|██▌ | 122744/478625 [00:03<00:10, 32407.71it/s][A
+
82%|████████▏ | 393508/478625 [00:12<00:02, 31288.10it/s][A
+
56%|█████▌ | 267716/478625 [00:08<00:06, 31871.65it/s][A
+
61%|██████ | 291730/478625 [00:09<00:05, 32093.01it/s][A
+
54%|█████▍ | 259982/478625 [00:08<00:07, 30970.48it/s][A
+
26%|██▋ | 125987/478625 [00:03<00:11, 31861.23it/s][A
+
83%|████████▎ | 396640/478625 [00:12<00:02, 30484.51it/s][A
+
57%|█████▋ | 271066/478625 [00:08<00:06, 32348.30it/s][A
+
62%|██████▏ | 294942/478625 [00:09<00:05, 31637.14it/s][A
+
55%|█████▌ | 263278/478625 [00:08<00:06, 31549.53it/s][A
+
27%|██▋ | 129303/478625 [00:04<00:10, 32240.89it/s][A
+
84%|████████▎ | 399785/478625 [00:12<00:02, 30764.13it/s][A
+
57%|█████▋ | 274305/478625 [00:08<00:06, 32142.90it/s][A
+
62%|██████▏ | 298231/478625 [00:09<00:05, 32006.19it/s][A
+
56%|█████▌ | 266525/478625 [00:08<00:06, 31819.96it/s][A
+
28%|██▊ | 132530/478625 [00:04<00:10, 32101.25it/s][A
+
84%|████████▍ | 402867/478625 [00:12<00:02, 30183.23it/s][A
+
58%|█████▊ | 277523/478625 [00:08<00:06, 31768.06it/s][A
+
63%|██████▎ | 301441/478625 [00:09<00:05, 31481.18it/s][A
+
56%|█████▋ | 269711/478625 [00:08<00:06, 30806.88it/s][A
+
28%|██▊ | 135743/478625 [00:04<00:10, 31675.77it/s][A
+
85%|████████▍ | 406003/478625 [00:13<00:02, 30523.57it/s][A
+
59%|█████▊ | 280877/478625 [00:08<00:06, 32287.22it/s][A
+
64%|██████▎ | 304765/478625 [00:09<00:05, 31996.31it/s][A
+
57%|█████▋ | 272963/478625 [00:08<00:06, 31302.61it/s][A
+
29%|██▉ | 139054/478625 [00:04<00:10, 32097.79it/s][A
+
85%|████████▌ | 409133/478625 [00:13<00:02, 30750.77it/s][A
+
59%|█████▉ | 284238/478625 [00:08<00:05, 32677.01it/s][A
+
64%|██████▍ | 308066/478625 [00:09<00:05, 32295.05it/s][A
+
58%|█████▊ | 276105/478625 [00:08<00:06, 30940.88it/s][A
+
30%|██▉ | 142267/478625 [00:04<00:10, 31718.11it/s][A
+
86%|████████▌ | 412212/478625 [00:13<00:02, 29608.18it/s][A
+
60%|██████ | 287509/478625 [00:08<00:05, 32150.08it/s][A
+
65%|██████▌ | 311299/478625 [00:09<00:05, 31793.63it/s][A
+
58%|█████▊ | 279361/478625 [00:08<00:06, 31411.55it/s][A
+
30%|███ | 145551/478625 [00:04<00:10, 32045.25it/s][A
+
87%|████████▋ | 415271/478625 [00:13<00:02, 29891.55it/s][A
+
61%|██████ | 290861/478625 [00:09<00:05, 32550.82it/s][A
+
66%|██████▌ | 314598/478625 [00:09<00:05, 32144.75it/s][A
+
59%|█████▉ | 282610/478625 [00:09<00:06, 31726.42it/s][A
+
31%|███ | 148888/478625 [00:04<00:10, 32436.77it/s][A
+
87%|████████▋ | 418390/478625 [00:13<00:01, 30268.21it/s][A
+
61%|██████▏ | 294120/478625 [00:09<00:05, 31826.04it/s][A
+
66%|██████▋ | 317879/478625 [00:09<00:04, 32340.64it/s][A
+
60%|█████▉ | 285787/478625 [00:09<00:06, 31260.64it/s][A
+
32%|███▏ | 152134/478625 [00:04<00:10, 31933.86it/s][A
+
88%|████████▊ | 421424/478625 [00:13<00:01, 29760.32it/s][A
+
62%|██████▏ | 297476/478625 [00:09<00:05, 32332.23it/s][A
+
67%|██████▋ | 321116/478625 [00:10<00:04, 31772.90it/s][A
+
60%|██████ | 288918/478625 [00:09<00:06, 31158.16it/s][A
+
32%|███▏ | 155331/478625 [00:04<00:10, 31810.74it/s][A
+
89%|████████▊ | 424561/478625 [00:13<00:01, 30228.55it/s][A
+
63%|██████▎ | 300812/478625 [00:09<00:05, 32634.00it/s][A
+
68%|██████▊ | 324435/478625 [00:10<00:04, 32188.94it/s][A
+
61%|██████ | 292106/478625 [00:09<00:05, 31369.89it/s][A
+
33%|███▎ | 158514/478625 [00:04<00:10, 31567.58it/s][A
+
89%|████████▉ | 427590/478625 [00:13<00:01, 29698.97it/s][A
+
64%|██████▎ | 304080/478625 [00:09<00:05, 31650.47it/s][A
+
68%|██████▊ | 327658/478625 [00:10<00:04, 31634.94it/s][A
+
62%|██████▏ | 295246/478625 [00:09<00:05, 30990.05it/s][A
+
34%|███▍ | 161790/478625 [00:05<00:09, 31918.44it/s][A
+
90%|████████▉ | 430650/478625 [00:13<00:01, 29959.75it/s][A
+
64%|██████▍ | 307415/478625 [00:09<00:05, 32143.26it/s][A
+
69%|██████▉ | 330880/478625 [00:10<00:04, 31805.28it/s][A
+
62%|██████▏ | 298538/478625 [00:09<00:05, 31557.92it/s][A
+
34%|███▍ | 165110/478625 [00:05<00:09, 32297.81it/s][A
+
91%|█████████ | 433769/478625 [00:13<00:01, 30319.99it/s][A
+
65%|██████▍ | 310637/478625 [00:09<00:05, 31738.82it/s][A
+
70%|██████▉ | 334200/478625 [00:10<00:04, 32216.02it/s][A
+
63%|██████▎ | 301697/478625 [00:09<00:05, 31126.63it/s][A
+
35%|███▌ | 168342/478625 [00:05<00:09, 31838.01it/s][A
+
91%|█████████▏| 436805/478625 [00:14<00:01, 29938.09it/s][A
+
66%|██████▌ | 313982/478625 [00:09<00:05, 32237.55it/s][A
+
70%|███████ | 337425/478625 [00:10<00:04, 31553.73it/s][A
+
64%|██████▎ | 304970/478625 [00:09<00:05, 31595.81it/s][A
+
36%|███▌ | 171648/478625 [00:05<00:09, 32197.81it/s][A
+
92%|█████████▏| 439962/478625 [00:14<00:01, 30415.41it/s][A
+
66%|██████▋ | 317318/478625 [00:09<00:04, 32567.55it/s][A
+
71%|███████ | 340785/478625 [00:10<00:04, 32151.83it/s][A
+
64%|██████▍ | 308248/478625 [00:09<00:05, 31945.76it/s][A
+
37%|███▋ | 174870/478625 [00:05<00:09, 31245.67it/s][A
+
93%|█████████▎| 443142/478625 [00:14<00:01, 30823.88it/s][A
+
67%|██████▋ | 320579/478625 [00:09<00:04, 32026.95it/s][A
+
72%|███████▏ | 344005/478625 [00:10<00:04, 31661.40it/s][A
+
65%|██████▌ | 311445/478625 [00:09<00:05, 30934.59it/s][A
+
+[interleaved tqdm redraws from the 8 data-loader ranks trimmed: each rank scans 478625 raw items at ~30-32k it/s, finishing its bar in ~15 s and the enclosing 1/1 bar in ~22 s, then reports "time ~22 s", "n_elements: 474899", "data length: 474899"; representative final lines kept below]
+100%|██████████| 478625/478625 [00:15<00:00, 31812.50it/s]
+100%|██████████| 1/1 [00:21<00:00, 21.68s/it]
+time 22.3456609249115
+n_elements: 474899
+data length: 474899
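+The three lines above close each rank's dataset-index build: 478625 raw annotations are scanned and 474899 survive filtering. Below is a minimal sketch of a scan loop that would produce this exact output, assuming a hypothetical is_usable() predicate; the real filter in the training script (resolution, clip length, caption validity) is repo-internal and not shown here.
+
+    import time
+    from tqdm import tqdm
+
+    def is_usable(item):
+        # Placeholder predicate; the actual checks are repo-internal.
+        return item is not None
+
+    def build_index(raw_items):
+        t0 = time.time()
+        kept = [item for item in tqdm(raw_items) if is_usable(item)]  # the 478625-item bar
+        print("time", time.time() - t0)   # ~22 s per rank, as above
+        print("n_elements:", len(kept))   # 474899
+        print("data length:", len(kept))
+        return kept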
+[2024-09-18 23:08:59,077] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2024-09-18 23:08:59,081] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2024-09-18 23:08:59,081] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+zp rank is 7, zp_size=8
+zp rank is 1, zp_size=8
+zp rank is 5, zp_size=8
+zp rank is 6, zp_size=8
+zp rank is 2, zp_size=8
+zp rank is 3, zp_size=8
+zp rank is 4, zp_size=8
+[2024-09-18 23:08:59,253] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
+[2024-09-18 23:08:59,253] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=.NewCls'>
+[2024-09-18 23:08:59,254] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
+[2024-09-18 23:08:59,254] [INFO] [stage_1_and_2.py:173:__init__] Reduce bucket size 536870912
+[2024-09-18 23:08:59,254] [INFO] [stage_1_and_2.py:174:__init__] Allgather bucket size 536870912
+[2024-09-18 23:08:59,254] [INFO] [stage_1_and_2.py:175:__init__] CPU Offload: False
+[2024-09-18 23:08:59,254] [INFO] [stage_1_and_2.py:176:__init__] Round robin gradient partitioning: False
+zp rank is 0, zp_size=8
+[2024-09-18 23:09:05,545] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
+[2024-09-18 23:09:05,546] [INFO] [utils.py:792:see_memory_usage] MA 17.78 GB Max_MA 18.44 GB CA 18.78 GB Max_CA 19 GB
+[2024-09-18 23:09:05,547] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 238.8 GB, percent = 15.8%
+[2024-09-18 23:09:07,644] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
+[2024-09-18 23:09:07,646] [INFO] [utils.py:792:see_memory_usage] MA 20.41 GB Max_MA 24.35 GB CA 25.36 GB Max_CA 25 GB
+[2024-09-18 23:09:07,646] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 238.8 GB, percent = 15.8%
+[2024-09-18 23:09:07,646] [INFO] [stage_1_and_2.py:552:__init__] optimizer state initialized
+[2024-09-18 23:09:09,534] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
+[2024-09-18 23:09:09,535] [INFO] [utils.py:792:see_memory_usage] MA 20.41 GB Max_MA 20.41 GB CA 25.36 GB Max_CA 25 GB
+[2024-09-18 23:09:09,535] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 233.49 GB, percent = 15.5%
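+The see_memory_usage numbers above are consistent with a rough ZeRO-2 accounting for ~2.82 B parameters in bf16 across 8 ranks. This is only a back-of-envelope check, not an exact model of DeepSpeed's allocations; comm buckets, activations, and allocator fragmentation make up the remainder.
+
+    N = 2.8204808e9                     # trainable parameters (reported later in this log)
+    GiB = 1024 ** 3
+    weights_bf16 = 2 * N / GiB          # ~5.3 GiB, replicated on every rank
+    grads_bf16 = 2 * N / GiB            # ~5.3 GiB, replicated, reduced in buckets
+    opt_fp32_shard = 12 * N / 8 / GiB   # fp32 master + two Adam moments, sharded: ~3.9 GiB
+    print(weights_bf16 + grads_bf16 + opt_fp32_shard)  # ~14.5 GiB vs. MA 20.41 GB observed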
+[2024-09-18 23:09:09,544] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
+[2024-09-18 23:09:09,544] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
+[2024-09-18 23:09:09,544] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
+[2024-09-18 23:09:09,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[(0.9, 0.999)]
+[2024-09-18 23:09:09,547] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] activation_checkpointing_config {
+ "partition_activations": false,
+ "contiguous_memory_optimization": false,
+ "cpu_checkpointing": false,
+ "number_checkpoints": null,
+ "synchronize_checkpoint_boundary": false,
+ "profile": false
+}
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] amp_enabled .................. False
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] amp_params ................... False
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] autotuning_config ............ {
+ "enabled": false,
+ "start_step": null,
+ "end_step": null,
+ "metric_path": null,
+ "arg_mappings": null,
+ "metric": "throughput",
+ "model_info": null,
+ "results_dir": "autotuning_results",
+ "exps_dir": "autotuning_exps",
+ "overwrite": true,
+ "fast": true,
+ "start_profile_step": 3,
+ "end_profile_step": 5,
+ "tuner_type": "gridsearch",
+ "tuner_early_stopping": 5,
+ "tuner_num_trials": 50,
+ "model_info_path": null,
+ "mp_size": 1,
+ "max_train_batch_size": null,
+ "min_train_batch_size": 1,
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+ "min_train_micro_batch_size_per_gpu": 1,
+ "num_tuning_micro_batch_sizes": 3
+}
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] bfloat16_enabled ............. True
+[2024-09-18 23:09:09,548] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] comms_config .................
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] communication_data_type ...... torch.float32
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] curriculum_params_legacy ..... False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] data_efficiency_enabled ...... False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] dataloader_drop_last ......... False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] disable_allgather ............ False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] dump_state ................... False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_enabled ........... False
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06
+[2024-09-18 23:09:09,549] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] eigenvalue_verbose ........... False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] elasticity_enabled ........... False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] flops_profiler_config ........ {
+ "enabled": false,
+ "recompute_fwd_factor": 0.0,
+ "profile_step": 1,
+ "module_depth": -1,
+ "top_modules": 1,
+ "detailed": true,
+ "output_file": null
+}
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] fp16_auto_cast ............... None
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] fp16_enabled ................. False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] global_rank .................. 0
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] grad_accum_dtype ............. None
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] gradient_accumulation_steps .. 1
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] gradient_clipping ............ 1.0
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] graph_harvesting ............. False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] load_universal_checkpoint .... False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] loss_scale ................... 1.0
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] memory_breakdown ............. False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] mics_hierarchial_params_gather False
+[2024-09-18 23:09:09,550] [INFO] [config.py:988:print] mics_shard_size .............. -1
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] nebula_config ................ {
+ "enabled": false,
+ "persistent_storage_path": null,
+ "persistent_time_interval": 100,
+ "num_of_version_in_retention": 2,
+ "enable_nebula_load": true,
+ "load_path": null
+}
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] optimizer_name ............... None
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] optimizer_params ............. None
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] pld_enabled .................. False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] pld_params ................... False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] prescale_gradients ........... False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] scheduler_name ............... None
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] scheduler_params ............. None
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] sparse_attention ............. None
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] steps_per_print .............. inf
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] train_batch_size ............. 8
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 1
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False
+[2024-09-18 23:09:09,551] [INFO] [config.py:988:print] use_node_local_storage ....... False
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] wall_clock_breakdown ......... False
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] weight_quantization_config ... None
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] world_size ................... 8
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] zero_allow_untested_optimizer True
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=536870912 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=536870912 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] zero_enabled ................. True
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True
+[2024-09-18 23:09:09,552] [INFO] [config.py:988:print] zero_optimization_stage ...... 2
+[2024-09-18 23:09:09,552] [INFO] [config.py:974:print_user_config] json = {
+ "fp16": {
+ "enabled": false,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "communication_data_type": "fp32",
+ "gradient_clipping": 1.0,
+ "train_micro_batch_size_per_gpu": 1,
+ "train_batch_size": 8,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {
+ "stage": 2,
+ "overlap_comm": true,
+ "allgather_bucket_size": 5.368709e+08,
+ "contiguous_gradients": true,
+ "reduce_bucket_size": 5.368709e+08
+ },
+ "steps_per_print": inf,
+ "zero_allow_untested_optimizer": true
+}
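+The run itself is driven through Accelerate, but the user config printed above is a plain DeepSpeed JSON. A minimal sketch feeding the same dict to deepspeed.initialize with a stand-in model (not the script's actual wiring) looks like this:
+
+    import deepspeed
+    import torch
+
+    ds_config = {
+        "bf16": {"enabled": True},
+        "communication_data_type": "fp32",
+        "gradient_clipping": 1.0,
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": 8,  # 8 ranks x micro-batch 1 x grad-accum 1
+        "gradient_accumulation_steps": 1,
+        "zero_optimization": {
+            "stage": 2,
+            "overlap_comm": True,
+            "contiguous_gradients": True,
+            "allgather_bucket_size": 536870912,
+            "reduce_bucket_size": 536870912,
+        },
+        "zero_allow_untested_optimizer": True,
+    }
+
+    model = torch.nn.Linear(8, 8)  # stand-in for OpenSoraInpaint
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
+    # Run under the `deepspeed` launcher so the distributed env vars are set.
+    engine, optimizer, _, _ = deepspeed.initialize(
+        model=model, optimizer=optimizer, config=ds_config
+    )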
+09/18/2024 23:09:09 - INFO - __main__ - after accelerator.prepare
+[per-rank sampler logs and tqdm spinner characters trimmed: each of the 8 ranks prints a handful of "[] -> [164918]" / "[] -> [184079]" / "[] -> [195210]" bucket remaps followed by the block below]
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
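+The counts above check out: 474899 kept samples grouped into world-batch-sized megabatches of 8 give ceil(474899 / 8) = 59363, the shuffled_megabatches value every rank prints. A sketch of the counting only; the sampler's actual bucket remapping (the "[] -> [...]" lines) is repo-internal.
+
+    import math
+    import random
+
+    n_samples, world_batch = 474899, 8  # 8 ranks x micro-batch 1 x grad-accum 1
+    indices = list(range(n_samples))
+    megabatches = [indices[i:i + world_batch]
+                   for i in range(0, n_samples, world_batch)]
+    random.shuffle(megabatches)         # shuffle at megabatch granularity
+    assert len(megabatches) == math.ceil(n_samples / world_batch) == 59363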
09/18/2024 23:09:44 - INFO - __main__ - init trackers...
+[W VariableFallbackKernel.cpp:51] Warning: CAUTION: The operator 'torchvision::nms' is not currently supported on the NPU backend and will fall back to run on the CPU. This may have performance implications. (function npu_cpu_fallback)
+[the warning above repeats once per rank; duplicates and spinner residue trimmed]
+wandb: Currently logged in as: pkuhxy (pkuhxy-Peking University). Use `wandb login --relogin` to force relogin
+wandb: Waiting for wandb.init()...
+wandb: wandb version 0.18.1 is available! To upgrade, please run:
+wandb: $ pip install wandb --upgrade
+wandb: Tracking run with wandb version 0.16.3
+wandb: Run data is saved locally in /home/image_data/hxy/Open-Sora-Plan/wandb/run-20240918_230950-ku7b33wt
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run rose-voice-1
+wandb: ⭐️ View project at https://wandb.ai/pkuhxy-Peking%20University/allinpaint_stage1_2
+wandb: 🚀 View run at https://wandb.ai/pkuhxy-Peking%20University/allinpaint_stage1_2/runs/ku7b33wt
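+The "init trackers..." line followed by this wandb handshake matches Accelerate's tracker API. A minimal sketch of such an initialization, assuming a subset of the run config (the training script's exact call is not shown in this log):
+
+    from accelerate import Accelerator
+
+    accelerator = Accelerator(log_with="wandb")
+    accelerator.init_trackers(
+        project_name="allinpaint_stage1_2",          # project shown in the URLs above
+        config={"lr": 1e-5, "train_batch_size": 8},  # assumed subset of the run config
+    )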
+09/18/2024 23:09:52 - INFO - __main__ - ***** Running training *****
+09/18/2024 23:09:52 - INFO - __main__ - Model = DeepSpeedEngine(
+ (module): OpenSoraInpaint(
+ (pos_embed): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (transformer_blocks): ModuleList(
+ (0-31): 32 x BasicTransformerBlock(
+ (norm1): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn1): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (norm2): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn2): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (ff): FeedForward(
+ (net): ModuleList(
+ (0): GELU(
+ (proj): Linear(in_features=2304, out_features=9216, bias=True)
+ )
+ (1): Dropout(p=0.0, inplace=False)
+ (2): Linear(in_features=9216, out_features=2304, bias=True)
+ )
+ )
+ )
+ )
+ (norm_out): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (proj_out): Linear(in_features=2304, out_features=32, bias=True)
+ (adaln_single): AdaLayerNormSingle(
+ (emb): PixArtAlphaCombinedTimestepSizeEmbeddings(
+ (time_proj): Timesteps()
+ (timestep_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (caption_projection): PixArtAlphaTextProjection(
+ (linear_1): Linear(in_features=4096, out_features=2304, bias=True)
+ (act_1): GELU(approximate='tanh')
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ (motion_projection): MotionAdaLayerNormSingle(
+ (emb): MotionEmbeddings(
+ (motion_proj): Timesteps()
+ (motion_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (pos_embed_mask): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(4, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ (pos_embed_masked_hidden_states): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ )
+)
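+The dumped architecture lets us sanity-check the parameter count reported below: 32 blocks of width 2304, each with paired self-/cross-attention and a 4x FeedForward, dominate the total. A back-of-envelope count (biases, the patch-embed convolutions, and proj_out contribute the last ~0.002 B):
+
+    d, ff, blocks = 2304, 9216, 32
+    attn = 4 * d * d                             # to_q/k/v and to_out
+    block = 2 * attn + 2 * d * ff                # attn1 + attn2 + FeedForward
+    extras = (2 * (d * 13824 + 256 * d + d * d)  # adaln_single + motion_projection
+              + 4096 * d + d * d                 # caption_projection
+              + 2 * d * d)                       # pos_embed_mask(ed) linears
+    print((blocks * block + extras) / 1e9)       # ~2.819, vs. 2.8204808 B with biases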
+09/18/2024 23:09:52 - INFO - __main__ - Num examples = 474899
+09/18/2024 23:09:52 - INFO - __main__ - Num Epochs = 17
+09/18/2024 23:09:52 - INFO - __main__ - Instantaneous batch size per device = 1
+09/18/2024 23:09:52 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8
+09/18/2024 23:09:52 - INFO - __main__ - Gradient Accumulation steps = 1
+09/18/2024 23:09:52 - INFO - __main__ - Total optimization steps = 1000000
+09/18/2024 23:09:52 - INFO - __main__ - Total optimization steps (num_update_steps_per_epoch) = 59362
+09/18/2024 23:09:52 - INFO - __main__ - Total trainable parameters = 2.8204808 B
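+The epoch arithmetic above is internally consistent: with 474899 examples and a world batch of 8, one epoch is 59362 full optimizer steps, so 1,000,000 total steps spans 17 epochs.
+
+    import math
+
+    num_examples, world_batch = 474899, 8
+    steps_per_epoch = num_examples // world_batch        # 59362, as logged
+    num_epochs = math.ceil(1_000_000 / steps_per_epoch)  # 17, as logged
+    print(steps_per_epoch, num_epochs)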
Steps: 0%| | 0/1000000 [00:00, ?it/s]
+[startup residue trimmed: the remaining ranks emit the same torchvision::nms NPU-fallback warning, their "[] -> [195210]" / shuffled_megabatches 59363 sampler logs, and tqdm spinner characters]
Warning: Device do not support double dtype now, dtype cast repalce with float.
+Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='npu') to create tensors.
+[each torch_npu warning above is emitted once per rank, 8x in total; duplicates trimmed]
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations
+  warnings.warn(
+[FutureWarning emitted once per rank, 8x in total; duplicates trimmed]
+[per-step tqdm redraws and spacer lines trimmed; steps 1-79 each log one "[RANK-0]" line as below. local_loss mostly sits in 0.03-0.35 with early spikes at step 3 (2.52), step 4 (0.923), step 18 (0.836), and step 24 (0.52); train_loss shows one anomalous value of 25.81 at step 48; time_cost per step ranges ~1.2-7.4 s and the ETA settles around 7-12 s/it. First, anomalous, and last steps kept:]
+Steps: 0%| | 1/1000000 [00:57<16087:54:16, 57.92s/it][RANK-0]: Step: [1], local_loss=0.08541805297136307, train_loss=0.1792256087064743, time_cost=4.692285060882568
+Steps: 0%| | 48/1000000 [08:24<2141:19:26, 7.71s/it, lr=1e-5, step_loss=0.0671][RANK-0]: Step: [48], local_loss=0.06668698042631149, train_loss=25.811246871948242, time_cost=2.1316609382629395
+Steps: 0%| | 79/1000000 [12:40<2099:54:30, 7.56s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [79], local_loss=0.06016189232468605, train_loss=0.10781514644622803, time_cost=2.704425096511841
Steps: 0%| | 80/1000000 [12:54<2573:07:14, 9.26s/it, lr=1e-5, step_loss=0.0602][RANK-0]: Step: [80], local_loss=0.3742271065711975, train_loss=9.159887313842773, time_cost=1.1962306499481201
+
Steps: 0%| | 80/1000000 [12:54<2573:07:14, 9.26s/it, lr=1e-5, step_loss=0.374]
Steps: 0%| | 81/1000000 [13:05<2746:58:24, 9.89s/it, lr=1e-5, step_loss=0.374][RANK-0]: Step: [81], local_loss=0.09291175752878189, train_loss=0.06482996046543121, time_cost=3.716813802719116
+
Steps: 0%| | 81/1000000 [13:05<2746:58:24, 9.89s/it, lr=1e-5, step_loss=0.0929]
Steps: 0%| | 82/1000000 [13:14<2672:10:43, 9.62s/it, lr=1e-5, step_loss=0.0929][RANK-0]: Step: [82], local_loss=203.98890686035156, train_loss=25.564266204833984, time_cost=1.2120110988616943
+
Steps: 0%| | 82/1000000 [13:14<2672:10:43, 9.62s/it, lr=1e-5, step_loss=204]
Steps: 0%| | 83/1000000 [13:23<2638:54:58, 9.50s/it, lr=1e-5, step_loss=204][RANK-0]: Step: [83], local_loss=0.06255728006362915, train_loss=0.1039259135723114, time_cost=1.6436681747436523
+
Steps: 0%| | 83/1000000 [13:23<2638:54:58, 9.50s/it, lr=1e-5, step_loss=0.0626]
Steps: 0%| | 84/1000000 [13:36<2961:50:55, 10.66s/it, lr=1e-5, step_loss=0.0626][RANK-0]: Step: [84], local_loss=0.09482987225055695, train_loss=0.10542532801628113, time_cost=1.2911596298217773
+
Steps: 0%| | 84/1000000 [13:37<2961:50:55, 10.66s/it, lr=1e-5, step_loss=0.0948]
Steps: 0%| | 85/1000000 [13:43<2645:12:53, 9.52s/it, lr=1e-5, step_loss=0.0948][RANK-0]: Step: [85], local_loss=0.09102194756269455, train_loss=0.08268599212169647, time_cost=2.5986239910125732
+
Steps: 0%| | 85/1000000 [13:43<2645:12:53, 9.52s/it, lr=1e-5, step_loss=0.091]
Steps: 0%| | 86/1000000 [13:55<2817:19:24, 10.14s/it, lr=1e-5, step_loss=0.091][RANK-0]: Step: [86], local_loss=0.08665219694375992, train_loss=0.07165205478668213, time_cost=1.318044900894165
+
Steps: 0%| | 86/1000000 [13:55<2817:19:24, 10.14s/it, lr=1e-5, step_loss=0.0867]
Steps: 0%| | 87/1000000 [14:02<2559:37:48, 9.22s/it, lr=1e-5, step_loss=0.0867][RANK-0]: Step: [87], local_loss=0.06296861916780472, train_loss=0.0741170346736908, time_cost=2.757246971130371
+
Steps: 0%| | 87/1000000 [14:02<2559:37:48, 9.22s/it, lr=1e-5, step_loss=0.063]
Steps: 0%| | 88/1000000 [14:11<2507:51:51, 9.03s/it, lr=1e-5, step_loss=0.063][RANK-0]: Step: [88], local_loss=0.07476916909217834, train_loss=0.06635011732578278, time_cost=2.5909054279327393
+
Steps: 0%| | 88/1000000 [14:11<2507:51:51, 9.03s/it, lr=1e-5, step_loss=0.0748]
Steps: 0%| | 89/1000000 [14:16<2168:37:47, 7.81s/it, lr=1e-5, step_loss=0.0748][RANK-0]: Step: [89], local_loss=0.07749442011117935, train_loss=0.07529357075691223, time_cost=1.178596019744873
+
Steps: 0%| | 89/1000000 [14:16<2168:37:47, 7.81s/it, lr=1e-5, step_loss=0.0775]
Steps: 0%| | 90/1000000 [14:22<2095:14:51, 7.54s/it, lr=1e-5, step_loss=0.0775][RANK-0]: Step: [90], local_loss=0.06439533829689026, train_loss=0.07064840197563171, time_cost=2.837360382080078
+
Steps: 0%| | 90/1000000 [14:22<2095:14:51, 7.54s/it, lr=1e-5, step_loss=0.0644]
Steps: 0%| | 91/1000000 [14:30<2112:10:11, 7.60s/it, lr=1e-5, step_loss=0.0644][RANK-0]: Step: [91], local_loss=0.04792582243680954, train_loss=0.056744202971458435, time_cost=3.23382306098938
+
Steps: 0%| | 91/1000000 [14:30<2112:10:11, 7.60s/it, lr=1e-5, step_loss=0.0479]
Steps: 0%| | 92/1000000 [14:35<1874:55:34, 6.75s/it, lr=1e-5, step_loss=0.0479][RANK-0]: Step: [92], local_loss=0.051478419452905655, train_loss=0.07734055817127228, time_cost=2.1358184814453125
+
Steps: 0%| | 92/1000000 [14:35<1874:55:34, 6.75s/it, lr=1e-5, step_loss=0.0515]
Steps: 0%| | 93/1000000 [14:41<1841:18:43, 6.63s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [93], local_loss=0.10731276124715805, train_loss=0.20009735226631165, time_cost=2.3525986671447754
+
Steps: 0%| | 93/1000000 [14:41<1841:18:43, 6.63s/it, lr=1e-5, step_loss=0.107]
Steps: 0%| | 94/1000000 [14:46<1663:19:41, 5.99s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [94], local_loss=0.06169046834111214, train_loss=0.05064883083105087, time_cost=1.9133920669555664
+
Steps: 0%| | 94/1000000 [14:46<1663:19:41, 5.99s/it, lr=1e-5, step_loss=0.0617]
Steps: 0%| | 95/1000000 [14:54<1807:55:40, 6.51s/it, lr=1e-5, step_loss=0.0617][RANK-0]: Step: [95], local_loss=0.09135406464338303, train_loss=0.09472808986902237, time_cost=1.7352123260498047
+
Steps: 0%| | 95/1000000 [14:54<1807:55:40, 6.51s/it, lr=1e-5, step_loss=0.0914]
Steps: 0%| | 96/1000000 [15:08<2501:45:07, 9.01s/it, lr=1e-5, step_loss=0.0914][RANK-0]: Step: [96], local_loss=0.0874752402305603, train_loss=0.07904146611690521, time_cost=7.436732292175293
+
Steps: 0%| | 96/1000000 [15:08<2501:45:07, 9.01s/it, lr=1e-5, step_loss=0.0875]
Steps: 0%| | 97/1000000 [15:16<2407:19:24, 8.67s/it, lr=1e-5, step_loss=0.0875][RANK-0]: Step: [97], local_loss=0.05810268968343735, train_loss=0.08176260441541672, time_cost=3.0218610763549805
+
Steps: 0%| | 97/1000000 [15:16<2407:19:24, 8.67s/it, lr=1e-5, step_loss=0.0581]
Steps: 0%| | 98/1000000 [15:24<2324:21:56, 8.37s/it, lr=1e-5, step_loss=0.0581][RANK-0]: Step: [98], local_loss=0.06572820246219635, train_loss=0.06979584693908691, time_cost=2.5873327255249023
+
Steps: 0%| | 98/1000000 [15:24<2324:21:56, 8.37s/it, lr=1e-5, step_loss=0.0657]
Steps: 0%| | 99/1000000 [15:30<2107:13:55, 7.59s/it, lr=1e-5, step_loss=0.0657][RANK-0]: Step: [99], local_loss=0.03115866519510746, train_loss=0.08578327298164368, time_cost=1.4003331661224365
+
Steps: 0%| | 99/1000000 [15:30<2107:13:55, 7.59s/it, lr=1e-5, step_loss=0.0312]\
-
\
\
\
\
\
\
Steps: 0%| | 100/1000000 [15:35<1878:14:34, 6.76s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [100], local_loss=0.07717832922935486, train_loss=0.08418139815330505, time_cost=2.637701988220215
+
Steps: 0%| | 100/1000000 [15:35<1878:14:34, 6.76s/it, lr=1e-5, step_loss=0.0772]
Steps: 0%| | 101/1000000 [15:44<2082:21:13, 7.50s/it, lr=1e-5, step_loss=0.0772][RANK-0]: Step: [101], local_loss=0.07332826405763626, train_loss=0.05771113187074661, time_cost=2.927849531173706
+
Steps: 0%| | 101/1000000 [15:44<2082:21:13, 7.50s/it, lr=1e-5, step_loss=0.0733]
Steps: 0%| | 102/1000000 [15:51<2063:23:15, 7.43s/it, lr=1e-5, step_loss=0.0733][RANK-0]: Step: [102], local_loss=0.06284020841121674, train_loss=0.0897960513830185, time_cost=1.2414300441741943
+
Steps: 0%| | 102/1000000 [15:51<2063:23:15, 7.43s/it, lr=1e-5, step_loss=0.0628]
Steps: 0%| | 103/1000000 [16:00<2199:59:46, 7.92s/it, lr=1e-5, step_loss=0.0628][RANK-0]: Step: [103], local_loss=0.061199869960546494, train_loss=0.05285019800066948, time_cost=1.549764633178711
+
Steps: 0%| | 103/1000000 [16:00<2199:59:46, 7.92s/it, lr=1e-5, step_loss=0.0612]
Steps: 0%| | 104/1000000 [16:10<2339:28:32, 8.42s/it, lr=1e-5, step_loss=0.0612][RANK-0]: Step: [104], local_loss=0.047108229249715805, train_loss=0.09882444143295288, time_cost=1.5334267616271973
+
Steps: 0%| | 104/1000000 [16:10<2339:28:32, 8.42s/it, lr=1e-5, step_loss=0.0471]
Steps: 0%| | 105/1000000 [16:14<1987:01:21, 7.15s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [105], local_loss=0.0552029013633728, train_loss=0.1225072368979454, time_cost=1.305757999420166
+
Steps: 0%| | 105/1000000 [16:14<1987:01:21, 7.15s/it, lr=1e-5, step_loss=0.0552]
Steps: 0%| | 106/1000000 [16:25<2341:04:43, 8.43s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [106], local_loss=0.14509423077106476, train_loss=0.09584388136863708, time_cost=2.2683005332946777
+
Steps: 0%| | 106/1000000 [16:25<2341:04:43, 8.43s/it, lr=1e-5, step_loss=0.145]
Steps: 0%| | 107/1000000 [16:39<2780:19:43, 10.01s/it, lr=1e-5, step_loss=0.145][RANK-0]: Step: [107], local_loss=0.04383404552936554, train_loss=0.08425230532884598, time_cost=1.6949818134307861
+
Steps: 0%| | 107/1000000 [16:39<2780:19:43, 10.01s/it, lr=1e-5, step_loss=0.0438]
Steps: 0%| | 108/1000000 [16:55<3314:56:40, 11.94s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [108], local_loss=0.05093354359269142, train_loss=0.1315576136112213, time_cost=7.121157884597778
+
Steps: 0%| | 108/1000000 [16:55<3314:56:40, 11.94s/it, lr=1e-5, step_loss=0.0509]
Steps: 0%| | 109/1000000 [17:03<2960:20:15, 10.66s/it, lr=1e-5, step_loss=0.0509][RANK-0]: Step: [109], local_loss=0.030049219727516174, train_loss=0.06017636880278587, time_cost=1.3334903717041016
+
Steps: 0%| | 109/1000000 [17:03<2960:20:15, 10.66s/it, lr=1e-5, step_loss=0.03]
Steps: 0%| | 110/1000000 [17:17<3262:10:35, 11.75s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [110], local_loss=0.38442304730415344, train_loss=0.11708323657512665, time_cost=5.024274587631226
+
Steps: 0%| | 110/1000000 [17:17<3262:10:35, 11.75s/it, lr=1e-5, step_loss=0.384]
Steps: 0%| | 111/1000000 [17:26<3011:30:48, 10.84s/it, lr=1e-5, step_loss=0.384][RANK-0]: Step: [111], local_loss=0.10847538709640503, train_loss=0.07044924795627594, time_cost=3.9646401405334473
+
Steps: 0%| | 111/1000000 [17:26<3011:30:48, 10.84s/it, lr=1e-5, step_loss=0.108]
Steps: 0%| | 112/1000000 [17:41<3350:35:46, 12.06s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [112], local_loss=0.056615956127643585, train_loss=0.053037792444229126, time_cost=6.2023398876190186
+
Steps: 0%| | 112/1000000 [17:41<3350:35:46, 12.06s/it, lr=1e-5, step_loss=0.0566]
Steps: 0%| | 113/1000000 [17:50<3126:31:29, 11.26s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [113], local_loss=0.056181274354457855, train_loss=0.0668802261352539, time_cost=1.9941558837890625
+
Steps: 0%| | 113/1000000 [17:50<3126:31:29, 11.26s/it, lr=1e-5, step_loss=0.0562]
Steps: 0%| | 114/1000000 [18:00<3005:46:38, 10.82s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [114], local_loss=0.026842882856726646, train_loss=0.10006473958492279, time_cost=1.6726491451263428
+
Steps: 0%| | 114/1000000 [18:00<3005:46:38, 10.82s/it, lr=1e-5, step_loss=0.0268]
Steps: 0%| | 115/1000000 [18:07<2681:56:29, 9.66s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [115], local_loss=542.6083374023438, train_loss=67.91925048828125, time_cost=4.994848966598511
+
Steps: 0%| | 115/1000000 [18:07<2681:56:29, 9.66s/it, lr=1e-5, step_loss=543]
Steps: 0%| | 116/1000000 [18:12<2278:57:56, 8.21s/it, lr=1e-5, step_loss=543][RANK-0]: Step: [116], local_loss=0.090975821018219, train_loss=0.07481858134269714, time_cost=1.332648754119873
+
Steps: 0%| | 116/1000000 [18:12<2278:57:56, 8.21s/it, lr=1e-5, step_loss=0.091]
Steps: 0%| | 117/1000000 [18:21<2382:01:10, 8.58s/it, lr=1e-5, step_loss=0.091][RANK-0]: Step: [117], local_loss=0.08394472301006317, train_loss=0.0767015591263771, time_cost=3.256784200668335
+
Steps: 0%| | 117/1000000 [18:21<2382:01:10, 8.58s/it, lr=1e-5, step_loss=0.0839]
Steps: 0%| | 118/1000000 [18:37<2933:14:49, 10.56s/it, lr=1e-5, step_loss=0.0839][RANK-0]: Step: [118], local_loss=0.08641251921653748, train_loss=0.1886485069990158, time_cost=6.085503101348877
+
Steps: 0%| | 118/1000000 [18:37<2933:14:49, 10.56s/it, lr=1e-5, step_loss=0.0864]
Steps: 0%| | 119/1000000 [18:48<2964:28:53, 10.67s/it, lr=1e-5, step_loss=0.0864][RANK-0]: Step: [119], local_loss=0.060910098254680634, train_loss=0.06470172107219696, time_cost=1.9312400817871094
+
Steps: 0%| | 119/1000000 [18:48<2964:28:53, 10.67s/it, lr=1e-5, step_loss=0.0609]
Steps: 0%| | 120/1000000 [18:59<3038:31:27, 10.94s/it, lr=1e-5, step_loss=0.0609][RANK-0]: Step: [120], local_loss=0.0495719239115715, train_loss=0.061134930700063705, time_cost=2.955362319946289
+
Steps: 0%| | 120/1000000 [18:59<3038:31:27, 10.94s/it, lr=1e-5, step_loss=0.0496]
Steps: 0%| | 121/1000000 [19:04<2578:15:35, 9.28s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [121], local_loss=0.0559130422770977, train_loss=0.07651147246360779, time_cost=2.7496225833892822
+
Steps: 0%| | 121/1000000 [19:04<2578:15:35, 9.28s/it, lr=1e-5, step_loss=0.0559]
Steps: 0%| | 122/1000000 [19:09<2161:42:49, 7.78s/it, lr=1e-5, step_loss=0.0559][RANK-0]: Step: [122], local_loss=0.04243046045303345, train_loss=0.06281960755586624, time_cost=1.2452373504638672
+
Steps: 0%| | 122/1000000 [19:09<2161:42:49, 7.78s/it, lr=1e-5, step_loss=0.0424]
Steps: 0%| | 123/1000000 [19:18<2296:33:01, 8.27s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [123], local_loss=0.0877036452293396, train_loss=0.1055346205830574, time_cost=1.2344725131988525
+
Steps: 0%| | 123/1000000 [19:18<2296:33:01, 8.27s/it, lr=1e-5, step_loss=0.0877]
Steps: 0%| | 124/1000000 [19:24<2109:47:37, 7.60s/it, lr=1e-5, step_loss=0.0877][RANK-0]: Step: [124], local_loss=0.08579913526773453, train_loss=0.10806581377983093, time_cost=2.730618953704834
+
Steps: 0%| | 124/1000000 [19:24<2109:47:37, 7.60s/it, lr=1e-5, step_loss=0.0858]
Steps: 0%| | 125/1000000 [19:36<2474:04:34, 8.91s/it, lr=1e-5, step_loss=0.0858][RANK-0]: Step: [125], local_loss=0.2254849076271057, train_loss=0.09380973875522614, time_cost=1.2083690166473389
+
Steps: 0%| | 125/1000000 [19:36<2474:04:34, 8.91s/it, lr=1e-5, step_loss=0.225]
Steps: 0%| | 126/1000000 [19:47<2616:22:05, 9.42s/it, lr=1e-5, step_loss=0.225][RANK-0]: Step: [126], local_loss=0.05611935630440712, train_loss=0.06239485740661621, time_cost=1.8648431301116943
+
Steps: 0%| | 126/1000000 [19:47<2616:22:05, 9.42s/it, lr=1e-5, step_loss=0.0561]
Steps: 0%| | 127/1000000 [19:53<2331:27:36, 8.39s/it, lr=1e-5, step_loss=0.0561][RANK-0]: Step: [127], local_loss=0.09289322793483734, train_loss=0.1288788616657257, time_cost=1.459679126739502
+
Steps: 0%| | 127/1000000 [19:53<2331:27:36, 8.39s/it, lr=1e-5, step_loss=0.0929]
Steps: 0%| | 128/1000000 [19:59<2136:44:28, 7.69s/it, lr=1e-5, step_loss=0.0929][RANK-0]: Step: [128], local_loss=0.0842365175485611, train_loss=0.092242531478405, time_cost=2.2477941513061523
+
Steps: 0%| | 128/1000000 [19:59<2136:44:28, 7.69s/it, lr=1e-5, step_loss=0.0842]
Steps: 0%| | 129/1000000 [20:04<1946:38:14, 7.01s/it, lr=1e-5, step_loss=0.0842][RANK-0]: Step: [129], local_loss=0.0389540009200573, train_loss=0.08099111169576645, time_cost=2.530893087387085
+
Steps: 0%| | 129/1000000 [20:04<1946:38:14, 7.01s/it, lr=1e-5, step_loss=0.039]
Steps: 0%| | 130/1000000 [20:08<1714:39:36, 6.17s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [130], local_loss=0.05675321817398071, train_loss=0.05811350792646408, time_cost=1.5478346347808838
+
Steps: 0%| | 130/1000000 [20:08<1714:39:36, 6.17s/it, lr=1e-5, step_loss=0.0568]
Steps: 0%| | 131/1000000 [20:19<2069:53:16, 7.45s/it, lr=1e-5, step_loss=0.0568][RANK-0]: Step: [131], local_loss=0.03517449274659157, train_loss=0.0820915549993515, time_cost=3.7722291946411133
+
Steps: 0%| | 131/1000000 [20:19<2069:53:16, 7.45s/it, lr=1e-5, step_loss=0.0352]
Steps: 0%| | 132/1000000 [20:24<1857:47:31, 6.69s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [132], local_loss=208.04373168945312, train_loss=26.05610466003418, time_cost=1.236635446548462
+
Steps: 0%| | 132/1000000 [20:24<1857:47:31, 6.69s/it, lr=1e-5, step_loss=208]
Steps: 0%| | 133/1000000 [20:31<1911:55:44, 6.88s/it, lr=1e-5, step_loss=208][RANK-0]: Step: [133], local_loss=0.04300066456198692, train_loss=0.05841700732707977, time_cost=1.3116347789764404
+
Steps: 0%| | 133/1000000 [20:31<1911:55:44, 6.88s/it, lr=1e-5, step_loss=0.043]
Steps: 0%| | 134/1000000 [20:44<2443:21:21, 8.80s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [134], local_loss=0.0549243725836277, train_loss=0.07158002257347107, time_cost=3.4611613750457764
+
Steps: 0%| | 134/1000000 [20:44<2443:21:21, 8.80s/it, lr=1e-5, step_loss=0.0549]
Steps: 0%| | 135/1000000 [20:55<2616:53:21, 9.42s/it, lr=1e-5, step_loss=0.0549][RANK-0]: Step: [135], local_loss=0.06441626697778702, train_loss=0.07127466052770615, time_cost=5.496190786361694
+
Steps: 0%| | 135/1000000 [20:55<2616:53:21, 9.42s/it, lr=1e-5, step_loss=0.0644]
Steps: 0%| | 136/1000000 [21:01<2296:53:30, 8.27s/it, lr=1e-5, step_loss=0.0644][RANK-0]: Step: [136], local_loss=0.16104009747505188, train_loss=0.23154065012931824, time_cost=1.4412686824798584
+
Steps: 0%| | 136/1000000 [21:01<2296:53:30, 8.27s/it, lr=1e-5, step_loss=0.161]
Steps: 0%| | 137/1000000 [21:12<2529:15:48, 9.11s/it, lr=1e-5, step_loss=0.161][RANK-0]: Step: [137], local_loss=0.04910260811448097, train_loss=0.11762984842061996, time_cost=4.041241884231567
+
Steps: 0%| | 137/1000000 [21:12<2529:15:48, 9.11s/it, lr=1e-5, step_loss=0.0491]
Steps: 0%| | 138/1000000 [21:24<2737:40:32, 9.86s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [138], local_loss=0.023860108107328415, train_loss=0.07042653858661652, time_cost=2.2875239849090576
+
Steps: 0%| | 138/1000000 [21:24<2737:40:32, 9.86s/it, lr=1e-5, step_loss=0.0239]
Steps: 0%| | 139/1000000 [21:34<2816:38:30, 10.14s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [139], local_loss=0.04765118658542633, train_loss=0.08556370437145233, time_cost=8.86039686203003
+
Steps: 0%| | 139/1000000 [21:34<2816:38:30, 10.14s/it, lr=1e-5, step_loss=0.0477]
Steps: 0%| | 140/1000000 [21:46<2921:25:19, 10.52s/it, lr=1e-5, step_loss=0.0477][RANK-0]: Step: [140], local_loss=0.06203953176736832, train_loss=0.07244538515806198, time_cost=3.488900899887085
+
Steps: 0%| | 140/1000000 [21:46<2921:25:19, 10.52s/it, lr=1e-5, step_loss=0.062]
Steps: 0%| | 141/1000000 [21:51<2469:23:20, 8.89s/it, lr=1e-5, step_loss=0.062][RANK-0]: Step: [141], local_loss=0.03566855564713478, train_loss=0.08048824965953827, time_cost=2.690817356109619
+
Steps: 0%| | 141/1000000 [21:51<2469:23:20, 8.89s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 142/1000000 [21:58<2315:09:11, 8.34s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [142], local_loss=0.05239817500114441, train_loss=0.078925721347332, time_cost=1.8064634799957275
+
Steps: 0%| | 142/1000000 [21:58<2315:09:11, 8.34s/it, lr=1e-5, step_loss=0.0524]
Steps: 0%| | 143/1000000 [22:03<2051:52:03, 7.39s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [143], local_loss=0.06632880866527557, train_loss=0.0627853274345398, time_cost=4.087650775909424
+
Steps: 0%| | 143/1000000 [22:03<2051:52:03, 7.39s/it, lr=1e-5, step_loss=0.0663]
Steps: 0%| | 144/1000000 [22:09<1917:36:56, 6.90s/it, lr=1e-5, step_loss=0.0663][RANK-0]: Step: [144], local_loss=0.11027251929044724, train_loss=0.10522201657295227, time_cost=1.4500586986541748
+
Steps: 0%| | 144/1000000 [22:09<1917:36:56, 6.90s/it, lr=1e-5, step_loss=0.11]
Steps: 0%| | 145/1000000 [22:14<1764:56:02, 6.35s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [145], local_loss=0.05585206672549248, train_loss=0.08625118434429169, time_cost=1.2141532897949219
+
Steps: 0%| | 145/1000000 [22:14<1764:56:02, 6.35s/it, lr=1e-5, step_loss=0.0559]
Steps: 0%| | 146/1000000 [22:26<2231:31:52, 8.03s/it, lr=1e-5, step_loss=0.0559][RANK-0]: Step: [146], local_loss=0.16890408098697662, train_loss=0.06671921163797379, time_cost=3.00844407081604
+
Steps: 0%| | 146/1000000 [22:26<2231:31:52, 8.03s/it, lr=1e-5, step_loss=0.169]
Steps: 0%| | 147/1000000 [22:31<1985:24:39, 7.15s/it, lr=1e-5, step_loss=0.169][RANK-0]: Step: [147], local_loss=0.05783844739198685, train_loss=0.11029821634292603, time_cost=3.0902628898620605
+
Steps: 0%| | 147/1000000 [22:31<1985:24:39, 7.15s/it, lr=1e-5, step_loss=0.0578]
Steps: 0%| | 148/1000000 [22:38<1976:14:12, 7.12s/it, lr=1e-5, step_loss=0.0578][RANK-0]: Step: [148], local_loss=0.03964550420641899, train_loss=0.08945003151893616, time_cost=2.5583910942077637
+
Steps: 0%| | 148/1000000 [22:38<1976:14:12, 7.12s/it, lr=1e-5, step_loss=0.0396]
Steps: 0%| | 149/1000000 [22:47<2157:03:21, 7.77s/it, lr=1e-5, step_loss=0.0396][RANK-0]: Step: [149], local_loss=0.055035755038261414, train_loss=0.09498193860054016, time_cost=7.0793297290802
+
Steps: 0%| | 149/1000000 [22:47<2157:03:21, 7.77s/it, lr=1e-5, step_loss=0.055]
Steps: 0%| | 150/1000000 [22:56<2245:18:48, 8.08s/it, lr=1e-5, step_loss=0.055][RANK-0]: Step: [150], local_loss=0.04869943484663963, train_loss=0.054154492914676666, time_cost=1.2365903854370117
+
Steps: 0%| | 150/1000000 [22:56<2245:18:48, 8.08s/it, lr=1e-5, step_loss=0.0487]
Steps: 0%| | 151/1000000 [23:04<2190:15:53, 7.89s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [151], local_loss=0.11492685973644257, train_loss=0.08873069286346436, time_cost=2.598625898361206
+
Steps: 0%| | 151/1000000 [23:04<2190:15:53, 7.89s/it, lr=1e-5, step_loss=0.115]
Steps: 0%| | 152/1000000 [23:08<1935:20:09, 6.97s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [152], local_loss=0.07067542523145676, train_loss=0.09200838208198547, time_cost=1.981199026107788
+
Steps: 0%| | 152/1000000 [23:08<1935:20:09, 6.97s/it, lr=1e-5, step_loss=0.0707]
Steps: 0%| | 153/1000000 [23:15<1922:22:49, 6.92s/it, lr=1e-5, step_loss=0.0707][RANK-0]: Step: [153], local_loss=0.05384361371397972, train_loss=0.06042461469769478, time_cost=2.2779605388641357
+
Steps: 0%| | 153/1000000 [23:15<1922:22:49, 6.92s/it, lr=1e-5, step_loss=0.0538]
Steps: 0%| | 154/1000000 [23:24<2064:59:15, 7.44s/it, lr=1e-5, step_loss=0.0538][RANK-0]: Step: [154], local_loss=0.0726991817355156, train_loss=0.07369500398635864, time_cost=1.3774158954620361
+
Steps: 0%| | 154/1000000 [23:24<2064:59:15, 7.44s/it, lr=1e-5, step_loss=0.0727]
Steps: 0%| | 155/1000000 [23:37<2505:07:36, 9.02s/it, lr=1e-5, step_loss=0.0727][RANK-0]: Step: [155], local_loss=0.03176642209291458, train_loss=0.0524563193321228, time_cost=5.234668016433716
+
Steps: 0%| | 155/1000000 [23:37<2505:07:36, 9.02s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 156/1000000 [23:50<2884:42:10, 10.39s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [156], local_loss=0.04482024535536766, train_loss=0.2298973649740219, time_cost=4.310097932815552
+
Steps: 0%| | 156/1000000 [23:50<2884:42:10, 10.39s/it, lr=1e-5, step_loss=0.0448]
Steps: 0%| | 157/1000000 [24:00<2813:27:35, 10.13s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [157], local_loss=0.0734390988945961, train_loss=0.17878548800945282, time_cost=1.2448253631591797
+
Steps: 0%| | 157/1000000 [24:00<2813:27:35, 10.13s/it, lr=1e-5, step_loss=0.0734]
Steps: 0%| | 158/1000000 [24:10<2856:19:55, 10.28s/it, lr=1e-5, step_loss=0.0734][RANK-0]: Step: [158], local_loss=0.037574075162410736, train_loss=0.04961920529603958, time_cost=1.2619342803955078
+
Steps: 0%| | 158/1000000 [24:10<2856:19:55, 10.28s/it, lr=1e-5, step_loss=0.0376]
Steps: 0%| | 159/1000000 [24:21<2910:58:08, 10.48s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [159], local_loss=0.05449439957737923, train_loss=0.08550719171762466, time_cost=3.3700530529022217
+
Steps: 0%| | 159/1000000 [24:21<2910:58:08, 10.48s/it, lr=1e-5, step_loss=0.0545]
Steps: 0%| | 160/1000000 [24:29<2674:10:23, 9.63s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [160], local_loss=0.04865283519029617, train_loss=0.14090102910995483, time_cost=2.9231114387512207
+
Steps: 0%| | 160/1000000 [24:29<2674:10:23, 9.63s/it, lr=1e-5, step_loss=0.0487]
Steps: 0%| | 161/1000000 [24:37<2513:26:14, 9.05s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [161], local_loss=0.041966117918491364, train_loss=0.16779275238513947, time_cost=6.199023485183716
+
Steps: 0%| | 161/1000000 [24:37<2513:26:14, 9.05s/it, lr=1e-5, step_loss=0.042]
Steps: 0%| | 162/1000000 [24:51<3003:07:03, 10.81s/it, lr=1e-5, step_loss=0.042][RANK-0]: Step: [162], local_loss=0.06828220933675766, train_loss=0.0550416000187397, time_cost=6.439754009246826
+
Steps: 0%| | 162/1000000 [24:51<3003:07:03, 10.81s/it, lr=1e-5, step_loss=0.0683]
Steps: 0%| | 163/1000000 [24:58<2673:10:03, 9.62s/it, lr=1e-5, step_loss=0.0683][RANK-0]: Step: [163], local_loss=0.06933746486902237, train_loss=0.08194182813167572, time_cost=3.6657183170318604
+
Steps: 0%| | 163/1000000 [24:58<2673:10:03, 9.62s/it, lr=1e-5, step_loss=0.0693]
Steps: 0%| | 164/1000000 [25:11<2945:50:40, 10.61s/it, lr=1e-5, step_loss=0.0693][RANK-0]: Step: [164], local_loss=0.035479746758937836, train_loss=0.04775089770555496, time_cost=2.9997377395629883
+
Steps: 0%| | 164/1000000 [25:11<2945:50:40, 10.61s/it, lr=1e-5, step_loss=0.0355]
Steps: 0%| | 165/1000000 [25:17<2578:49:48, 9.29s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [165], local_loss=0.07344295084476471, train_loss=0.0632360428571701, time_cost=2.5777814388275146
+
Steps: 0%| | 165/1000000 [25:17<2578:49:48, 9.29s/it, lr=1e-5, step_loss=0.0734]
Steps: 0%| | 166/1000000 [25:23<2259:24:16, 8.14s/it, lr=1e-5, step_loss=0.0734][RANK-0]: Step: [166], local_loss=0.07644443958997726, train_loss=0.09980356693267822, time_cost=2.9081287384033203
+
Steps: 0%| | 166/1000000 [25:23<2259:24:16, 8.14s/it, lr=1e-5, step_loss=0.0764]
Steps: 0%| | 167/1000000 [25:32<2363:18:43, 8.51s/it, lr=1e-5, step_loss=0.0764][RANK-0]: Step: [167], local_loss=0.05148898437619209, train_loss=0.07296457886695862, time_cost=4.102940559387207
+
Steps: 0%| | 167/1000000 [25:32<2363:18:43, 8.51s/it, lr=1e-5, step_loss=0.0515]
Steps: 0%| | 168/1000000 [25:37<2074:21:50, 7.47s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [168], local_loss=0.03364766761660576, train_loss=0.07105305790901184, time_cost=1.9045796394348145
+
Steps: 0%| | 168/1000000 [25:37<2074:21:50, 7.47s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 169/1000000 [25:45<2091:02:05, 7.53s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [169], local_loss=0.08642350137233734, train_loss=0.07707034051418304, time_cost=5.983527421951294
+
Steps: 0%| | 169/1000000 [25:45<2091:02:05, 7.53s/it, lr=1e-5, step_loss=0.0864]
Steps: 0%| | 170/1000000 [25:50<1858:51:04, 6.69s/it, lr=1e-5, step_loss=0.0864][RANK-0]: Step: [170], local_loss=0.060848578810691833, train_loss=0.12428039312362671, time_cost=1.3174562454223633
+
Steps: 0%| | 170/1000000 [25:50<1858:51:04, 6.69s/it, lr=1e-5, step_loss=0.0608]
Steps: 0%| | 171/1000000 [25:55<1711:18:54, 6.16s/it, lr=1e-5, step_loss=0.0608][RANK-0]: Step: [171], local_loss=0.05414707958698273, train_loss=0.0746423676609993, time_cost=1.8648402690887451
+
Steps: 0%| | 171/1000000 [25:55<1711:18:54, 6.16s/it, lr=1e-5, step_loss=0.0541]
Steps: 0%| | 172/1000000 [26:00<1623:14:36, 5.84s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [172], local_loss=0.09268493950366974, train_loss=0.08726435899734497, time_cost=2.15301775932312
+
Steps: 0%| | 172/1000000 [26:00<1623:14:36, 5.84s/it, lr=1e-5, step_loss=0.0927]
Steps: 0%| | 173/1000000 [26:04<1526:29:59, 5.50s/it, lr=1e-5, step_loss=0.0927][RANK-0]: Step: [173], local_loss=0.023751690983772278, train_loss=0.05886061117053032, time_cost=1.5660829544067383
+
Steps: 0%| | 173/1000000 [26:04<1526:29:59, 5.50s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 174/1000000 [26:11<1656:12:39, 5.96s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [174], local_loss=0.06516719609498978, train_loss=0.10443232953548431, time_cost=2.5924623012542725
+
Steps: 0%| | 174/1000000 [26:11<1656:12:39, 5.96s/it, lr=1e-5, step_loss=0.0652]
Steps: 0%| | 175/1000000 [26:17<1578:36:52, 5.68s/it, lr=1e-5, step_loss=0.0652][RANK-0]: Step: [175], local_loss=0.042838506400585175, train_loss=0.0639490932226181, time_cost=2.338787078857422
+
Steps: 0%| | 175/1000000 [26:17<1578:36:52, 5.68s/it, lr=1e-5, step_loss=0.0428]
Steps: 0%| | 176/1000000 [26:24<1692:50:54, 6.10s/it, lr=1e-5, step_loss=0.0428][RANK-0]: Step: [176], local_loss=0.08784428238868713, train_loss=0.07596999406814575, time_cost=2.7824816703796387
+
Steps: 0%| | 176/1000000 [26:24<1692:50:54, 6.10s/it, lr=1e-5, step_loss=0.0878]
Steps: 0%| | 177/1000000 [26:33<2000:39:43, 7.20s/it, lr=1e-5, step_loss=0.0878][RANK-0]: Step: [177], local_loss=0.060059819370508194, train_loss=0.06517937779426575, time_cost=3.147472620010376
+
Steps: 0%| | 177/1000000 [26:33<2000:39:43, 7.20s/it, lr=1e-5, step_loss=0.0601]
Steps: 0%| | 178/1000000 [26:47<2567:39:30, 9.25s/it, lr=1e-5, step_loss=0.0601][RANK-0]: Step: [178], local_loss=0.0723765566945076, train_loss=0.12276116758584976, time_cost=1.2509174346923828
+
Steps: 0%| | 178/1000000 [26:47<2567:39:30, 9.25s/it, lr=1e-5, step_loss=0.0724]
Steps: 0%| | 179/1000000 [27:03<3095:18:55, 11.15s/it, lr=1e-5, step_loss=0.0724][RANK-0]: Step: [179], local_loss=0.04711368307471275, train_loss=0.04807230085134506, time_cost=7.2617316246032715
+
Steps: 0%| | 179/1000000 [27:03<3095:18:55, 11.15s/it, lr=1e-5, step_loss=0.0471]
Steps: 0%| | 180/1000000 [27:15<3192:31:02, 11.50s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [180], local_loss=0.03505171090364456, train_loss=0.04646134003996849, time_cost=3.547891139984131
+
Steps: 0%| | 180/1000000 [27:15<3192:31:02, 11.50s/it, lr=1e-5, step_loss=0.0351]
Steps: 0%| | 181/1000000 [27:28<3270:04:23, 11.77s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [181], local_loss=0.16401678323745728, train_loss=0.078878253698349, time_cost=3.4782235622406006
+
Steps: 0%| | 181/1000000 [27:28<3270:04:23, 11.77s/it, lr=1e-5, step_loss=0.164]
Steps: 0%| | 182/1000000 [27:35<2914:19:36, 10.49s/it, lr=1e-5, step_loss=0.164][RANK-0]: Step: [182], local_loss=0.046817194670438766, train_loss=0.05396430939435959, time_cost=5.669573545455933
+
Steps: 0%| | 182/1000000 [27:35<2914:19:36, 10.49s/it, lr=1e-5, step_loss=0.0468]
Steps: 0%| | 183/1000000 [27:45<2873:55:25, 10.35s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [183], local_loss=0.04256759583950043, train_loss=0.06554628908634186, time_cost=2.494673490524292
+
Steps: 0%| | 183/1000000 [27:45<2873:55:25, 10.35s/it, lr=1e-5, step_loss=0.0426]
Steps: 0%| | 184/1000000 [27:52<2614:13:10, 9.41s/it, lr=1e-5, step_loss=0.0426][RANK-0]: Step: [184], local_loss=0.05513787269592285, train_loss=22.176776885986328, time_cost=1.2035257816314697
+
Steps: 0%| | 184/1000000 [27:52<2614:13:10, 9.41s/it, lr=1e-5, step_loss=0.0551]
Steps: 0%| | 185/1000000 [28:03<2745:37:17, 9.89s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [185], local_loss=0.04912891238927841, train_loss=0.046130161732435226, time_cost=1.2670395374298096
+
Steps: 0%| | 185/1000000 [28:03<2745:37:17, 9.89s/it, lr=1e-5, step_loss=0.0491]
Steps: 0%| | 186/1000000 [28:12<2649:02:51, 9.54s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [186], local_loss=0.5824974775314331, train_loss=0.15170390903949738, time_cost=4.476426839828491
+
Steps: 0%| | 186/1000000 [28:12<2649:02:51, 9.54s/it, lr=1e-5, step_loss=0.582]
Steps: 0%| | 187/1000000 [28:17<2277:30:32, 8.20s/it, lr=1e-5, step_loss=0.582][RANK-0]: Step: [187], local_loss=0.031153496354818344, train_loss=0.059689655900001526, time_cost=2.2725350856781006
+
Steps: 0%| | 187/1000000 [28:17<2277:30:32, 8.20s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 188/1000000 [28:23<2066:46:11, 7.44s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [188], local_loss=0.10165178030729294, train_loss=0.09299641102552414, time_cost=1.54634428024292
+
Steps: 0%| | 188/1000000 [28:23<2066:46:11, 7.44s/it, lr=1e-5, step_loss=0.102]
Steps: 0%| | 189/1000000 [28:32<2197:19:41, 7.91s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [189], local_loss=0.03267504647374153, train_loss=0.09200974553823471, time_cost=1.432753562927246
+
Steps: 0%| | 189/1000000 [28:32<2197:19:41, 7.91s/it, lr=1e-5, step_loss=0.0327]
Steps: 0%| | 190/1000000 [28:38<2040:33:35, 7.35s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [190], local_loss=0.06311226636171341, train_loss=0.06974469870328903, time_cost=1.363339900970459
+
Steps: 0%| | 190/1000000 [28:38<2040:33:35, 7.35s/it, lr=1e-5, step_loss=0.0631]
Steps: 0%| | 191/1000000 [28:42<1787:53:20, 6.44s/it, lr=1e-5, step_loss=0.0631][RANK-0]: Step: [191], local_loss=0.27584850788116455, train_loss=0.11948616802692413, time_cost=1.3468544483184814
+
Steps: 0%| | 191/1000000 [28:42<1787:53:20, 6.44s/it, lr=1e-5, step_loss=0.276]
Steps: 0%| | 192/1000000 [28:49<1814:29:06, 6.53s/it, lr=1e-5, step_loss=0.276][RANK-0]: Step: [192], local_loss=0.08660914748907089, train_loss=0.06177166849374771, time_cost=2.287729024887085
+
Steps: 0%| | 192/1000000 [28:49<1814:29:06, 6.53s/it, lr=1e-5, step_loss=0.0866]
Steps: 0%| | 193/1000000 [28:58<1985:39:08, 7.15s/it, lr=1e-5, step_loss=0.0866][RANK-0]: Step: [193], local_loss=0.060019124299287796, train_loss=0.07651017606258392, time_cost=3.3579602241516113
+
Steps: 0%| | 193/1000000 [28:58<1985:39:08, 7.15s/it, lr=1e-5, step_loss=0.06]
Steps: 0%| | 194/1000000 [29:06<2081:39:31, 7.50s/it, lr=1e-5, step_loss=0.06][RANK-0]: Step: [194], local_loss=0.0807269811630249, train_loss=0.07269846647977829, time_cost=3.1870572566986084
+
Steps: 0%| | 194/1000000 [29:06<2081:39:31, 7.50s/it, lr=1e-5, step_loss=0.0807]
Steps: 0%| | 195/1000000 [29:15<2227:06:00, 8.02s/it, lr=1e-5, step_loss=0.0807][RANK-0]: Step: [195], local_loss=0.13857385516166687, train_loss=0.06403852999210358, time_cost=1.2206692695617676
+
Steps: 0%| | 195/1000000 [29:15<2227:06:00, 8.02s/it, lr=1e-5, step_loss=0.139]
Steps: 0%| | 196/1000000 [29:26<2490:08:15, 8.97s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [196], local_loss=0.05880264192819595, train_loss=0.07379476726055145, time_cost=2.870943307876587
+
Steps: 0%| | 196/1000000 [29:26<2490:08:15, 8.97s/it, lr=1e-5, step_loss=0.0588]
Steps: 0%| | 197/1000000 [29:36<2564:17:57, 9.23s/it, lr=1e-5, step_loss=0.0588][RANK-0]: Step: [197], local_loss=0.09981073439121246, train_loss=0.0638200044631958, time_cost=4.538117408752441
+
Steps: 0%| | 197/1000000 [29:36<2564:17:57, 9.23s/it, lr=1e-5, step_loss=0.0998]
Steps: 0%| | 198/1000000 [29:44<2482:08:10, 8.94s/it, lr=1e-5, step_loss=0.0998][RANK-0]: Step: [198], local_loss=0.05151480436325073, train_loss=0.05954204499721527, time_cost=6.758252859115601
+
Steps: 0%| | 198/1000000 [29:44<2482:08:10, 8.94s/it, lr=1e-5, step_loss=0.0515]
Steps: 0%| | 199/1000000 [29:50<2219:20:20, 7.99s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [199], local_loss=0.032756906002759933, train_loss=0.08692850172519684, time_cost=4.178992986679077
+
Steps: 0%| | 199/1000000 [29:50<2219:20:20, 7.99s/it, lr=1e-5, step_loss=0.0328]
Steps: 0%| | 200/1000000 [29:57<2084:59:22, 7.51s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [200], local_loss=0.14941951632499695, train_loss=0.07870519906282425, time_cost=2.734515428543091
+
Steps: 0%| | 200/1000000 [29:57<2084:59:22, 7.51s/it, lr=1e-5, step_loss=0.149]
Steps: 0%| | 201/1000000 [30:09<2474:33:04, 8.91s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [201], local_loss=0.058781132102012634, train_loss=0.05123203247785568, time_cost=2.8088884353637695
+
Steps: 0%| | 201/1000000 [30:09<2474:33:04, 8.91s/it, lr=1e-5, step_loss=0.0588]
Steps: 0%| | 202/1000000 [30:22<2799:48:34, 10.08s/it, lr=1e-5, step_loss=0.0588][RANK-0]: Step: [202], local_loss=0.04932808503508568, train_loss=0.07254345715045929, time_cost=6.367142200469971
+
Steps: 0%| | 202/1000000 [30:22<2799:48:34, 10.08s/it, lr=1e-5, step_loss=0.0493]
Steps: 0%| | 203/1000000 [30:36<3192:38:39, 11.50s/it, lr=1e-5, step_loss=0.0493][RANK-0]: Step: [203], local_loss=0.04565137252211571, train_loss=0.07032525539398193, time_cost=6.391272306442261
+
Steps: 0%| | 203/1000000 [30:36<3192:38:39, 11.50s/it, lr=1e-5, step_loss=0.0457]
Steps: 0%| | 204/1000000 [30:49<3283:15:20, 11.82s/it, lr=1e-5, step_loss=0.0457][RANK-0]: Step: [204], local_loss=0.05127674341201782, train_loss=0.05294952541589737, time_cost=5.890255928039551
+
Steps: 0%| | 204/1000000 [30:49<3283:15:20, 11.82s/it, lr=1e-5, step_loss=0.0513]
Steps: 0%| | 205/1000000 [30:58<3058:26:40, 11.01s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [205], local_loss=0.06641910970211029, train_loss=0.05642168968915939, time_cost=3.1720845699310303
+
Steps: 0%| | 205/1000000 [30:58<3058:26:40, 11.01s/it, lr=1e-5, step_loss=0.0664]
Steps: 0%| | 206/1000000 [31:04<2637:48:26, 9.50s/it, lr=1e-5, step_loss=0.0664][RANK-0]: Step: [206], local_loss=0.030674584209918976, train_loss=0.06481203436851501, time_cost=1.723698616027832
+
Steps: 0%| | 206/1000000 [31:04<2637:48:26, 9.50s/it, lr=1e-5, step_loss=0.0307]
Steps: 0%| | 207/1000000 [31:15<2732:45:21, 9.84s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [207], local_loss=1.007871150970459, train_loss=0.21864938735961914, time_cost=1.2969274520874023
+
Steps: 0%| | 207/1000000 [31:15<2732:45:21, 9.84s/it, lr=1e-5, step_loss=1.01]
Steps: 0%| | 208/1000000 [31:27<2931:28:50, 10.56s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [208], local_loss=0.12045177072286606, train_loss=0.07434149831533432, time_cost=5.286602020263672
+
Steps: 0%| | 208/1000000 [31:27<2931:28:50, 10.56s/it, lr=1e-5, step_loss=0.12]
Steps: 0%| | 209/1000000 [31:32<2477:58:05, 8.92s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [209], local_loss=0.046559736132621765, train_loss=0.056287940591573715, time_cost=1.373260259628296
+
Steps: 0%| | 209/1000000 [31:32<2477:58:05, 8.92s/it, lr=1e-5, step_loss=0.0466]
Steps: 0%| | 210/1000000 [31:48<3100:05:15, 11.16s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [210], local_loss=0.07986550778150558, train_loss=0.0655602514743805, time_cost=7.6846535205841064
+
Steps: 0%| | 210/1000000 [31:48<3100:05:15, 11.16s/it, lr=1e-5, step_loss=0.0799]
Steps: 0%| | 211/1000000 [32:00<3102:02:03, 11.17s/it, lr=1e-5, step_loss=0.0799][RANK-0]: Step: [211], local_loss=0.07298154383897781, train_loss=0.05736907199025154, time_cost=3.679905652999878
+
Steps: 0%| | 211/1000000 [32:00<3102:02:03, 11.17s/it, lr=1e-5, step_loss=0.073]
Steps: 0%| | 212/1000000 [32:11<3089:48:36, 11.13s/it, lr=1e-5, step_loss=0.073][RANK-0]: Step: [212], local_loss=0.0629253163933754, train_loss=0.06885078549385071, time_cost=4.101526975631714
+
Steps: 0%| | 212/1000000 [32:11<3089:48:36, 11.13s/it, lr=1e-5, step_loss=0.0629]
Steps: 0%| | 213/1000000 [32:21<3050:00:32, 10.98s/it, lr=1e-5, step_loss=0.0629][RANK-0]: Step: [213], local_loss=0.06007330119609833, train_loss=0.05462361499667168, time_cost=1.214040756225586
+
Steps: 0%| | 213/1000000 [32:21<3050:00:32, 10.98s/it, lr=1e-5, step_loss=0.0601]
Steps: 0%| | 214/1000000 [32:27<2640:30:38, 9.51s/it, lr=1e-5, step_loss=0.0601][RANK-0]: Step: [214], local_loss=0.42276501655578613, train_loss=0.09706016629934311, time_cost=1.690253734588623
+
Steps: 0%| | 214/1000000 [32:27<2640:30:38, 9.51s/it, lr=1e-5, step_loss=0.423]
Steps: 0%| | 215/1000000 [32:39<2780:54:26, 10.01s/it, lr=1e-5, step_loss=0.423][RANK-0]: Step: [215], local_loss=0.06984768807888031, train_loss=0.06754107773303986, time_cost=8.35134482383728
+
Steps: 0%| | 215/1000000 [32:39<2780:54:26, 10.01s/it, lr=1e-5, step_loss=0.0698]
Steps: 0%| | 216/1000000 [32:43<2320:33:44, 8.36s/it, lr=1e-5, step_loss=0.0698][RANK-0]: Step: [216], local_loss=0.06339840590953827, train_loss=0.07501450926065445, time_cost=2.154754877090454
+
Steps: 0%| | 216/1000000 [32:43<2320:33:44, 8.36s/it, lr=1e-5, step_loss=0.0634]
Steps: 0%| | 217/1000000 [32:51<2259:23:42, 8.14s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [217], local_loss=0.04322779178619385, train_loss=0.14671263098716736, time_cost=1.803480625152588
+
Steps: 0%| | 217/1000000 [32:51<2259:23:42, 8.14s/it, lr=1e-5, step_loss=0.0432]
Steps: 0%| | 218/1000000 [33:05<2748:38:17, 9.90s/it, lr=1e-5, step_loss=0.0432][RANK-0]: Step: [218], local_loss=0.03422773256897926, train_loss=0.04287116974592209, time_cost=1.2482409477233887
+
Steps: 0%| | 218/1000000 [33:05<2748:38:17, 9.90s/it, lr=1e-5, step_loss=0.0342]
Steps: 0%| | 219/1000000 [33:18<3056:03:08, 11.00s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [219], local_loss=0.08292597532272339, train_loss=0.0553281232714653, time_cost=5.185535192489624
+
Steps: 0%| | 219/1000000 [33:18<3056:03:08, 11.00s/it, lr=1e-5, step_loss=0.0829]
Steps: 0%| | 220/1000000 [33:30<3102:49:21, 11.17s/it, lr=1e-5, step_loss=0.0829][RANK-0]: Step: [220], local_loss=0.06572891771793365, train_loss=0.04940233379602432, time_cost=2.5022740364074707
+
Steps: 0%| | 220/1000000 [33:30<3102:49:21, 11.17s/it, lr=1e-5, step_loss=0.0657]
Steps: 0%| | 221/1000000 [33:38<2892:17:08, 10.41s/it, lr=1e-5, step_loss=0.0657][RANK-0]: Step: [221], local_loss=0.20336171984672546, train_loss=0.16457241773605347, time_cost=7.294737100601196
+
Steps: 0%| | 221/1000000 [33:38<2892:17:08, 10.41s/it, lr=1e-5, step_loss=0.203]
Steps: 0%| | 222/1000000 [33:49<2905:06:06, 10.46s/it, lr=1e-5, step_loss=0.203][RANK-0]: Step: [222], local_loss=0.06307446956634521, train_loss=0.0700148195028305, time_cost=5.804704666137695
+
Steps: 0%| | 222/1000000 [33:49<2905:06:06, 10.46s/it, lr=1e-5, step_loss=0.0631]
Steps: 0%| | 223/1000000 [33:55<2499:19:57, 9.00s/it, lr=1e-5, step_loss=0.0631][RANK-0]: Step: [223], local_loss=0.04465021938085556, train_loss=0.07072573155164719, time_cost=2.8736143112182617
+
Steps: 0%| | 223/1000000 [33:55<2499:19:57, 9.00s/it, lr=1e-5, step_loss=0.0447]
Steps: 0%| | 224/1000000 [34:10<2998:05:42, 10.80s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [224], local_loss=0.0633716955780983, train_loss=0.07287025451660156, time_cost=7.50832200050354
+
Steps: 0%| | 224/1000000 [34:10<2998:05:42, 10.80s/it, lr=1e-5, step_loss=0.0634]
Steps: 0%| | 225/1000000 [34:16<2645:56:01, 9.53s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [225], local_loss=0.07606783509254456, train_loss=0.06928145885467529, time_cost=2.891395092010498
+
Steps: 0%| | 225/1000000 [34:16<2645:56:01, 9.53s/it, lr=1e-5, step_loss=0.0761]
Steps: 0%| | 226/1000000 [34:21<2215:58:42, 7.98s/it, lr=1e-5, step_loss=0.0761][RANK-0]: Step: [226], local_loss=0.039588965475559235, train_loss=0.07065308094024658, time_cost=1.6832501888275146
+
Steps: 0%| | 226/1000000 [34:21<2215:58:42, 7.98s/it, lr=1e-5, step_loss=0.0396]
Steps: 0%| | 227/1000000 [34:30<2380:04:01, 8.57s/it, lr=1e-5, step_loss=0.0396][RANK-0]: Step: [227], local_loss=0.0728386789560318, train_loss=0.05312987044453621, time_cost=1.3814496994018555
+
Steps: 0%| | 227/1000000 [34:30<2380:04:01, 8.57s/it, lr=1e-5, step_loss=0.0728]
Steps: 0%| | 228/1000000 [34:45<2891:13:58, 10.41s/it, lr=1e-5, step_loss=0.0728][RANK-0]: Step: [228], local_loss=0.03160587698221207, train_loss=0.050981756299734116, time_cost=7.238270044326782
+
Steps: 0%| | 228/1000000 [34:45<2891:13:58, 10.41s/it, lr=1e-5, step_loss=0.0316]
Steps: 0%| | 229/1000000 [34:58<3112:50:29, 11.21s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [229], local_loss=0.316876620054245, train_loss=0.09992510080337524, time_cost=1.2412099838256836
+
Steps: 0%| | 229/1000000 [34:58<3112:50:29, 11.21s/it, lr=1e-5, step_loss=0.317]
Steps: 0%| | 230/1000000 [35:04<2680:26:39, 9.65s/it, lr=1e-5, step_loss=0.317][RANK-0]: Step: [230], local_loss=0.12006966024637222, train_loss=0.07457707822322845, time_cost=1.8146374225616455
+
Steps: 0%| | 230/1000000 [35:04<2680:26:39, 9.65s/it, lr=1e-5, step_loss=0.12]
Steps: 0%| | 231/1000000 [35:12<2526:03:05, 9.10s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [231], local_loss=0.07899858802556992, train_loss=0.06319394707679749, time_cost=1.2108170986175537
+
Steps: 0%| | 231/1000000 [35:12<2526:03:05, 9.10s/it, lr=1e-5, step_loss=0.079]
Steps: 0%| | 232/1000000 [35:25<2843:29:48, 10.24s/it, lr=1e-5, step_loss=0.079][RANK-0]: Step: [232], local_loss=0.07013491541147232, train_loss=0.057478465139865875, time_cost=3.7239770889282227
+
Steps: 0%| | 232/1000000 [35:25<2843:29:48, 10.24s/it, lr=1e-5, step_loss=0.0701]
Steps: 0%| | 233/1000000 [35:38<3115:22:21, 11.22s/it, lr=1e-5, step_loss=0.0701][RANK-0]: Step: [233], local_loss=0.06783375144004822, train_loss=0.0556965135037899, time_cost=4.748890161514282
+
Steps: 0%| | 233/1000000 [35:38<3115:22:21, 11.22s/it, lr=1e-5, step_loss=0.0678]
Steps: 0%| | 234/1000000 [35:45<2729:58:01, 9.83s/it, lr=1e-5, step_loss=0.0678][RANK-0]: Step: [234], local_loss=0.04774130880832672, train_loss=0.05040246993303299, time_cost=3.2941932678222656
+
Steps: 0%| | 234/1000000 [35:45<2729:58:01, 9.83s/it, lr=1e-5, step_loss=0.0477]
Steps: 0%| | 235/1000000 [35:55<2698:59:39, 9.72s/it, lr=1e-5, step_loss=0.0477][RANK-0]: Step: [235], local_loss=0.0632844939827919, train_loss=0.08519162237644196, time_cost=1.292818546295166
+
Steps: 0%| | 235/1000000 [35:55<2698:59:39, 9.72s/it, lr=1e-5, step_loss=0.0633]
Steps: 0%| | 236/1000000 [36:01<2410:51:53, 8.68s/it, lr=1e-5, step_loss=0.0633][RANK-0]: Step: [236], local_loss=0.04099561646580696, train_loss=0.06522762030363083, time_cost=1.4398994445800781
+
Steps: 0%| | 236/1000000 [36:01<2410:51:53, 8.68s/it, lr=1e-5, step_loss=0.041]
Steps: 0%| | 237/1000000 [36:08<2262:41:29, 8.15s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [237], local_loss=0.25568023324012756, train_loss=0.10396517813205719, time_cost=2.2242777347564697
+
Steps: 0%| | 237/1000000 [36:08<2262:41:29, 8.15s/it, lr=1e-5, step_loss=0.256]
Steps: 0%| | 238/1000000 [36:15<2176:53:18, 7.84s/it, lr=1e-5, step_loss=0.256][RANK-0]: Step: [238], local_loss=0.031759340316057205, train_loss=0.0999336764216423, time_cost=2.4223036766052246
+
Steps: 0%| | 238/1000000 [36:15<2176:53:18, 7.84s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 239/1000000 [36:26<2497:17:28, 8.99s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [239], local_loss=0.054699428379535675, train_loss=0.07403605431318283, time_cost=1.8007738590240479
+
Steps: 0%| | 239/1000000 [36:26<2497:17:28, 8.99s/it, lr=1e-5, step_loss=0.0547]
Steps: 0%| | 240/1000000 [36:32<2176:07:14, 7.84s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [240], local_loss=0.08012163639068604, train_loss=0.0648292526602745, time_cost=2.235978364944458
+
Steps: 0%| | 240/1000000 [36:32<2176:07:14, 7.84s/it, lr=1e-5, step_loss=0.0801]
Steps: 0%| | 241/1000000 [36:39<2102:36:22, 7.57s/it, lr=1e-5, step_loss=0.0801][RANK-0]: Step: [241], local_loss=0.030114665627479553, train_loss=0.07055599987506866, time_cost=2.078564167022705
+
Steps: 0%| | 241/1000000 [36:39<2102:36:22, 7.57s/it, lr=1e-5, step_loss=0.0301]
Steps: 0%| | 242/1000000 [36:44<1944:46:42, 7.00s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [242], local_loss=0.05653458833694458, train_loss=0.22025606036186218, time_cost=1.435382604598999
+
Steps: 0%| | 242/1000000 [36:44<1944:46:42, 7.00s/it, lr=1e-5, step_loss=0.0565]
Steps: 0%| | 243/1000000 [36:59<2552:35:07, 9.19s/it, lr=1e-5, step_loss=0.0565][RANK-0]: Step: [243], local_loss=0.04169631004333496, train_loss=0.07895281165838242, time_cost=7.910173177719116
+
Steps: 0%| | 243/1000000 [36:59<2552:35:07, 9.19s/it, lr=1e-5, step_loss=0.0417]
Steps: 0%| | 244/1000000 [37:10<2750:51:24, 9.91s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [244], local_loss=0.02725071646273136, train_loss=0.04413658007979393, time_cost=1.311359167098999
+
Steps: 0%| | 244/1000000 [37:10<2750:51:24, 9.91s/it, lr=1e-5, step_loss=0.0273]
Steps: 0%| | 245/1000000 [37:20<2778:49:14, 10.01s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [245], local_loss=0.049463555216789246, train_loss=0.05238725617527962, time_cost=3.803400754928589
+
Steps: 0%| | 245/1000000 [37:20<2778:49:14, 10.01s/it, lr=1e-5, step_loss=0.0495]
Steps: 0%| | 246/1000000 [37:31<2842:14:29, 10.23s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [246], local_loss=0.06006854772567749, train_loss=0.057856954634189606, time_cost=1.6577932834625244
+
Steps: 0%| | 246/1000000 [37:31<2842:14:29, 10.23s/it, lr=1e-5, step_loss=0.0601]
Steps: 0%| | 247/1000000 [37:40<2727:01:56, 9.82s/it, lr=1e-5, step_loss=0.0601][RANK-0]: Step: [247], local_loss=0.03949066251516342, train_loss=0.046619076281785965, time_cost=2.852313280105591
+
Steps: 0%| | 247/1000000 [37:40<2727:01:56, 9.82s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 248/1000000 [37:49<2652:29:39, 9.55s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [248], local_loss=0.12000967562198639, train_loss=0.05647030845284462, time_cost=1.3099181652069092
+
Steps: 0%| | 248/1000000 [37:49<2652:29:39, 9.55s/it, lr=1e-5, step_loss=0.12]
Steps: 0%| | 249/1000000 [37:54<2253:26:18, 8.11s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [249], local_loss=0.05618629604578018, train_loss=0.2028075009584427, time_cost=1.9812493324279785
+
Steps: 0%| | 249/1000000 [37:54<2253:26:18, 8.11s/it, lr=1e-5, step_loss=0.0562]
Steps: 0%| | 250/1000000 [38:02<2283:55:26, 8.22s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [250], local_loss=0.05945773795247078, train_loss=0.060834817588329315, time_cost=6.993061542510986
+
Steps: 0%| | 250/1000000 [38:02<2283:55:26, 8.22s/it, lr=1e-5, step_loss=0.0595]
Steps: 0%| | 251/1000000 [38:07<2022:11:50, 7.28s/it, lr=1e-5, step_loss=0.0595][RANK-0]: Step: [251], local_loss=0.11050548404455185, train_loss=0.07851026952266693, time_cost=2.510610580444336
+
Steps: 0%| | 251/1000000 [38:07<2022:11:50, 7.28s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 252/1000000 [38:18<2319:32:34, 8.35s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [252], local_loss=0.05441460758447647, train_loss=0.0754689946770668, time_cost=7.730823040008545
+
Steps: 0%| | 252/1000000 [38:18<2319:32:34, 8.35s/it, lr=1e-5, step_loss=0.0544]
Steps: 0%| | 253/1000000 [38:27<2353:01:16, 8.47s/it, lr=1e-5, step_loss=0.0544][RANK-0]: Step: [253], local_loss=0.04603719711303711, train_loss=0.06195230782032013, time_cost=1.2596240043640137
+
Steps: 0%| | 253/1000000 [38:27<2353:01:16, 8.47s/it, lr=1e-5, step_loss=0.046]
Steps: 0%| | 254/1000000 [38:31<1995:07:56, 7.18s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [254], local_loss=0.044675130397081375, train_loss=0.08865769952535629, time_cost=1.6128571033477783
+
Steps: 0%| | 254/1000000 [38:31<1995:07:56, 7.18s/it, lr=1e-5, step_loss=0.0447]
Steps: 0%| | 255/1000000 [38:44<2511:21:16, 9.04s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [255], local_loss=0.08575031906366348, train_loss=0.07651376724243164, time_cost=4.605248212814331
+
Steps: 0%| | 255/1000000 [38:44<2511:21:16, 9.04s/it, lr=1e-5, step_loss=0.0858]
Steps: 0%| | 256/1000000 [38:49<2108:50:24, 7.59s/it, lr=1e-5, step_loss=0.0858][RANK-0]: Step: [256], local_loss=0.05609027296304703, train_loss=0.06815080344676971, time_cost=2.965728998184204
+
Steps: 0%| | 256/1000000 [38:49<2108:50:24, 7.59s/it, lr=1e-5, step_loss=0.0561]
Steps: 0%| | 257/1000000 [38:53<1874:27:10, 6.75s/it, lr=1e-5, step_loss=0.0561][RANK-0]: Step: [257], local_loss=0.028275709599256516, train_loss=0.07469196617603302, time_cost=2.504868507385254
+
Steps: 0%| | 257/1000000 [38:53<1874:27:10, 6.75s/it, lr=1e-5, step_loss=0.0283]
Steps: 0%| | 258/1000000 [39:08<2489:22:45, 8.96s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [258], local_loss=0.04343527555465698, train_loss=0.06460307538509369, time_cost=1.2220678329467773
+
Steps: 0%| | 258/1000000 [39:08<2489:22:45, 8.96s/it, lr=1e-5, step_loss=0.0434]
Steps: 0%| | 259/1000000 [39:17<2539:26:43, 9.14s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [259], local_loss=0.04608467221260071, train_loss=0.047631390392780304, time_cost=3.340726613998413
[training log, steps 260-482 of 1000000 (elapsed 39:24 -> 1:13:27 at 6.33-12.74 s/it, lr=1e-5 throughout): local_loss typically 0.01-0.25 per step, with sporadic spikes of 0.29-1.0 (steps 284, 291, 295, 332, 337, 361-362, 397, 425, 458, 478) and one extreme outlier of 392.9 at step 468; train_loss typically 0.03-0.33, with intermittent outliers of roughly 10-78 (steps 302-303, 328, 391, 428, 434, 452-453, 457, 467-468); per-step time_cost 1.2-11.8 s]
Steps: 0%| | 483/1000000 [1:13:38<2485:09:30, 8.95s/it, lr=1e-5, step_loss=0.052][RANK-0]: Step: [483], local_loss=0.022069325670599937, train_loss=0.041041579097509384, time_cost=4.697173833847046
+
Steps: 0%| | 483/1000000 [1:13:38<2485:09:30, 8.95s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 484/1000000 [1:13:48<2568:00:21, 9.25s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [484], local_loss=0.09660439193248749, train_loss=0.060736753046512604, time_cost=1.2757627964019775
+
Steps: 0%| | 484/1000000 [1:13:48<2568:00:21, 9.25s/it, lr=1e-5, step_loss=0.0966]
Steps: 0%| | 485/1000000 [1:14:01<2860:12:53, 10.30s/it, lr=1e-5, step_loss=0.0966][RANK-0]: Step: [485], local_loss=0.07256097346544266, train_loss=0.05506389960646629, time_cost=1.972261905670166
+
Steps: 0%| | 485/1000000 [1:14:01<2860:12:53, 10.30s/it, lr=1e-5, step_loss=0.0726]
Steps: 0%| | 486/1000000 [1:14:17<3381:45:04, 12.18s/it, lr=1e-5, step_loss=0.0726][RANK-0]: Step: [486], local_loss=0.034064799547195435, train_loss=0.04985203593969345, time_cost=8.802279472351074
+
Steps: 0%| | 486/1000000 [1:14:17<3381:45:04, 12.18s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 487/1000000 [1:14:23<2899:47:46, 10.44s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [487], local_loss=0.04954463988542557, train_loss=0.047021664679050446, time_cost=1.2164127826690674
+
Steps: 0%| | 487/1000000 [1:14:24<2899:47:46, 10.44s/it, lr=1e-5, step_loss=0.0495]
Steps: 0%| | 488/1000000 [1:14:30<2549:29:12, 9.18s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [488], local_loss=0.039368074387311935, train_loss=0.05266733467578888, time_cost=1.8719024658203125
+
Steps: 0%| | 488/1000000 [1:14:30<2549:29:12, 9.18s/it, lr=1e-5, step_loss=0.0394]
Steps: 0%| | 489/1000000 [1:14:41<2721:30:34, 9.80s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [489], local_loss=0.09178264439105988, train_loss=0.05936937406659126, time_cost=1.773618221282959
+
Steps: 0%| | 489/1000000 [1:14:41<2721:30:34, 9.80s/it, lr=1e-5, step_loss=0.0918]
Steps: 0%| | 490/1000000 [1:14:49<2532:10:10, 9.12s/it, lr=1e-5, step_loss=0.0918][RANK-0]: Step: [490], local_loss=0.03169453144073486, train_loss=0.06853589415550232, time_cost=1.4320168495178223
+
Steps: 0%| | 490/1000000 [1:14:49<2532:10:10, 9.12s/it, lr=1e-5, step_loss=0.0317]
Steps: 0%| | 491/1000000 [1:15:05<3181:03:29, 11.46s/it, lr=1e-5, step_loss=0.0317][RANK-0]: Step: [491], local_loss=0.044917479157447815, train_loss=0.0927615612745285, time_cost=9.062692642211914
+
Steps: 0%| | 491/1000000 [1:15:05<3181:03:29, 11.46s/it, lr=1e-5, step_loss=0.0449]
Steps: 0%| | 492/1000000 [1:15:11<2680:44:49, 9.66s/it, lr=1e-5, step_loss=0.0449][RANK-0]: Step: [492], local_loss=96.82242584228516, train_loss=12.149218559265137, time_cost=2.4528117179870605
+
Steps: 0%| | 492/1000000 [1:15:11<2680:44:49, 9.66s/it, lr=1e-5, step_loss=96.8]
Steps: 0%| | 493/1000000 [1:15:17<2347:56:15, 8.46s/it, lr=1e-5, step_loss=96.8][RANK-0]: Step: [493], local_loss=0.0830066055059433, train_loss=0.057841286063194275, time_cost=1.2426421642303467
+
Steps: 0%| | 493/1000000 [1:15:17<2347:56:15, 8.46s/it, lr=1e-5, step_loss=0.083]
Steps: 0%| | 494/1000000 [1:15:24<2253:44:23, 8.12s/it, lr=1e-5, step_loss=0.083][RANK-0]: Step: [494], local_loss=0.035336486995220184, train_loss=0.0715513527393341, time_cost=1.5076203346252441
+
Steps: 0%| | 494/1000000 [1:15:24<2253:44:23, 8.12s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 495/1000000 [1:15:35<2541:39:46, 9.15s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [495], local_loss=0.04298511520028114, train_loss=0.06602303683757782, time_cost=3.0920639038085938
+
Steps: 0%| | 495/1000000 [1:15:35<2541:39:46, 9.15s/it, lr=1e-5, step_loss=0.043]
Steps: 0%| | 496/1000000 [1:15:43<2387:47:09, 8.60s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [496], local_loss=0.03446913883090019, train_loss=0.05000803992152214, time_cost=2.604541301727295
+
Steps: 0%| | 496/1000000 [1:15:43<2387:47:09, 8.60s/it, lr=1e-5, step_loss=0.0345]
Steps: 0%| | 497/1000000 [1:15:55<2722:58:56, 9.81s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [497], local_loss=0.07174082100391388, train_loss=0.059875354170799255, time_cost=1.2519776821136475
+
Steps: 0%| | 497/1000000 [1:15:55<2722:58:56, 9.81s/it, lr=1e-5, step_loss=0.0717]
Steps: 0%| | 498/1000000 [1:16:04<2661:23:33, 9.59s/it, lr=1e-5, step_loss=0.0717][RANK-0]: Step: [498], local_loss=0.03684914484620094, train_loss=0.16838625073432922, time_cost=2.6310908794403076
+
Steps: 0%| | 498/1000000 [1:16:04<2661:23:33, 9.59s/it, lr=1e-5, step_loss=0.0368]
Steps: 0%| | 499/1000000 [1:16:18<2976:40:48, 10.72s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [499], local_loss=0.045740991830825806, train_loss=0.07015534490346909, time_cost=3.5363214015960693
+
Steps: 0%| | 499/1000000 [1:16:18<2976:40:48, 10.72s/it, lr=1e-5, step_loss=0.0457]
Steps: 0%| | 500/1000000 [1:16:29<2998:36:59, 10.80s/it, lr=1e-5, step_loss=0.0457][RANK-0]: Step: [500], local_loss=0.03470021113753319, train_loss=0.051608726382255554, time_cost=1.8048789501190186
+
Steps: 0%| | 500/1000000 [1:16:29<2998:36:59, 10.80s/it, lr=1e-5, step_loss=0.0347]
Steps: 0%| | 501/1000000 [1:16:40<3028:43:03, 10.91s/it, lr=1e-5, step_loss=0.0347][RANK-0]: Step: [501], local_loss=0.05327390506863594, train_loss=0.055236537009477615, time_cost=3.651790142059326
+
Steps: 0%| | 501/1000000 [1:16:40<3028:43:03, 10.91s/it, lr=1e-5, step_loss=0.0533]
Steps: 0%| | 502/1000000 [1:16:45<2548:03:26, 9.18s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [502], local_loss=0.04481722414493561, train_loss=0.05118333920836449, time_cost=2.0864474773406982
+
Steps: 0%| | 502/1000000 [1:16:45<2548:03:26, 9.18s/it, lr=1e-5, step_loss=0.0448]
Steps: 0%| | 503/1000000 [1:16:51<2274:03:07, 8.19s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [503], local_loss=0.03873957693576813, train_loss=0.04619164764881134, time_cost=1.2548720836639404
+
Steps: 0%| | 503/1000000 [1:16:51<2274:03:07, 8.19s/it, lr=1e-5, step_loss=0.0387]
Steps: 0%| | 504/1000000 [1:17:01<2404:04:19, 8.66s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [504], local_loss=0.03960588946938515, train_loss=0.047800227999687195, time_cost=3.3368849754333496
+
Steps: 0%| | 504/1000000 [1:17:01<2404:04:19, 8.66s/it, lr=1e-5, step_loss=0.0396]
Steps: 0%| | 505/1000000 [1:17:07<2205:31:50, 7.94s/it, lr=1e-5, step_loss=0.0396][RANK-0]: Step: [505], local_loss=0.06722521781921387, train_loss=0.10418723523616791, time_cost=1.7540991306304932
+
Steps: 0%| | 505/1000000 [1:17:07<2205:31:50, 7.94s/it, lr=1e-5, step_loss=0.0672]
Steps: 0%| | 506/1000000 [1:17:18<2468:49:16, 8.89s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [506], local_loss=0.05526479333639145, train_loss=0.0634552612900734, time_cost=2.228606939315796
+
Steps: 0%| | 506/1000000 [1:17:18<2468:49:16, 8.89s/it, lr=1e-5, step_loss=0.0553]
Steps: 0%| | 507/1000000 [1:17:23<2096:37:23, 7.55s/it, lr=1e-5, step_loss=0.0553][RANK-0]: Step: [507], local_loss=0.05263686180114746, train_loss=0.10241936147212982, time_cost=1.4925196170806885
+
Steps: 0%| | 507/1000000 [1:17:23<2096:37:23, 7.55s/it, lr=1e-5, step_loss=0.0526]
Steps: 0%| | 508/1000000 [1:17:30<2071:44:42, 7.46s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [508], local_loss=0.044529277831315994, train_loss=15.488786697387695, time_cost=2.6177573204040527
+
Steps: 0%| | 508/1000000 [1:17:30<2071:44:42, 7.46s/it, lr=1e-5, step_loss=0.0445]
Steps: 0%| | 509/1000000 [1:17:35<1869:24:14, 6.73s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [509], local_loss=0.03990224748849869, train_loss=0.059803783893585205, time_cost=2.225464105606079
+
Steps: 0%| | 509/1000000 [1:17:35<1869:24:14, 6.73s/it, lr=1e-5, step_loss=0.0399]
Steps: 0%| | 510/1000000 [1:17:40<1751:49:19, 6.31s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [510], local_loss=0.05267179012298584, train_loss=0.08396557718515396, time_cost=1.3152658939361572
+
Steps: 0%| | 510/1000000 [1:17:40<1751:49:19, 6.31s/it, lr=1e-5, step_loss=0.0527]
Steps: 0%| | 511/1000000 [1:17:48<1839:40:42, 6.63s/it, lr=1e-5, step_loss=0.0527][RANK-0]: Step: [511], local_loss=0.39823752641677856, train_loss=22.071191787719727, time_cost=1.9236321449279785
+
Steps: 0%| | 511/1000000 [1:17:48<1839:40:42, 6.63s/it, lr=1e-5, step_loss=0.398]
Steps: 0%| | 512/1000000 [1:17:54<1823:31:05, 6.57s/it, lr=1e-5, step_loss=0.398][RANK-0]: Step: [512], local_loss=0.03108217567205429, train_loss=0.05124820023775101, time_cost=2.6996328830718994
+
Steps: 0%| | 512/1000000 [1:17:54<1823:31:05, 6.57s/it, lr=1e-5, step_loss=0.0311]
Steps: 0%| | 513/1000000 [1:17:59<1706:58:31, 6.15s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [513], local_loss=0.06577461212873459, train_loss=0.060486748814582825, time_cost=1.271449327468872
+
Steps: 0%| | 513/1000000 [1:17:59<1706:58:31, 6.15s/it, lr=1e-5, step_loss=0.0658]
Steps: 0%| | 514/1000000 [1:18:09<2025:21:04, 7.30s/it, lr=1e-5, step_loss=0.0658][RANK-0]: Step: [514], local_loss=0.29219600558280945, train_loss=0.22161859273910522, time_cost=4.208409547805786
+
Steps: 0%| | 514/1000000 [1:18:09<2025:21:04, 7.30s/it, lr=1e-5, step_loss=0.292]
Steps: 0%| | 515/1000000 [1:18:14<1839:30:53, 6.63s/it, lr=1e-5, step_loss=0.292][RANK-0]: Step: [515], local_loss=0.06704089790582657, train_loss=0.08358362317085266, time_cost=2.1423556804656982
+
Steps: 0%| | 515/1000000 [1:18:14<1839:30:53, 6.63s/it, lr=1e-5, step_loss=0.067]
Steps: 0%| | 516/1000000 [1:18:18<1640:33:22, 5.91s/it, lr=1e-5, step_loss=0.067][RANK-0]: Step: [516], local_loss=0.03050476871430874, train_loss=0.04966479167342186, time_cost=1.3555238246917725
+
Steps: 0%| | 516/1000000 [1:18:18<1640:33:22, 5.91s/it, lr=1e-5, step_loss=0.0305]
Steps: 0%| | 517/1000000 [1:18:24<1643:10:35, 5.92s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [517], local_loss=0.037868302315473557, train_loss=0.04861939698457718, time_cost=1.760089635848999
+
Steps: 0%| | 517/1000000 [1:18:24<1643:10:35, 5.92s/it, lr=1e-5, step_loss=0.0379]
Steps: 0%| | 518/1000000 [1:18:34<1955:41:05, 7.04s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [518], local_loss=0.07004161179065704, train_loss=0.09302075952291489, time_cost=2.7169957160949707
+
Steps: 0%| | 518/1000000 [1:18:34<1955:41:05, 7.04s/it, lr=1e-5, step_loss=0.07]
Steps: 0%| | 519/1000000 [1:18:43<2151:21:59, 7.75s/it, lr=1e-5, step_loss=0.07][RANK-0]: Step: [519], local_loss=0.08147061616182327, train_loss=0.09358057379722595, time_cost=3.2625999450683594
+
Steps: 0%| | 519/1000000 [1:18:43<2151:21:59, 7.75s/it, lr=1e-5, step_loss=0.0815]
Steps: 0%| | 520/1000000 [1:18:50<2015:58:10, 7.26s/it, lr=1e-5, step_loss=0.0815][RANK-0]: Step: [520], local_loss=0.035654034465551376, train_loss=0.05445978417992592, time_cost=1.3424904346466064
+
Steps: 0%| | 520/1000000 [1:18:50<2015:58:10, 7.26s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 521/1000000 [1:18:57<2015:59:23, 7.26s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [521], local_loss=0.04316166788339615, train_loss=0.042229246348142624, time_cost=3.038323163986206
+
Steps: 0%| | 521/1000000 [1:18:57<2015:59:23, 7.26s/it, lr=1e-5, step_loss=0.0432]
Steps: 0%| | 522/1000000 [1:19:08<2312:44:14, 8.33s/it, lr=1e-5, step_loss=0.0432][RANK-0]: Step: [522], local_loss=0.058850813657045364, train_loss=0.06534803658723831, time_cost=5.981446027755737
+
Steps: 0%| | 522/1000000 [1:19:08<2312:44:14, 8.33s/it, lr=1e-5, step_loss=0.0589]
Steps: 0%| | 523/1000000 [1:19:15<2244:25:48, 8.08s/it, lr=1e-5, step_loss=0.0589][RANK-0]: Step: [523], local_loss=0.048218291252851486, train_loss=0.07560119032859802, time_cost=4.505565166473389
+
Steps: 0%| | 523/1000000 [1:19:15<2244:25:48, 8.08s/it, lr=1e-5, step_loss=0.0482]
Steps: 0%| | 524/1000000 [1:19:25<2355:56:35, 8.49s/it, lr=1e-5, step_loss=0.0482][RANK-0]: Step: [524], local_loss=0.05791350454092026, train_loss=0.05724814906716347, time_cost=3.2695136070251465
+
Steps: 0%| | 524/1000000 [1:19:25<2355:56:35, 8.49s/it, lr=1e-5, step_loss=0.0579]
Steps: 0%| | 525/1000000 [1:19:30<2084:08:55, 7.51s/it, lr=1e-5, step_loss=0.0579][RANK-0]: Step: [525], local_loss=0.04399963095784187, train_loss=0.10117001831531525, time_cost=3.887864589691162
+
Steps: 0%| | 525/1000000 [1:19:30<2084:08:55, 7.51s/it, lr=1e-5, step_loss=0.044]
Steps: 0%| | 526/1000000 [1:19:40<2354:18:23, 8.48s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [526], local_loss=0.10820183902978897, train_loss=0.10480787605047226, time_cost=1.4914302825927734
+
Steps: 0%| | 526/1000000 [1:19:41<2354:18:23, 8.48s/it, lr=1e-5, step_loss=0.108]
Steps: 0%| | 527/1000000 [1:19:54<2757:00:47, 9.93s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [527], local_loss=0.2486417591571808, train_loss=0.08999762684106827, time_cost=3.18172550201416
+
Steps: 0%| | 527/1000000 [1:19:54<2757:00:47, 9.93s/it, lr=1e-5, step_loss=0.249]
Steps: 0%| | 528/1000000 [1:20:06<2936:34:17, 10.58s/it, lr=1e-5, step_loss=0.249][RANK-0]: Step: [528], local_loss=0.04075845703482628, train_loss=0.07108543813228607, time_cost=5.8641746044158936
+
Steps: 0%| | 528/1000000 [1:20:06<2936:34:17, 10.58s/it, lr=1e-5, step_loss=0.0408]
Steps: 0%| | 529/1000000 [1:20:20<3260:10:29, 11.74s/it, lr=1e-5, step_loss=0.0408][RANK-0]: Step: [529], local_loss=0.03509321063756943, train_loss=0.04637772962450981, time_cost=6.631657600402832
+
Steps: 0%| | 529/1000000 [1:20:20<3260:10:29, 11.74s/it, lr=1e-5, step_loss=0.0351]
Steps: 0%| | 530/1000000 [1:20:33<3356:00:40, 12.09s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [530], local_loss=0.042773712426424026, train_loss=0.09190003573894501, time_cost=2.212905168533325
+
Steps: 0%| | 530/1000000 [1:20:33<3356:00:40, 12.09s/it, lr=1e-5, step_loss=0.0428]
Steps: 0%| | 531/1000000 [1:20:38<2781:51:51, 10.02s/it, lr=1e-5, step_loss=0.0428][RANK-0]: Step: [531], local_loss=0.06759993731975555, train_loss=0.08160269260406494, time_cost=1.3097853660583496
+
Steps: 0%| | 531/1000000 [1:20:38<2781:51:51, 10.02s/it, lr=1e-5, step_loss=0.0676]
Steps: 0%| | 532/1000000 [1:20:50<2884:15:50, 10.39s/it, lr=1e-5, step_loss=0.0676][RANK-0]: Step: [532], local_loss=0.05064211040735245, train_loss=0.051497988402843475, time_cost=3.1224281787872314
+
Steps: 0%| | 532/1000000 [1:20:50<2884:15:50, 10.39s/it, lr=1e-5, step_loss=0.0506]
Steps: 0%| | 533/1000000 [1:20:54<2383:52:52, 8.59s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [533], local_loss=0.03162599354982376, train_loss=0.059943657368421555, time_cost=1.3552253246307373
+
Steps: 0%| | 533/1000000 [1:20:54<2383:52:52, 8.59s/it, lr=1e-5, step_loss=0.0316]
Steps: 0%| | 534/1000000 [1:21:07<2760:02:28, 9.94s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [534], local_loss=0.1353973150253296, train_loss=0.07074158638715744, time_cost=3.954113483428955
+
Steps: 0%| | 534/1000000 [1:21:07<2760:02:28, 9.94s/it, lr=1e-5, step_loss=0.135]
Steps: 0%| | 535/1000000 [1:21:19<2899:17:56, 10.44s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [535], local_loss=0.04847582429647446, train_loss=0.04098062962293625, time_cost=4.781000375747681
+
Steps: 0%| | 535/1000000 [1:21:19<2899:17:56, 10.44s/it, lr=1e-5, step_loss=0.0485]
Steps: 0%| | 536/1000000 [1:21:29<2873:37:30, 10.35s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [536], local_loss=0.20017294585704803, train_loss=0.06925000995397568, time_cost=4.672370195388794
+
Steps: 0%| | 536/1000000 [1:21:29<2873:37:30, 10.35s/it, lr=1e-5, step_loss=0.2]
Steps: 0%| | 537/1000000 [1:21:34<2445:50:41, 8.81s/it, lr=1e-5, step_loss=0.2][RANK-0]: Step: [537], local_loss=0.025323286652565002, train_loss=0.044601134955883026, time_cost=2.034911870956421
+
Steps: 0%| | 537/1000000 [1:21:34<2445:50:41, 8.81s/it, lr=1e-5, step_loss=0.0253]
Steps: 0%| | 538/1000000 [1:21:41<2281:43:55, 8.22s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [538], local_loss=0.028572913259267807, train_loss=0.09598210453987122, time_cost=2.4822428226470947
+
Steps: 0%| | 538/1000000 [1:21:41<2281:43:55, 8.22s/it, lr=1e-5, step_loss=0.0286]
Steps: 0%| | 539/1000000 [1:21:45<1965:45:07, 7.08s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [539], local_loss=0.034131020307540894, train_loss=0.08883309364318848, time_cost=1.586836576461792
+
Steps: 0%| | 539/1000000 [1:21:45<1965:45:07, 7.08s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 540/1000000 [1:21:57<2301:50:57, 8.29s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [540], local_loss=0.1542641818523407, train_loss=0.08964300900697708, time_cost=2.555194854736328
+
Steps: 0%| | 540/1000000 [1:21:57<2301:50:57, 8.29s/it, lr=1e-5, step_loss=0.154]
Steps: 0%| | 541/1000000 [1:22:05<2325:54:02, 8.38s/it, lr=1e-5, step_loss=0.154][RANK-0]: Step: [541], local_loss=0.06669456511735916, train_loss=0.07386898994445801, time_cost=2.3227202892303467
+
Steps: 0%| | 541/1000000 [1:22:05<2325:54:02, 8.38s/it, lr=1e-5, step_loss=0.0667]
Steps: 0%| | 542/1000000 [1:22:14<2363:34:21, 8.51s/it, lr=1e-5, step_loss=0.0667][RANK-0]: Step: [542], local_loss=0.03559081256389618, train_loss=0.05070990324020386, time_cost=2.5061564445495605
+
Steps: 0%| | 542/1000000 [1:22:14<2363:34:21, 8.51s/it, lr=1e-5, step_loss=0.0356]
Steps: 0%| | 543/1000000 [1:22:19<2094:29:54, 7.54s/it, lr=1e-5, step_loss=0.0356][RANK-0]: Step: [543], local_loss=0.25136545300483704, train_loss=0.08306720852851868, time_cost=1.4705994129180908
+
Steps: 0%| | 543/1000000 [1:22:19<2094:29:54, 7.54s/it, lr=1e-5, step_loss=0.251]
Steps: 0%| | 544/1000000 [1:22:29<2266:56:58, 8.17s/it, lr=1e-5, step_loss=0.251][RANK-0]: Step: [544], local_loss=0.021319221705198288, train_loss=0.04914166033267975, time_cost=3.2846460342407227
+
Steps: 0%| | 544/1000000 [1:22:29<2266:56:58, 8.17s/it, lr=1e-5, step_loss=0.0213]
Steps: 0%| | 545/1000000 [1:22:34<2056:32:16, 7.41s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [545], local_loss=0.05663765221834183, train_loss=0.07294407486915588, time_cost=2.978755474090576
+
Steps: 0%| | 545/1000000 [1:22:34<2056:32:16, 7.41s/it, lr=1e-5, step_loss=0.0566]
Steps: 0%| | 546/1000000 [1:22:41<1953:38:01, 7.04s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [546], local_loss=0.03568854182958603, train_loss=0.0637902244925499, time_cost=2.6964480876922607
+
Steps: 0%| | 546/1000000 [1:22:41<1953:38:01, 7.04s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 547/1000000 [1:22:49<2059:22:15, 7.42s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [547], local_loss=0.07112699747085571, train_loss=0.1006602793931961, time_cost=2.39489483833313
+
Steps: 0%| | 547/1000000 [1:22:49<2059:22:15, 7.42s/it, lr=1e-5, step_loss=0.0711]
Steps: 0%| | 548/1000000 [1:22:58<2175:59:01, 7.84s/it, lr=1e-5, step_loss=0.0711][RANK-0]: Step: [548], local_loss=0.052237946540117264, train_loss=0.07462688535451889, time_cost=2.652597665786743
+
Steps: 0%| | 548/1000000 [1:22:58<2175:59:01, 7.84s/it, lr=1e-5, step_loss=0.0522]
Steps: 0%| | 549/1000000 [1:23:09<2445:34:02, 8.81s/it, lr=1e-5, step_loss=0.0522][RANK-0]: Step: [549], local_loss=0.04924687743186951, train_loss=0.09165656566619873, time_cost=9.3746497631073
+
Steps: 0%| | 549/1000000 [1:23:09<2445:34:02, 8.81s/it, lr=1e-5, step_loss=0.0492]
Steps: 0%| | 550/1000000 [1:23:14<2143:08:35, 7.72s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [550], local_loss=0.11886189132928848, train_loss=0.09637767821550369, time_cost=1.2402372360229492
+
Steps: 0%| | 550/1000000 [1:23:14<2143:08:35, 7.72s/it, lr=1e-5, step_loss=0.119]
Steps: 0%| | 551/1000000 [1:23:29<2709:36:43, 9.76s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [551], local_loss=0.04934016615152359, train_loss=0.06566552817821503, time_cost=5.149628162384033
+
Steps: 0%| | 551/1000000 [1:23:29<2709:36:43, 9.76s/it, lr=1e-5, step_loss=0.0493]
Steps: 0%| | 552/1000000 [1:23:40<2831:21:13, 10.20s/it, lr=1e-5, step_loss=0.0493][RANK-0]: Step: [552], local_loss=0.052304502576589584, train_loss=0.04751429706811905, time_cost=2.488442897796631
+
Steps: 0%| | 552/1000000 [1:23:40<2831:21:13, 10.20s/it, lr=1e-5, step_loss=0.0523]
Steps: 0%| | 553/1000000 [1:23:45<2420:59:11, 8.72s/it, lr=1e-5, step_loss=0.0523][RANK-0]: Step: [553], local_loss=0.022556480020284653, train_loss=0.05550596863031387, time_cost=2.446566581726074
+
Steps: 0%| | 553/1000000 [1:23:45<2420:59:11, 8.72s/it, lr=1e-5, step_loss=0.0226]
Steps: 0%| | 554/1000000 [1:23:52<2283:28:01, 8.23s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [554], local_loss=0.05149506777524948, train_loss=0.09837043285369873, time_cost=5.176401138305664
+
Steps: 0%| | 554/1000000 [1:23:52<2283:28:01, 8.23s/it, lr=1e-5, step_loss=0.0515]
Steps: 0%| | 555/1000000 [1:24:04<2555:33:03, 9.21s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [555], local_loss=0.08792111277580261, train_loss=0.17827250063419342, time_cost=1.5274062156677246
+
Steps: 0%| | 555/1000000 [1:24:04<2555:33:03, 9.21s/it, lr=1e-5, step_loss=0.0879]
Steps: 0%| | 556/1000000 [1:24:09<2222:25:25, 8.01s/it, lr=1e-5, step_loss=0.0879][RANK-0]: Step: [556], local_loss=0.04130193218588829, train_loss=0.049055248498916626, time_cost=2.3361289501190186
+
Steps: 0%| | 556/1000000 [1:24:09<2222:25:25, 8.01s/it, lr=1e-5, step_loss=0.0413]
Steps: 0%| | 557/1000000 [1:24:18<2295:17:15, 8.27s/it, lr=1e-5, step_loss=0.0413][RANK-0]: Step: [557], local_loss=0.04172787442803383, train_loss=5.892871856689453, time_cost=4.170129299163818
+
Steps: 0%| | 557/1000000 [1:24:18<2295:17:15, 8.27s/it, lr=1e-5, step_loss=0.0417]
Steps: 0%| | 558/1000000 [1:24:31<2691:42:48, 9.70s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [558], local_loss=0.01266446989029646, train_loss=0.06651850044727325, time_cost=5.027434587478638
+
Steps: 0%| | 558/1000000 [1:24:31<2691:42:48, 9.70s/it, lr=1e-5, step_loss=0.0127]
Steps: 0%| | 559/1000000 [1:24:37<2394:28:11, 8.62s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [559], local_loss=0.11124666035175323, train_loss=7.4133477210998535, time_cost=1.9030730724334717
+
Steps: 0%| | 559/1000000 [1:24:37<2394:28:11, 8.62s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 560/1000000 [1:24:52<2940:04:02, 10.59s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [560], local_loss=0.04649147391319275, train_loss=0.06038152053952217, time_cost=1.216033935546875
+
Steps: 0%| | 560/1000000 [1:24:52<2940:04:02, 10.59s/it, lr=1e-5, step_loss=0.0465]
Steps: 0%| | 561/1000000 [1:25:00<2750:28:15, 9.91s/it, lr=1e-5, step_loss=0.0465][RANK-0]: Step: [561], local_loss=0.04060393571853638, train_loss=0.09333455562591553, time_cost=3.072545289993286
+
Steps: 0%| | 561/1000000 [1:25:00<2750:28:15, 9.91s/it, lr=1e-5, step_loss=0.0406]
Steps: 0%| | 562/1000000 [1:25:08<2523:41:08, 9.09s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [562], local_loss=0.03565688803792, train_loss=0.061643682420253754, time_cost=2.773815155029297
+
Steps: 0%| | 562/1000000 [1:25:08<2523:41:08, 9.09s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 563/1000000 [1:25:18<2634:40:04, 9.49s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [563], local_loss=0.032180849462747574, train_loss=0.05264997482299805, time_cost=4.545869827270508
+
Steps: 0%| | 563/1000000 [1:25:18<2634:40:04, 9.49s/it, lr=1e-5, step_loss=0.0322]
Steps: 0%| | 564/1000000 [1:25:29<2751:44:25, 9.91s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [564], local_loss=0.46227043867111206, train_loss=0.1319294571876526, time_cost=3.4070568084716797
+
Steps: 0%| | 564/1000000 [1:25:29<2751:44:25, 9.91s/it, lr=1e-5, step_loss=0.462]
Steps: 0%| | 565/1000000 [1:25:36<2550:28:04, 9.19s/it, lr=1e-5, step_loss=0.462][RANK-0]: Step: [565], local_loss=0.03667028620839119, train_loss=0.050207093358039856, time_cost=2.9082300662994385
+
Steps: 0%| | 565/1000000 [1:25:36<2550:28:04, 9.19s/it, lr=1e-5, step_loss=0.0367]
Steps: 0%| | 566/1000000 [1:25:43<2357:06:46, 8.49s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [566], local_loss=0.03232044726610184, train_loss=0.07192864269018173, time_cost=2.3951609134674072
+
Steps: 0%| | 566/1000000 [1:25:43<2357:06:46, 8.49s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 567/1000000 [1:25:56<2730:41:13, 9.84s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [567], local_loss=0.04542586952447891, train_loss=0.06977890431880951, time_cost=5.178200721740723
+
Steps: 0%| | 567/1000000 [1:25:56<2730:41:13, 9.84s/it, lr=1e-5, step_loss=0.0454]
Steps: 0%| | 568/1000000 [1:26:09<2943:06:19, 10.60s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [568], local_loss=0.051300764083862305, train_loss=0.04836132004857063, time_cost=4.01718282699585
+
Steps: 0%| | 568/1000000 [1:26:09<2943:06:19, 10.60s/it, lr=1e-5, step_loss=0.0513]
Steps: 0%| | 569/1000000 [1:26:20<2976:59:59, 10.72s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [569], local_loss=0.02014920301735401, train_loss=0.077211394906044, time_cost=1.243100643157959
+
Steps: 0%| | 569/1000000 [1:26:20<2976:59:59, 10.72s/it, lr=1e-5, step_loss=0.0201]
Steps: 0%| | 570/1000000 [1:26:28<2819:20:46, 10.16s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [570], local_loss=0.046697743237018585, train_loss=0.06365346908569336, time_cost=2.8410165309906006
+
Steps: 0%| | 570/1000000 [1:26:28<2819:20:46, 10.16s/it, lr=1e-5, step_loss=0.0467]
Steps: 0%| | 571/1000000 [1:26:42<3075:18:28, 11.08s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [571], local_loss=0.038311149924993515, train_loss=0.04940839856863022, time_cost=1.233513593673706
+
Steps: 0%| | 571/1000000 [1:26:42<3075:18:28, 11.08s/it, lr=1e-5, step_loss=0.0383]
Steps: 0%| | 572/1000000 [1:26:54<3161:35:14, 11.39s/it, lr=1e-5, step_loss=0.0383][RANK-0]: Step: [572], local_loss=0.11739107966423035, train_loss=0.06287941336631775, time_cost=8.390621423721313
+
Steps: 0%| | 572/1000000 [1:26:54<3161:35:14, 11.39s/it, lr=1e-5, step_loss=0.117]
Steps: 0%| | 573/1000000 [1:27:05<3127:49:15, 11.27s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [573], local_loss=0.061407968401908875, train_loss=0.06983983516693115, time_cost=1.2413721084594727
+
Steps: 0%| | 573/1000000 [1:27:05<3127:49:15, 11.27s/it, lr=1e-5, step_loss=0.0614]
Steps: 0%| | 574/1000000 [1:27:14<2951:50:45, 10.63s/it, lr=1e-5, step_loss=0.0614][RANK-0]: Step: [574], local_loss=0.1252172291278839, train_loss=0.07218024134635925, time_cost=1.9379444122314453
+
Steps: 0%| | 574/1000000 [1:27:14<2951:50:45, 10.63s/it, lr=1e-5, step_loss=0.125]
Steps: 0%| | 575/1000000 [1:27:22<2741:44:45, 9.88s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [575], local_loss=0.03379867598414421, train_loss=0.051379039883613586, time_cost=2.9862332344055176
+
Steps: 0%| | 575/1000000 [1:27:22<2741:44:45, 9.88s/it, lr=1e-5, step_loss=0.0338]
Steps: 0%| | 576/1000000 [1:27:28<2405:16:43, 8.66s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [576], local_loss=0.06591330468654633, train_loss=0.07750909775495529, time_cost=1.2089202404022217
+
Steps: 0%| | 576/1000000 [1:27:28<2405:16:43, 8.66s/it, lr=1e-5, step_loss=0.0659]
Steps: 0%| | 577/1000000 [1:27:41<2818:22:06, 10.15s/it, lr=1e-5, step_loss=0.0659][RANK-0]: Step: [577], local_loss=0.037094224244356155, train_loss=0.12667882442474365, time_cost=5.94764518737793
+
Steps: 0%| | 577/1000000 [1:27:41<2818:22:06, 10.15s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 578/1000000 [1:27:55<3123:25:18, 11.25s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [578], local_loss=0.04916762933135033, train_loss=0.06851820647716522, time_cost=3.714395523071289
+
Steps: 0%| | 578/1000000 [1:27:55<3123:25:18, 11.25s/it, lr=1e-5, step_loss=0.0492]
Steps: 0%| | 579/1000000 [1:28:00<2548:53:57, 9.18s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [579], local_loss=0.11796212941408157, train_loss=0.056860245764255524, time_cost=1.3497190475463867
+
Steps: 0%| | 579/1000000 [1:28:00<2548:53:57, 9.18s/it, lr=1e-5, step_loss=0.118]
Steps: 0%| | 580/1000000 [1:28:08<2491:13:33, 8.97s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [580], local_loss=0.03149043768644333, train_loss=0.048284925520420074, time_cost=3.4956955909729004
+
Steps: 0%| | 580/1000000 [1:28:08<2491:13:33, 8.97s/it, lr=1e-5, step_loss=0.0315]
Steps: 0%| | 581/1000000 [1:28:20<2703:34:28, 9.74s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [581], local_loss=0.0744340643286705, train_loss=0.06433647871017456, time_cost=3.6611945629119873
+
Steps: 0%| | 581/1000000 [1:28:20<2703:34:28, 9.74s/it, lr=1e-5, step_loss=0.0744]
Steps: 0%| | 582/1000000 [1:28:27<2522:39:33, 9.09s/it, lr=1e-5, step_loss=0.0744][RANK-0]: Step: [582], local_loss=0.05908610671758652, train_loss=0.04983037710189819, time_cost=1.351224422454834
+
Steps: 0%| | 582/1000000 [1:28:27<2522:39:33, 9.09s/it, lr=1e-5, step_loss=0.0591]
Steps: 0%| | 583/1000000 [1:28:40<2866:08:23, 10.32s/it, lr=1e-5, step_loss=0.0591][RANK-0]: Step: [583], local_loss=0.1153026595711708, train_loss=0.050413720309734344, time_cost=3.6667699813842773
+
Steps: 0%| | 583/1000000 [1:28:40<2866:08:23, 10.32s/it, lr=1e-5, step_loss=0.115]
Steps: 0%| | 584/1000000 [1:28:45<2358:26:16, 8.50s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [584], local_loss=0.03617687523365021, train_loss=0.0424809455871582, time_cost=1.330435037612915
+
Steps: 0%| | 584/1000000 [1:28:45<2358:26:16, 8.50s/it, lr=1e-5, step_loss=0.0362]
Steps: 0%| | 585/1000000 [1:28:59<2837:14:38, 10.22s/it, lr=1e-5, step_loss=0.0362][RANK-0]: Step: [585], local_loss=0.06717891991138458, train_loss=0.166655495762825, time_cost=1.7310724258422852
+
Steps: 0%| | 585/1000000 [1:28:59<2837:14:38, 10.22s/it, lr=1e-5, step_loss=0.0672]
Steps: 0%| | 586/1000000 [1:29:08<2766:38:21, 9.97s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [586], local_loss=0.04375828430056572, train_loss=0.03729511797428131, time_cost=1.2441234588623047
+
Steps: 0%| | 586/1000000 [1:29:08<2766:38:21, 9.97s/it, lr=1e-5, step_loss=0.0438]
Steps: 0%| | 587/1000000 [1:29:15<2527:07:59, 9.10s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [587], local_loss=0.060315292328596115, train_loss=0.055111512541770935, time_cost=2.1273386478424072
+
Steps: 0%| | 587/1000000 [1:29:15<2527:07:59, 9.10s/it, lr=1e-5, step_loss=0.0603]
Steps: 0%| | 588/1000000 [1:29:27<2772:28:54, 9.99s/it, lr=1e-5, step_loss=0.0603][RANK-0]: Step: [588], local_loss=1.0092436075210571, train_loss=0.16323316097259521, time_cost=4.226607322692871
+
Steps: 0%| | 588/1000000 [1:29:27<2772:28:54, 9.99s/it, lr=1e-5, step_loss=1.01]
Steps: 0%| | 589/1000000 [1:29:36<2639:53:42, 9.51s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [589], local_loss=0.0554155632853508, train_loss=0.057275354862213135, time_cost=2.957519292831421
+
Steps: 0%| | 589/1000000 [1:29:36<2639:53:42, 9.51s/it, lr=1e-5, step_loss=0.0554]
Steps: 0%| | 590/1000000 [1:29:52<3172:03:40, 11.43s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [590], local_loss=0.03753235936164856, train_loss=0.0490211546421051, time_cost=6.7012364864349365
+
Steps: 0%| | 590/1000000 [1:29:52<3172:03:40, 11.43s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 591/1000000 [1:30:02<3111:43:35, 11.21s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [591], local_loss=0.042103711515665054, train_loss=0.07801841199398041, time_cost=5.6672444343566895
+
Steps: 0%| | 591/1000000 [1:30:02<3111:43:35, 11.21s/it, lr=1e-5, step_loss=0.0421]
Steps: 0%| | 592/1000000 [1:30:15<3265:56:40, 11.76s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [592], local_loss=0.036135174334049225, train_loss=47.49790573120117, time_cost=6.284798622131348
+
Steps: 0%| | 592/1000000 [1:30:15<3265:56:40, 11.76s/it, lr=1e-5, step_loss=0.0361]
Steps: 0%| | 593/1000000 [1:30:21<2741:09:48, 9.87s/it, lr=1e-5, step_loss=0.0361][RANK-0]: Step: [593], local_loss=0.07933811098337173, train_loss=0.08044268935918808, time_cost=3.008265256881714
+
Steps: 0%| | 593/1000000 [1:30:21<2741:09:48, 9.87s/it, lr=1e-5, step_loss=0.0793]
Steps: 0%| | 594/1000000 [1:30:28<2497:48:57, 9.00s/it, lr=1e-5, step_loss=0.0793][RANK-0]: Step: [594], local_loss=0.04553622007369995, train_loss=0.04954134672880173, time_cost=2.145394802093506
+
Steps: 0%| | 594/1000000 [1:30:28<2497:48:57, 9.00s/it, lr=1e-5, step_loss=0.0455]
Steps: 0%| | 595/1000000 [1:30:39<2641:19:35, 9.51s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [595], local_loss=0.05715496838092804, train_loss=0.06290490180253983, time_cost=4.262896299362183
+
Steps: 0%| | 595/1000000 [1:30:39<2641:19:35, 9.51s/it, lr=1e-5, step_loss=0.0572]
Steps: 0%| | 596/1000000 [1:30:44<2338:34:01, 8.42s/it, lr=1e-5, step_loss=0.0572][RANK-0]: Step: [596], local_loss=0.5125380158424377, train_loss=27.337308883666992, time_cost=1.3421037197113037
+
Steps: 0%| | 596/1000000 [1:30:44<2338:34:01, 8.42s/it, lr=1e-5, step_loss=0.513]
Steps: 0%| | 597/1000000 [1:30:53<2364:47:15, 8.52s/it, lr=1e-5, step_loss=0.513][RANK-0]: Step: [597], local_loss=0.04274682328104973, train_loss=0.05041109398007393, time_cost=2.7761993408203125
+
Steps: 0%| | 597/1000000 [1:30:53<2364:47:15, 8.52s/it, lr=1e-5, step_loss=0.0427]
Steps: 0%| | 598/1000000 [1:31:01<2295:33:37, 8.27s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [598], local_loss=0.03346462920308113, train_loss=0.07523652911186218, time_cost=1.241290807723999
+
Steps: 0%| | 598/1000000 [1:31:01<2295:33:37, 8.27s/it, lr=1e-5, step_loss=0.0335]
Steps: 0%| | 599/1000000 [1:31:10<2324:58:56, 8.37s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [599], local_loss=0.04324107989668846, train_loss=0.06545715034008026, time_cost=1.9662823677062988
+
Steps: 0%| | 599/1000000 [1:31:10<2324:58:56, 8.37s/it, lr=1e-5, step_loss=0.0432]
Steps: 0%| | 600/1000000 [1:31:19<2393:41:26, 8.62s/it, lr=1e-5, step_loss=0.0432][RANK-0]: Step: [600], local_loss=0.04577770084142685, train_loss=0.05503690242767334, time_cost=1.4294447898864746
+
Steps: 0%| | 600/1000000 [1:31:19<2393:41:26, 8.62s/it, lr=1e-5, step_loss=0.0458]
Steps: 0%| | 601/1000000 [1:31:25<2178:34:06, 7.85s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [601], local_loss=0.13481713831424713, train_loss=0.09117432683706284, time_cost=1.561241865158081
+
Steps: 0%| | 601/1000000 [1:31:25<2178:34:06, 7.85s/it, lr=1e-5, step_loss=0.135]
Steps: 0%| | 602/1000000 [1:31:30<1988:06:57, 7.16s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [602], local_loss=0.09838347882032394, train_loss=0.06354660540819168, time_cost=1.3876771926879883
+
Steps: 0%| | 602/1000000 [1:31:30<1988:06:57, 7.16s/it, lr=1e-5, step_loss=0.0984]
Steps: 0%| | 603/1000000 [1:31:43<2460:54:19, 8.86s/it, lr=1e-5, step_loss=0.0984][RANK-0]: Step: [603], local_loss=0.060647740960121155, train_loss=0.2454787790775299, time_cost=3.8015027046203613
+
Steps: 0%| | 603/1000000 [1:31:43<2460:54:19, 8.86s/it, lr=1e-5, step_loss=0.0606]
Steps: 0%| | 604/1000000 [1:31:54<2613:22:21, 9.41s/it, lr=1e-5, step_loss=0.0606][RANK-0]: Step: [604], local_loss=0.020789390429854393, train_loss=0.09073051065206528, time_cost=3.0762369632720947
+
Steps: 0%| | 604/1000000 [1:31:54<2613:22:21, 9.41s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 605/1000000 [1:31:58<2193:32:29, 7.90s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [605], local_loss=0.03658347576856613, train_loss=23.861845016479492, time_cost=1.8220067024230957
+
Steps: 0%| | 605/1000000 [1:31:58<2193:32:29, 7.90s/it, lr=1e-5, step_loss=0.0366]
Steps: 0%| | 606/1000000 [1:32:03<1927:42:00, 6.94s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [606], local_loss=0.055775873363018036, train_loss=0.08995194733142853, time_cost=2.2342357635498047
+
Steps: 0%| | 606/1000000 [1:32:03<1927:42:00, 6.94s/it, lr=1e-5, step_loss=0.0558]
Steps: 0%| | 607/1000000 [1:32:10<1947:06:56, 7.01s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [607], local_loss=0.14709749817848206, train_loss=0.08263660222291946, time_cost=5.618217706680298
+
Steps: 0%| | 607/1000000 [1:32:10<1947:06:56, 7.01s/it, lr=1e-5, step_loss=0.147]
Steps: 0%| | 608/1000000 [1:32:17<1952:24:30, 7.03s/it, lr=1e-5, step_loss=0.147][RANK-0]: Step: [608], local_loss=0.05153220519423485, train_loss=0.07994572073221207, time_cost=1.2590522766113281
+
Steps: 0%| | 608/1000000 [1:32:17<1952:24:30, 7.03s/it, lr=1e-5, step_loss=0.0515]
Steps: 0%| | 609/1000000 [1:32:22<1783:04:55, 6.42s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [609], local_loss=0.03065764717757702, train_loss=0.059888534247875214, time_cost=1.9995696544647217
+
Steps: 0%| | 609/1000000 [1:32:22<1783:04:55, 6.42s/it, lr=1e-5, step_loss=0.0307]
Steps: 0%| | 610/1000000 [1:32:28<1749:04:31, 6.30s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [610], local_loss=0.11145196855068207, train_loss=0.05307655781507492, time_cost=1.646902322769165
+
Steps: 0%| | 610/1000000 [1:32:28<1749:04:31, 6.30s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 611/1000000 [1:32:37<1999:24:43, 7.20s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [611], local_loss=0.023060666397213936, train_loss=0.06994831562042236, time_cost=3.137732744216919
+
Steps: 0%| | 611/1000000 [1:32:37<1999:24:43, 7.20s/it, lr=1e-5, step_loss=0.0231]
Steps: 0%| | 612/1000000 [1:32:42<1807:44:02, 6.51s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [612], local_loss=0.061196163296699524, train_loss=0.06562411040067673, time_cost=2.0213520526885986
+
Steps: 0%| | 612/1000000 [1:32:42<1807:44:02, 6.51s/it, lr=1e-5, step_loss=0.0612]
Steps: 0%| | 613/1000000 [1:32:52<2051:39:38, 7.39s/it, lr=1e-5, step_loss=0.0612][RANK-0]: Step: [613], local_loss=0.0325089730322361, train_loss=0.05028192698955536, time_cost=3.904438018798828
+
Steps: 0%| | 613/1000000 [1:32:52<2051:39:38, 7.39s/it, lr=1e-5, step_loss=0.0325]
Steps: 0%| | 614/1000000 [1:33:02<2249:11:43, 8.10s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [614], local_loss=0.04464409500360489, train_loss=0.1520290970802307, time_cost=1.2492244243621826
+
Steps: 0%| | 614/1000000 [1:33:02<2249:11:43, 8.10s/it, lr=1e-5, step_loss=0.0446]
Steps: 0%| | 615/1000000 [1:33:07<1986:09:16, 7.15s/it, lr=1e-5, step_loss=0.0446][RANK-0]: Step: [615], local_loss=0.03924626111984253, train_loss=0.19031402468681335, time_cost=1.9889867305755615
+
Steps: 0%| | 615/1000000 [1:33:07<1986:09:16, 7.15s/it, lr=1e-5, step_loss=0.0392]
Steps: 0%| | 616/1000000 [1:33:17<2286:04:00, 8.23s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [616], local_loss=0.0647648349404335, train_loss=0.05632665008306503, time_cost=1.2203259468078613
+
Steps: 0%| | 616/1000000 [1:33:17<2286:04:00, 8.23s/it, lr=1e-5, step_loss=0.0648]
Steps: 0%| | 617/1000000 [1:33:28<2514:51:34, 9.06s/it, lr=1e-5, step_loss=0.0648][RANK-0]: Step: [617], local_loss=0.09271053224802017, train_loss=0.06660838425159454, time_cost=8.606094598770142
+
Steps: 0%| | 617/1000000 [1:33:28<2514:51:34, 9.06s/it, lr=1e-5, step_loss=0.0927]
Steps: 0%| | 618/1000000 [1:33:39<2684:50:43, 9.67s/it, lr=1e-5, step_loss=0.0927][RANK-0]: Step: [618], local_loss=0.07467193901538849, train_loss=0.06900738924741745, time_cost=2.4596357345581055
+
Steps: 0%| | 618/1000000 [1:33:39<2684:50:43, 9.67s/it, lr=1e-5, step_loss=0.0747]
Steps: 0%| | 619/1000000 [1:33:47<2499:35:10, 9.00s/it, lr=1e-5, step_loss=0.0747][RANK-0]: Step: [619], local_loss=0.03982891887426376, train_loss=0.03745623677968979, time_cost=3.123276472091675
+
Steps: 0%| | 619/1000000 [1:33:47<2499:35:10, 9.00s/it, lr=1e-5, step_loss=0.0398]
Steps: 0%| | 620/1000000 [1:33:54<2370:06:21, 8.54s/it, lr=1e-5, step_loss=0.0398][RANK-0]: Step: [620], local_loss=0.022832654416561127, train_loss=0.04785531386733055, time_cost=3.0938076972961426
+
Steps: 0%| | 620/1000000 [1:33:54<2370:06:21, 8.54s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 621/1000000 [1:33:59<2083:51:51, 7.51s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [621], local_loss=0.0696793869137764, train_loss=0.04846528172492981, time_cost=1.2301702499389648
+
Steps: 0%| | 621/1000000 [1:33:59<2083:51:51, 7.51s/it, lr=1e-5, step_loss=0.0697]
Steps: 0%| | 622/1000000 [1:34:09<2243:59:34, 8.08s/it, lr=1e-5, step_loss=0.0697][RANK-0]: Step: [622], local_loss=0.043576646596193314, train_loss=0.16200479865074158, time_cost=3.5562057495117188
+
Steps: 0%| | 622/1000000 [1:34:09<2243:59:34, 8.08s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 623/1000000 [1:34:19<2456:16:57, 8.85s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [623], local_loss=0.1475333571434021, train_loss=0.12371428310871124, time_cost=6.8111090660095215
+
Steps: 0%| | 623/1000000 [1:34:19<2456:16:57, 8.85s/it, lr=1e-5, step_loss=0.148]
Steps: 0%| | 624/1000000 [1:34:28<2449:08:11, 8.82s/it, lr=1e-5, step_loss=0.148][RANK-0]: Step: [624], local_loss=0.050397805869579315, train_loss=0.06768979132175446, time_cost=1.2080919742584229
+
Steps: 0%| | 624/1000000 [1:34:28<2449:08:11, 8.82s/it, lr=1e-5, step_loss=0.0504]
Steps: 0%| | 625/1000000 [1:34:34<2181:22:07, 7.86s/it, lr=1e-5, step_loss=0.0504][RANK-0]: Step: [625], local_loss=0.056750595569610596, train_loss=0.046222999691963196, time_cost=4.6835033893585205
+
Steps: 0%| | 625/1000000 [1:34:34<2181:22:07, 7.86s/it, lr=1e-5, step_loss=0.0568]
Steps: 0%| | 626/1000000 [1:34:41<2140:34:06, 7.71s/it, lr=1e-5, step_loss=0.0568][RANK-0]: Step: [626], local_loss=0.23331736028194427, train_loss=0.17942282557487488, time_cost=3.1432747840881348
+
Steps: 0%| | 626/1000000 [1:34:41<2140:34:06, 7.71s/it, lr=1e-5, step_loss=0.233]
Steps: 0%| | 627/1000000 [1:34:59<2952:28:09, 10.64s/it, lr=1e-5, step_loss=0.233][RANK-0]: Step: [627], local_loss=0.02247084118425846, train_loss=0.05918916314840317, time_cost=7.0384650230407715
+
Steps: 0%| | 627/1000000 [1:34:59<2952:28:09, 10.64s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 628/1000000 [1:35:12<3170:06:58, 11.42s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [628], local_loss=0.04739696905016899, train_loss=0.05420199781656265, time_cost=5.6595985889434814
+
Steps: 0%| | 628/1000000 [1:35:12<3170:06:58, 11.42s/it, lr=1e-5, step_loss=0.0474]
Steps: 0%| | 629/1000000 [1:35:26<3361:09:56, 12.11s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [629], local_loss=0.03683288395404816, train_loss=0.052920810878276825, time_cost=2.033789873123169
+
Steps: 0%| | 629/1000000 [1:35:26<3361:09:56, 12.11s/it, lr=1e-5, step_loss=0.0368]
Steps: 0%| | 630/1000000 [1:35:31<2784:39:17, 10.03s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [630], local_loss=0.05506731942296028, train_loss=0.08044029772281647, time_cost=2.2626049518585205
+
Steps: 0%| | 630/1000000 [1:35:31<2784:39:17, 10.03s/it, lr=1e-5, step_loss=0.0551]
Steps: 0%| | 631/1000000 [1:35:43<2980:32:31, 10.74s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [631], local_loss=0.03371209651231766, train_loss=0.08194112777709961, time_cost=3.048520088195801
+
Steps: 0%| | 631/1000000 [1:35:43<2980:32:31, 10.74s/it, lr=1e-5, step_loss=0.0337]
Steps: 0%| | 632/1000000 [1:35:54<3007:14:30, 10.83s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [632], local_loss=0.03432386368513107, train_loss=0.058867018669843674, time_cost=1.2261161804199219
+
Steps: 0%| | 632/1000000 [1:35:54<3007:14:30, 10.83s/it, lr=1e-5, step_loss=0.0343]
Steps: 0%| | 633/1000000 [1:36:01<2678:44:32, 9.65s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [633], local_loss=0.9826313853263855, train_loss=0.20207761228084564, time_cost=2.9293556213378906
+
Steps: 0%| | 633/1000000 [1:36:01<2678:44:32, 9.65s/it, lr=1e-5, step_loss=0.983]
Steps: 0%| | 634/1000000 [1:36:12<2808:05:13, 10.12s/it, lr=1e-5, step_loss=0.983][RANK-0]: Step: [634], local_loss=0.13886518776416779, train_loss=0.12631753087043762, time_cost=2.790628671646118
+
Steps: 0%| | 634/1000000 [1:36:12<2808:05:13, 10.12s/it, lr=1e-5, step_loss=0.139]
Steps: 0%| | 635/1000000 [1:36:19<2518:53:59, 9.07s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [635], local_loss=0.14411674439907074, train_loss=0.1026972234249115, time_cost=1.8172626495361328
+
Steps: 0%| | 635/1000000 [1:36:19<2518:53:59, 9.07s/it, lr=1e-5, step_loss=0.144]
Steps: 0%| | 636/1000000 [1:36:24<2166:22:57, 7.80s/it, lr=1e-5, step_loss=0.144][RANK-0]: Step: [636], local_loss=0.03741903975605965, train_loss=0.10019006580114365, time_cost=1.866020679473877
+
Steps: 0%| | 636/1000000 [1:36:24<2166:22:57, 7.80s/it, lr=1e-5, step_loss=0.0374]
Steps: 0%| | 637/1000000 [1:36:35<2412:05:21, 8.69s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [637], local_loss=0.036556508392095566, train_loss=0.04323761537671089, time_cost=7.875906229019165
+
Steps: 0%| | 637/1000000 [1:36:35<2412:05:21, 8.69s/it, lr=1e-5, step_loss=0.0366]
Steps: 0%| | 638/1000000 [1:36:46<2653:13:15, 9.56s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [638], local_loss=0.06210405379533768, train_loss=0.04507146030664444, time_cost=3.7912445068359375
+
Steps: 0%| | 638/1000000 [1:36:46<2653:13:15, 9.56s/it, lr=1e-5, step_loss=0.0621]
Steps: 0%| | 639/1000000 [1:36:58<2860:45:02, 10.31s/it, lr=1e-5, step_loss=0.0621][RANK-0]: Step: [639], local_loss=0.038470908999443054, train_loss=0.09092265367507935, time_cost=3.7330563068389893
+
Steps: 0%| | 639/1000000 [1:36:58<2860:45:02, 10.31s/it, lr=1e-5, step_loss=0.0385]
Steps: 0%| | 640/1000000 [1:37:10<2987:14:31, 10.76s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [640], local_loss=0.051419083029031754, train_loss=0.04923861101269722, time_cost=2.33412766456604
+
Steps: 0%| | 640/1000000 [1:37:10<2987:14:31, 10.76s/it, lr=1e-5, step_loss=0.0514]
Steps: 0%| | 641/1000000 [1:37:17<2707:08:07, 9.75s/it, lr=1e-5, step_loss=0.0514][RANK-0]: Step: [641], local_loss=0.04874030500650406, train_loss=0.04322587698698044, time_cost=1.2670295238494873
+
Steps: 0%| | 641/1000000 [1:37:17<2707:08:07, 9.75s/it, lr=1e-5, step_loss=0.0487]
Steps: 0%| | 642/1000000 [1:37:29<2845:29:03, 10.25s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [642], local_loss=0.0755467563867569, train_loss=0.07460891455411911, time_cost=2.3668718338012695
+
Steps: 0%| | 642/1000000 [1:37:29<2845:29:03, 10.25s/it, lr=1e-5, step_loss=0.0755]
Steps: 0%| | 643/1000000 [1:37:43<3187:58:16, 11.48s/it, lr=1e-5, step_loss=0.0755][RANK-0]: Step: [643], local_loss=0.03424826264381409, train_loss=0.04366330802440643, time_cost=5.411098957061768
+
Steps: 0%| | 643/1000000 [1:37:43<3187:58:16, 11.48s/it, lr=1e-5, step_loss=0.0342]
Steps: 0%| | 644/1000000 [1:37:49<2728:27:48, 9.83s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [644], local_loss=0.03485832363367081, train_loss=0.05539781600236893, time_cost=1.8345811367034912
+
Steps: 0%| | 644/1000000 [1:37:49<2728:27:48, 9.83s/it, lr=1e-5, step_loss=0.0349]
Steps: 0%| | 645/1000000 [1:38:00<2789:09:40, 10.05s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [645], local_loss=0.04657857120037079, train_loss=0.060053203254938126, time_cost=4.200287103652954
+
Steps: 0%| | 645/1000000 [1:38:00<2789:09:40, 10.05s/it, lr=1e-5, step_loss=0.0466]
Steps: 0%| | 646/1000000 [1:38:04<2333:11:17, 8.40s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [646], local_loss=0.06321895122528076, train_loss=0.06080526113510132, time_cost=1.7032525539398193
+
Steps: 0%| | 646/1000000 [1:38:04<2333:11:17, 8.40s/it, lr=1e-5, step_loss=0.0632]
Steps: 0%| | 647/1000000 [1:38:11<2230:30:23, 8.04s/it, lr=1e-5, step_loss=0.0632][RANK-0]: Step: [647], local_loss=0.04965543746948242, train_loss=0.05057717114686966, time_cost=1.4928066730499268
+
Steps: 0%| | 647/1000000 [1:38:11<2230:30:23, 8.04s/it, lr=1e-5, step_loss=0.0497]
Steps: 0%| | 648/1000000 [1:38:22<2479:08:55, 8.93s/it, lr=1e-5, step_loss=0.0497][RANK-0]: Step: [648], local_loss=0.04629756510257721, train_loss=0.0617709681391716, time_cost=2.717763900756836
+
Steps: 0%| | 648/1000000 [1:38:22<2479:08:55, 8.93s/it, lr=1e-5, step_loss=0.0463]
Steps: 0%| | 649/1000000 [1:38:30<2364:53:44, 8.52s/it, lr=1e-5, step_loss=0.0463][RANK-0]: Step: [649], local_loss=0.006274990737438202, train_loss=0.052999988198280334, time_cost=3.136035203933716
+
Steps: 0%| | 649/1000000 [1:38:30<2364:53:44, 8.52s/it, lr=1e-5, step_loss=0.00627]
Steps: 0%| | 650/1000000 [1:38:37<2227:45:43, 8.03s/it, lr=1e-5, step_loss=0.00627][RANK-0]: Step: [650], local_loss=0.026311829686164856, train_loss=0.09213726222515106, time_cost=1.3076488971710205
+
Steps: 0%| | 650/1000000 [1:38:37<2227:45:43, 8.03s/it, lr=1e-5, step_loss=0.0263]
Steps: 0%| | 651/1000000 [1:38:44<2181:46:54, 7.86s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [651], local_loss=0.067188560962677, train_loss=0.04848136752843857, time_cost=6.007964849472046
+
Steps: 0%| | 651/1000000 [1:38:44<2181:46:54, 7.86s/it, lr=1e-5, step_loss=0.0672]
Steps: 0%| | 652/1000000 [1:38:56<2483:39:22, 8.95s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [652], local_loss=0.07164648175239563, train_loss=0.07279028743505478, time_cost=3.5136821269989014
+
Steps: 0%| | 652/1000000 [1:38:56<2483:39:22, 8.95s/it, lr=1e-5, step_loss=0.0716]
Steps: 0%| | 653/1000000 [1:39:10<2913:44:21, 10.50s/it, lr=1e-5, step_loss=0.0716][RANK-0]: Step: [653], local_loss=0.04547414928674698, train_loss=0.055593423545360565, time_cost=4.965859651565552
+
Steps: 0%| | 653/1000000 [1:39:10<2913:44:21, 10.50s/it, lr=1e-5, step_loss=0.0455]
Steps: 0%| | 654/1000000 [1:39:16<2535:24:38, 9.13s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [654], local_loss=0.21138231456279755, train_loss=0.1303979754447937, time_cost=4.712203741073608
+
Steps: 0%| | 654/1000000 [1:39:16<2535:24:38, 9.13s/it, lr=1e-5, step_loss=0.211]
Steps: 0%| | 655/1000000 [1:39:23<2355:15:20, 8.48s/it, lr=1e-5, step_loss=0.211][RANK-0]: Step: [655], local_loss=0.12661226093769073, train_loss=0.06500492244958878, time_cost=2.904966115951538
+
Steps: 0%| | 655/1000000 [1:39:23<2355:15:20, 8.48s/it, lr=1e-5, step_loss=0.127]
Steps: 0%| | 656/1000000 [1:39:29<2142:37:45, 7.72s/it, lr=1e-5, step_loss=0.127][RANK-0]: Step: [656], local_loss=0.08554204553365707, train_loss=0.048963334411382675, time_cost=3.2099385261535645
+
Steps: 0%| | 656/1000000 [1:39:29<2142:37:45, 7.72s/it, lr=1e-5, step_loss=0.0855]
Steps: 0%| | 657/1000000 [1:39:34<1941:43:33, 6.99s/it, lr=1e-5, step_loss=0.0855][RANK-0]: Step: [657], local_loss=0.07269386947154999, train_loss=0.11314259469509125, time_cost=2.2524030208587646
+
Steps: 0%| | 657/1000000 [1:39:34<1941:43:33, 6.99s/it, lr=1e-5, step_loss=0.0727]
Steps: 0%| | 658/1000000 [1:39:39<1785:38:43, 6.43s/it, lr=1e-5, step_loss=0.0727][RANK-0]: Step: [658], local_loss=0.046199120581150055, train_loss=0.0469510592520237, time_cost=2.1435797214508057
+
Steps: 0%| | 658/1000000 [1:39:39<1785:38:43, 6.43s/it, lr=1e-5, step_loss=0.0462]
Steps: 0%| | 659/1000000 [1:39:51<2236:54:08, 8.06s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [659], local_loss=0.09350529313087463, train_loss=0.1397969126701355, time_cost=3.300133466720581
+
Steps: 0%| | 659/1000000 [1:39:51<2236:54:08, 8.06s/it, lr=1e-5, step_loss=0.0935]
Steps: 0%| | 660/1000000 [1:40:00<2288:21:05, 8.24s/it, lr=1e-5, step_loss=0.0935][RANK-0]: Step: [660], local_loss=0.1363573670387268, train_loss=0.11702819913625717, time_cost=2.6829421520233154
+
Steps: 0%| | 660/1000000 [1:40:00<2288:21:05, 8.24s/it, lr=1e-5, step_loss=0.136]
Steps: 0%| | 661/1000000 [1:40:06<2106:16:53, 7.59s/it, lr=1e-5, step_loss=0.136][RANK-0]: Step: [661], local_loss=0.08043897897005081, train_loss=0.06834715604782104, time_cost=1.4397857189178467
+
Steps: 0%| | 661/1000000 [1:40:06<2106:16:53, 7.59s/it, lr=1e-5, step_loss=0.0804]
Steps: 0%| | 662/1000000 [1:40:17<2367:57:54, 8.53s/it, lr=1e-5, step_loss=0.0804][RANK-0]: Step: [662], local_loss=0.18229462206363678, train_loss=0.07937118411064148, time_cost=1.280118465423584
+
Steps: 0%| | 662/1000000 [1:40:17<2367:57:54, 8.53s/it, lr=1e-5, step_loss=0.182]
Steps: 0%| | 663/1000000 [1:40:23<2157:36:56, 7.77s/it, lr=1e-5, step_loss=0.182][RANK-0]: Step: [663], local_loss=0.10551512986421585, train_loss=0.08324825763702393, time_cost=1.8955578804016113
+
Steps: 0%| | 663/1000000 [1:40:23<2157:36:56, 7.77s/it, lr=1e-5, step_loss=0.106]
Steps: 664-891/1000000 (0%) | lr=1e-5 | elapsed 1:40:27 -> 2:13:54 | 6.1-13.8 s/it
[RANK-0] per-step metrics over this window:
  local_loss: typically 0.016-0.27; spikes at steps 684 (0.418), 693 (0.385), 694 (0.858), 703 (0.356), 713 (0.411), 718 (0.914), 739 (1.008), 748 (0.359), 766 (0.992), 794 (0.993), 808 (0.309), 870 (216.758)
  train_loss: typically 0.037-0.36; outliers at steps 695 (19.50), 732 (15.07), 778 (6.36), 811 (9.68), 821 (18.43), 836 (10.93), 870 (27.14), 873 (50.80)
  time_cost: 1.2-13.5 s per step
+
Steps: 0%| | 891/1000000 [2:13:54<2142:17:15, 7.72s/it, lr=1e-5, step_loss=0.0557]
Steps: 0%| | 892/1000000 [2:14:03<2233:01:15, 8.05s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [892], local_loss=0.033128391951322556, train_loss=0.04587964713573456, time_cost=2.3768370151519775
+
Steps: 0%| | 892/1000000 [2:14:03<2233:01:15, 8.05s/it, lr=1e-5, step_loss=0.0331]
Steps: 0%| | 893/1000000 [2:14:19<2849:25:57, 10.27s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [893], local_loss=0.05998216196894646, train_loss=0.04906625300645828, time_cost=4.925959825515747
+
Steps: 0%| | 893/1000000 [2:14:19<2849:25:57, 10.27s/it, lr=1e-5, step_loss=0.06]
Steps: 0%| | 894/1000000 [2:14:30<2916:26:39, 10.51s/it, lr=1e-5, step_loss=0.06][RANK-0]: Step: [894], local_loss=0.06088893488049507, train_loss=0.046138279139995575, time_cost=1.237278938293457
+
Steps: 0%| | 894/1000000 [2:14:30<2916:26:39, 10.51s/it, lr=1e-5, step_loss=0.0609]
Steps: 0%| | 895/1000000 [2:14:39<2791:15:05, 10.06s/it, lr=1e-5, step_loss=0.0609][RANK-0]: Step: [895], local_loss=0.04868592321872711, train_loss=0.061492446810007095, time_cost=3.809326410293579
+
Steps: 0%| | 895/1000000 [2:14:39<2791:15:05, 10.06s/it, lr=1e-5, step_loss=0.0487]
Steps: 0%| | 896/1000000 [2:14:50<2941:44:03, 10.60s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [896], local_loss=0.05309687554836273, train_loss=0.04152724891901016, time_cost=3.874530553817749
+
Steps: 0%| | 896/1000000 [2:14:50<2941:44:03, 10.60s/it, lr=1e-5, step_loss=0.0531]
Steps: 0%| | 897/1000000 [2:15:04<3155:58:43, 11.37s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [897], local_loss=0.04262533783912659, train_loss=33.053104400634766, time_cost=5.329009532928467
+
Steps: 0%| | 897/1000000 [2:15:04<3155:58:43, 11.37s/it, lr=1e-5, step_loss=0.0426]
Steps: 0%| | 898/1000000 [2:15:13<3024:11:35, 10.90s/it, lr=1e-5, step_loss=0.0426][RANK-0]: Step: [898], local_loss=0.12831664085388184, train_loss=0.06340748071670532, time_cost=2.835644483566284
+
Steps: 0%| | 898/1000000 [2:15:13<3024:11:35, 10.90s/it, lr=1e-5, step_loss=0.128]
Steps: 0%| | 899/1000000 [2:15:18<2482:24:34, 8.94s/it, lr=1e-5, step_loss=0.128][RANK-0]: Step: [899], local_loss=0.02541832998394966, train_loss=0.08071047812700272, time_cost=3.3877346515655518
+
Steps: 0%| | 899/1000000 [2:15:18<2482:24:34, 8.94s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 900/1000000 [2:15:27<2500:52:40, 9.01s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [900], local_loss=0.10057178139686584, train_loss=0.05166831612586975, time_cost=3.324594736099243
+
Steps: 0%| | 900/1000000 [2:15:27<2500:52:40, 9.01s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 901/1000000 [2:15:39<2742:43:20, 9.88s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [901], local_loss=0.22142060101032257, train_loss=13.197389602661133, time_cost=9.058565855026245
+
Steps: 0%| | 901/1000000 [2:15:39<2742:43:20, 9.88s/it, lr=1e-5, step_loss=0.221]
Steps: 0%| | 902/1000000 [2:15:44<2349:38:19, 8.47s/it, lr=1e-5, step_loss=0.221][RANK-0]: Step: [902], local_loss=0.05503436550498009, train_loss=0.06694156676530838, time_cost=2.260357141494751
+
Steps: 0%| | 902/1000000 [2:15:44<2349:38:19, 8.47s/it, lr=1e-5, step_loss=0.055]
Steps: 0%| | 903/1000000 [2:15:57<2763:05:56, 9.96s/it, lr=1e-5, step_loss=0.055][RANK-0]: Step: [903], local_loss=0.028420204296708107, train_loss=0.23193159699440002, time_cost=5.0169837474823
+
Steps: 0%| | 903/1000000 [2:15:57<2763:05:56, 9.96s/it, lr=1e-5, step_loss=0.0284]
Steps: 0%| | 904/1000000 [2:16:10<3013:24:56, 10.86s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [904], local_loss=0.048853855580091476, train_loss=0.051469992846250534, time_cost=4.036396741867065
+
Steps: 0%| | 904/1000000 [2:16:10<3013:24:56, 10.86s/it, lr=1e-5, step_loss=0.0489]
Steps: 0%| | 905/1000000 [2:16:18<2718:51:46, 9.80s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [905], local_loss=0.03593508154153824, train_loss=0.03913859650492668, time_cost=2.486461639404297
+
Steps: 0%| | 905/1000000 [2:16:18<2718:51:46, 9.80s/it, lr=1e-5, step_loss=0.0359]
Steps: 0%| | 906/1000000 [2:16:34<3263:08:02, 11.76s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [906], local_loss=0.0951593890786171, train_loss=0.14093969762325287, time_cost=7.944579839706421
+
Steps: 0%| | 906/1000000 [2:16:34<3263:08:02, 11.76s/it, lr=1e-5, step_loss=0.0952]
Steps: 0%| | 907/1000000 [2:16:45<3204:05:48, 11.55s/it, lr=1e-5, step_loss=0.0952][RANK-0]: Step: [907], local_loss=0.0264122411608696, train_loss=0.11275887489318848, time_cost=6.175766944885254
+
Steps: 0%| | 907/1000000 [2:16:45<3204:05:48, 11.55s/it, lr=1e-5, step_loss=0.0264]
Steps: 0%| | 908/1000000 [2:16:59<3431:05:06, 12.36s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [908], local_loss=0.036062926054000854, train_loss=0.10174998641014099, time_cost=6.5030341148376465
+
Steps: 0%| | 908/1000000 [2:16:59<3431:05:06, 12.36s/it, lr=1e-5, step_loss=0.0361]
Steps: 0%| | 909/1000000 [2:17:10<3311:31:59, 11.93s/it, lr=1e-5, step_loss=0.0361][RANK-0]: Step: [909], local_loss=0.14265869557857513, train_loss=0.06552322208881378, time_cost=4.332590579986572
+
Steps: 0%| | 909/1000000 [2:17:10<3311:31:59, 11.93s/it, lr=1e-5, step_loss=0.143]
Steps: 0%| | 910/1000000 [2:17:17<2869:08:52, 10.34s/it, lr=1e-5, step_loss=0.143][RANK-0]: Step: [910], local_loss=0.033219113945961, train_loss=0.08327411860227585, time_cost=1.8660600185394287
+
Steps: 0%| | 910/1000000 [2:17:17<2869:08:52, 10.34s/it, lr=1e-5, step_loss=0.0332]
Steps: 0%| | 911/1000000 [2:17:32<3227:02:57, 11.63s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [911], local_loss=0.04077153652906418, train_loss=0.05259648710489273, time_cost=6.749818325042725
+
Steps: 0%| | 911/1000000 [2:17:32<3227:02:57, 11.63s/it, lr=1e-5, step_loss=0.0408]
Steps: 0%| | 912/1000000 [2:17:44<3325:13:16, 11.98s/it, lr=1e-5, step_loss=0.0408][RANK-0]: Step: [912], local_loss=0.03363832086324692, train_loss=81.41495513916016, time_cost=9.709238529205322
+
Steps: 0%| | 912/1000000 [2:17:44<3325:13:16, 11.98s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 913/1000000 [2:17:58<3471:06:55, 12.51s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [913], local_loss=0.05214468389749527, train_loss=0.06470029056072235, time_cost=4.3410890102386475
+
Steps: 0%| | 913/1000000 [2:17:58<3471:06:55, 12.51s/it, lr=1e-5, step_loss=0.0521]
Steps: 0%| | 914/1000000 [2:18:03<2872:03:35, 10.35s/it, lr=1e-5, step_loss=0.0521][RANK-0]: Step: [914], local_loss=0.10248853266239166, train_loss=0.05259248614311218, time_cost=2.6480166912078857
+
Steps: 0%| | 914/1000000 [2:18:03<2872:03:35, 10.35s/it, lr=1e-5, step_loss=0.102]
Steps: 0%| | 915/1000000 [2:18:17<3144:04:32, 11.33s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [915], local_loss=0.03008382022380829, train_loss=0.08744034171104431, time_cost=6.065440893173218
+
Steps: 0%| | 915/1000000 [2:18:17<3144:04:32, 11.33s/it, lr=1e-5, step_loss=0.0301]
Steps: 0%| | 916/1000000 [2:18:25<2893:01:12, 10.42s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [916], local_loss=0.09050461649894714, train_loss=0.05607893317937851, time_cost=2.8246452808380127
+
Steps: 0%| | 916/1000000 [2:18:25<2893:01:12, 10.42s/it, lr=1e-5, step_loss=0.0905]
Steps: 0%| | 917/1000000 [2:18:32<2611:14:17, 9.41s/it, lr=1e-5, step_loss=0.0905][RANK-0]: Step: [917], local_loss=0.0650230273604393, train_loss=0.04816363751888275, time_cost=2.5633270740509033
+
Steps: 0%| | 917/1000000 [2:18:32<2611:14:17, 9.41s/it, lr=1e-5, step_loss=0.065]
Steps: 0%| | 918/1000000 [2:18:39<2339:48:00, 8.43s/it, lr=1e-5, step_loss=0.065][RANK-0]: Step: [918], local_loss=0.062049590051174164, train_loss=0.054513946175575256, time_cost=1.5873339176177979
+
Steps: 0%| | 918/1000000 [2:18:39<2339:48:00, 8.43s/it, lr=1e-5, step_loss=0.062]
Steps: 0%| | 919/1000000 [2:18:46<2261:32:27, 8.15s/it, lr=1e-5, step_loss=0.062][RANK-0]: Step: [919], local_loss=0.08881985396146774, train_loss=0.06095687299966812, time_cost=1.2879116535186768
+
Steps: 0%| | 919/1000000 [2:18:46<2261:32:27, 8.15s/it, lr=1e-5, step_loss=0.0888]
Steps: 0%| | 920/1000000 [2:18:51<1974:26:38, 7.11s/it, lr=1e-5, step_loss=0.0888][RANK-0]: Step: [920], local_loss=0.07763218879699707, train_loss=0.05151166021823883, time_cost=1.2882869243621826
+
Steps: 0%| | 920/1000000 [2:18:51<1974:26:38, 7.11s/it, lr=1e-5, step_loss=0.0776]
Steps: 0%| | 921/1000000 [2:19:01<2264:45:47, 8.16s/it, lr=1e-5, step_loss=0.0776][RANK-0]: Step: [921], local_loss=0.06990981101989746, train_loss=0.047809239476919174, time_cost=5.221888780593872
+
Steps: 0%| | 921/1000000 [2:19:01<2264:45:47, 8.16s/it, lr=1e-5, step_loss=0.0699]
Steps: 0%| | 922/1000000 [2:19:16<2807:51:31, 10.12s/it, lr=1e-5, step_loss=0.0699][RANK-0]: Step: [922], local_loss=0.03785506635904312, train_loss=0.08601691573858261, time_cost=6.939727783203125
+
Steps: 0%| | 922/1000000 [2:19:16<2807:51:31, 10.12s/it, lr=1e-5, step_loss=0.0379]
Steps: 0%| | 923/1000000 [2:19:27<2902:18:41, 10.46s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [923], local_loss=0.029943563044071198, train_loss=0.17143717408180237, time_cost=1.7064814567565918
+
Steps: 0%| | 923/1000000 [2:19:27<2902:18:41, 10.46s/it, lr=1e-5, step_loss=0.0299]
Steps: 0%| | 924/1000000 [2:19:40<3081:07:06, 11.10s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [924], local_loss=0.06111666187644005, train_loss=0.06884720921516418, time_cost=3.941013813018799
+
Steps: 0%| | 924/1000000 [2:19:40<3081:07:06, 11.10s/it, lr=1e-5, step_loss=0.0611]
Steps: 0%| | 925/1000000 [2:19:50<3011:22:51, 10.85s/it, lr=1e-5, step_loss=0.0611][RANK-0]: Step: [925], local_loss=0.03365025296807289, train_loss=0.049805134534835815, time_cost=2.032256603240967
+
Steps: 0%| | 925/1000000 [2:19:50<3011:22:51, 10.85s/it, lr=1e-5, step_loss=0.0337]
Steps: 0%| | 926/1000000 [2:19:57<2669:10:04, 9.62s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [926], local_loss=0.06602580100297928, train_loss=0.11985261738300323, time_cost=3.2148187160491943
+
Steps: 0%| | 926/1000000 [2:19:57<2669:10:04, 9.62s/it, lr=1e-5, step_loss=0.066]
Steps: 0%| | 927/1000000 [2:20:02<2301:23:01, 8.29s/it, lr=1e-5, step_loss=0.066][RANK-0]: Step: [927], local_loss=0.07380986958742142, train_loss=0.05324156582355499, time_cost=2.4762074947357178
+
Steps: 0%| | 927/1000000 [2:20:02<2301:23:01, 8.29s/it, lr=1e-5, step_loss=0.0738]
Steps: 0%| | 928/1000000 [2:20:08<2133:46:43, 7.69s/it, lr=1e-5, step_loss=0.0738][RANK-0]: Step: [928], local_loss=0.6252683997154236, train_loss=0.15803612768650055, time_cost=1.657728910446167
+
Steps: 0%| | 928/1000000 [2:20:08<2133:46:43, 7.69s/it, lr=1e-5, step_loss=0.625]
Steps: 0%| | 929/1000000 [2:20:14<1952:58:10, 7.04s/it, lr=1e-5, step_loss=0.625][RANK-0]: Step: [929], local_loss=0.07104970514774323, train_loss=0.049748264253139496, time_cost=1.293971300125122
+
Steps: 0%| | 929/1000000 [2:20:14<1952:58:10, 7.04s/it, lr=1e-5, step_loss=0.071]
Steps: 0%| | 930/1000000 [2:20:29<2630:58:01, 9.48s/it, lr=1e-5, step_loss=0.071][RANK-0]: Step: [930], local_loss=0.058279238641262054, train_loss=0.035181015729904175, time_cost=2.0577409267425537
+
Steps: 0%| | 930/1000000 [2:20:29<2630:58:01, 9.48s/it, lr=1e-5, step_loss=0.0583]
Steps: 0%| | 931/1000000 [2:20:35<2322:43:54, 8.37s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [931], local_loss=0.04358188435435295, train_loss=0.06354937702417374, time_cost=1.6653697490692139
+
Steps: 0%| | 931/1000000 [2:20:35<2322:43:54, 8.37s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 932/1000000 [2:20:49<2773:12:16, 9.99s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [932], local_loss=0.02209833264350891, train_loss=0.06684708595275879, time_cost=5.814310550689697
+
Steps: 0%| | 932/1000000 [2:20:49<2773:12:16, 9.99s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 933/1000000 [2:20:56<2526:21:12, 9.10s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [933], local_loss=0.034635335206985474, train_loss=0.05187629908323288, time_cost=2.8962128162384033
+
Steps: 0%| | 933/1000000 [2:20:56<2526:21:12, 9.10s/it, lr=1e-5, step_loss=0.0346]
Steps: 0%| | 934/1000000 [2:21:02<2336:01:29, 8.42s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [934], local_loss=0.02430799789726734, train_loss=24.037076950073242, time_cost=1.748732089996338
+
Steps: 0%| | 934/1000000 [2:21:03<2336:01:29, 8.42s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 935/1000000 [2:21:10<2274:45:28, 8.20s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [935], local_loss=0.04677687585353851, train_loss=0.04283594340085983, time_cost=3.436777353286743
+
Steps: 0%| | 935/1000000 [2:21:10<2274:45:28, 8.20s/it, lr=1e-5, step_loss=0.0468]
Steps: 0%| | 936/1000000 [2:21:17<2201:17:15, 7.93s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [936], local_loss=0.03920619562268257, train_loss=0.045137301087379456, time_cost=2.8325700759887695
+
Steps: 0%| | 936/1000000 [2:21:17<2201:17:15, 7.93s/it, lr=1e-5, step_loss=0.0392]
Steps: 0%| | 937/1000000 [2:21:31<2659:54:30, 9.58s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [937], local_loss=0.049624621868133545, train_loss=0.04310771822929382, time_cost=4.466623783111572
+
Steps: 0%| | 937/1000000 [2:21:31<2659:54:30, 9.58s/it, lr=1e-5, step_loss=0.0496]
Steps: 0%| | 938/1000000 [2:21:42<2763:55:17, 9.96s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [938], local_loss=0.038452357053756714, train_loss=0.03297024592757225, time_cost=7.126146078109741
+
Steps: 0%| | 938/1000000 [2:21:42<2763:55:17, 9.96s/it, lr=1e-5, step_loss=0.0385]
Steps: 0%| | 939/1000000 [2:21:53<2874:21:33, 10.36s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [939], local_loss=0.03513650223612785, train_loss=0.0822213888168335, time_cost=4.604186534881592
+
Steps: 0%| | 939/1000000 [2:21:53<2874:21:33, 10.36s/it, lr=1e-5, step_loss=0.0351]
Steps: 0%| | 940/1000000 [2:21:59<2506:17:35, 9.03s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [940], local_loss=0.024010125547647476, train_loss=0.07397957891225815, time_cost=1.860490083694458
+
Steps: 0%| | 940/1000000 [2:21:59<2506:17:35, 9.03s/it, lr=1e-5, step_loss=0.024]
Steps: 0%| | 941/1000000 [2:22:14<3015:15:55, 10.87s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [941], local_loss=0.026347940787672997, train_loss=21.742828369140625, time_cost=5.666162490844727
+
Steps: 0%| | 941/1000000 [2:22:14<3015:15:55, 10.87s/it, lr=1e-5, step_loss=0.0263]
Steps: 0%| | 942/1000000 [2:22:20<2616:50:45, 9.43s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [942], local_loss=0.036019694060087204, train_loss=0.03946619853377342, time_cost=1.229024887084961
+
Steps: 0%| | 942/1000000 [2:22:20<2616:50:45, 9.43s/it, lr=1e-5, step_loss=0.036]
Steps: 0%| | 943/1000000 [2:22:25<2207:24:22, 7.95s/it, lr=1e-5, step_loss=0.036][RANK-0]: Step: [943], local_loss=0.032853223383426666, train_loss=0.08652637898921967, time_cost=1.6356251239776611
+
Steps: 0%| | 943/1000000 [2:22:25<2207:24:22, 7.95s/it, lr=1e-5, step_loss=0.0329]
Steps: 0%| | 944/1000000 [2:22:32<2141:23:58, 7.72s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [944], local_loss=0.024793343618512154, train_loss=0.08483821898698807, time_cost=2.6981492042541504
+
Steps: 0%| | 944/1000000 [2:22:32<2141:23:58, 7.72s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 945/1000000 [2:22:36<1873:23:47, 6.75s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [945], local_loss=0.027429183945059776, train_loss=0.07412794232368469, time_cost=1.7695791721343994
+
Steps: 0%| | 945/1000000 [2:22:36<1873:23:47, 6.75s/it, lr=1e-5, step_loss=0.0274]
Steps: 0%| | 946/1000000 [2:22:41<1729:17:37, 6.23s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [946], local_loss=0.029867542907595634, train_loss=0.07108218967914581, time_cost=2.624237537384033
+
Steps: 0%| | 946/1000000 [2:22:41<1729:17:37, 6.23s/it, lr=1e-5, step_loss=0.0299]
Steps: 0%| | 947/1000000 [2:22:47<1714:25:11, 6.18s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [947], local_loss=0.06422864645719528, train_loss=0.04655773565173149, time_cost=1.6933565139770508
+
Steps: 0%| | 947/1000000 [2:22:47<1714:25:11, 6.18s/it, lr=1e-5, step_loss=0.0642]
Steps: 0%| | 948/1000000 [2:22:57<2033:21:57, 7.33s/it, lr=1e-5, step_loss=0.0642][RANK-0]: Step: [948], local_loss=0.03749711811542511, train_loss=0.15423142910003662, time_cost=1.7659790515899658
+
Steps: 0%| | 948/1000000 [2:22:57<2033:21:57, 7.33s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 949/1000000 [2:23:05<2091:36:09, 7.54s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [949], local_loss=0.03631662204861641, train_loss=0.05896361544728279, time_cost=3.766838788986206
+
Steps: 0%| | 949/1000000 [2:23:05<2091:36:09, 7.54s/it, lr=1e-5, step_loss=0.0363]
Steps: 0%| | 950/1000000 [2:23:15<2231:13:00, 8.04s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [950], local_loss=0.25121036171913147, train_loss=0.1653888076543808, time_cost=3.4785666465759277
+
Steps: 0%| | 950/1000000 [2:23:15<2231:13:00, 8.04s/it, lr=1e-5, step_loss=0.251]
Steps: 0%| | 951/1000000 [2:23:20<2035:38:52, 7.34s/it, lr=1e-5, step_loss=0.251][RANK-0]: Step: [951], local_loss=0.9928756952285767, train_loss=0.19945812225341797, time_cost=1.4345917701721191
+
Steps: 0%| | 951/1000000 [2:23:20<2035:38:52, 7.34s/it, lr=1e-5, step_loss=0.993]
Steps: 0%| | 952/1000000 [2:23:30<2198:35:55, 7.92s/it, lr=1e-5, step_loss=0.993][RANK-0]: Step: [952], local_loss=0.0813424289226532, train_loss=0.0676526129245758, time_cost=2.192380428314209
+
Steps: 0%| | 952/1000000 [2:23:30<2198:35:55, 7.92s/it, lr=1e-5, step_loss=0.0813]
Steps: 0%| | 953/1000000 [2:23:36<2031:36:01, 7.32s/it, lr=1e-5, step_loss=0.0813][RANK-0]: Step: [953], local_loss=0.0291412603110075, train_loss=0.1632612943649292, time_cost=1.7709722518920898
+
Steps: 0%| | 953/1000000 [2:23:36<2031:36:01, 7.32s/it, lr=1e-5, step_loss=0.0291]
Steps: 0%| | 954/1000000 [2:23:43<2038:35:23, 7.35s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [954], local_loss=0.03985843062400818, train_loss=0.05956272780895233, time_cost=1.8731043338775635
+
Steps: 0%| | 954/1000000 [2:23:43<2038:35:23, 7.35s/it, lr=1e-5, step_loss=0.0399]
Steps: 0%| | 955/1000000 [2:23:47<1768:59:23, 6.37s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [955], local_loss=0.988555908203125, train_loss=0.18440160155296326, time_cost=1.342468500137329
+
Steps: 0%| | 955/1000000 [2:23:47<1768:59:23, 6.37s/it, lr=1e-5, step_loss=0.989]
Steps: 0%| | 956/1000000 [2:23:55<1857:19:39, 6.69s/it, lr=1e-5, step_loss=0.989][RANK-0]: Step: [956], local_loss=0.04344877228140831, train_loss=0.05049806088209152, time_cost=1.7307233810424805
+
Steps: 0%| | 956/1000000 [2:23:55<1857:19:39, 6.69s/it, lr=1e-5, step_loss=0.0434]
Steps: 0%| | 957/1000000 [2:24:05<2158:07:36, 7.78s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [957], local_loss=0.030815159901976585, train_loss=0.03366202861070633, time_cost=1.941124439239502
+
Steps: 0%| | 957/1000000 [2:24:05<2158:07:36, 7.78s/it, lr=1e-5, step_loss=0.0308]
Steps: 0%| | 958/1000000 [2:24:17<2556:40:36, 9.21s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [958], local_loss=0.04110032320022583, train_loss=0.05770973116159439, time_cost=6.394864320755005
+
Steps: 0%| | 958/1000000 [2:24:17<2556:40:36, 9.21s/it, lr=1e-5, step_loss=0.0411]
Steps: 0%| | 959/1000000 [2:24:24<2312:50:51, 8.33s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [959], local_loss=0.039699941873550415, train_loss=0.07498840987682343, time_cost=1.63810133934021
+
Steps: 0%| | 959/1000000 [2:24:24<2312:50:51, 8.33s/it, lr=1e-5, step_loss=0.0397]
Steps: 0%| | 960/1000000 [2:24:31<2222:58:40, 8.01s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [960], local_loss=0.04049345850944519, train_loss=0.0399673730134964, time_cost=2.422016143798828
+
Steps: 0%| | 960/1000000 [2:24:31<2222:58:40, 8.01s/it, lr=1e-5, step_loss=0.0405]
Steps: 0%| | 961/1000000 [2:24:35<1914:21:59, 6.90s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [961], local_loss=0.03739215061068535, train_loss=0.03954014182090759, time_cost=1.39156174659729
+
Steps: 0%| | 961/1000000 [2:24:35<1914:21:59, 6.90s/it, lr=1e-5, step_loss=0.0374]
Steps: 0%| | 962/1000000 [2:24:47<2287:01:20, 8.24s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [962], local_loss=0.02405979111790657, train_loss=0.06515620648860931, time_cost=2.509026288986206
+
Steps: 0%| | 962/1000000 [2:24:47<2287:01:20, 8.24s/it, lr=1e-5, step_loss=0.0241]
Steps: 0%| | 963/1000000 [2:24:54<2251:44:53, 8.11s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [963], local_loss=0.11585168540477753, train_loss=0.08332623541355133, time_cost=2.419954299926758
+
Steps: 0%| | 963/1000000 [2:24:54<2251:44:53, 8.11s/it, lr=1e-5, step_loss=0.116]
Steps: 0%| | 964/1000000 [2:25:01<2156:51:23, 7.77s/it, lr=1e-5, step_loss=0.116][RANK-0]: Step: [964], local_loss=0.05310773849487305, train_loss=0.05888855829834938, time_cost=2.025954008102417
+
Steps: 0%| | 964/1000000 [2:25:01<2156:51:23, 7.77s/it, lr=1e-5, step_loss=0.0531]
Steps: 0%| | 965/1000000 [2:25:09<2130:51:01, 7.68s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [965], local_loss=0.07850676774978638, train_loss=0.07645098865032196, time_cost=1.3534719944000244
+
Steps: 0%| | 965/1000000 [2:25:09<2130:51:01, 7.68s/it, lr=1e-5, step_loss=0.0785]
Steps: 0%| | 966/1000000 [2:25:15<1976:29:40, 7.12s/it, lr=1e-5, step_loss=0.0785][RANK-0]: Step: [966], local_loss=0.02804717794060707, train_loss=0.06347838044166565, time_cost=3.074618339538574
+
Steps: 0%| | 966/1000000 [2:25:15<1976:29:40, 7.12s/it, lr=1e-5, step_loss=0.028]
Steps: 0%| | 967/1000000 [2:25:20<1795:45:17, 6.47s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [967], local_loss=0.029360396787524223, train_loss=0.1760403960943222, time_cost=2.2587475776672363
+
Steps: 0%| | 967/1000000 [2:25:20<1795:45:17, 6.47s/it, lr=1e-5, step_loss=0.0294]
Steps: 0%| | 968/1000000 [2:25:27<1866:13:19, 6.72s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [968], local_loss=0.04956701025366783, train_loss=0.060517892241477966, time_cost=3.1727828979492188
+
Steps: 0%| | 968/1000000 [2:25:27<1866:13:19, 6.72s/it, lr=1e-5, step_loss=0.0496]
Steps: 0%| | 969/1000000 [2:25:38<2224:10:05, 8.01s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [969], local_loss=0.031225088983774185, train_loss=0.10848669707775116, time_cost=1.2482891082763672
+
Steps: 0%| | 969/1000000 [2:25:38<2224:10:05, 8.01s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 970/1000000 [2:25:50<2585:12:00, 9.32s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [970], local_loss=0.051552414894104004, train_loss=0.05817726254463196, time_cost=3.1700375080108643
+
Steps: 0%| | 970/1000000 [2:25:50<2585:12:00, 9.32s/it, lr=1e-5, step_loss=0.0516]
Steps: 0%| | 971/1000000 [2:25:56<2301:47:43, 8.29s/it, lr=1e-5, step_loss=0.0516][RANK-0]: Step: [971], local_loss=0.3598365783691406, train_loss=0.139259934425354, time_cost=1.4735980033874512
+
Steps: 0%| | 971/1000000 [2:25:56<2301:47:43, 8.29s/it, lr=1e-5, step_loss=0.36]
Steps: 0%| | 972/1000000 [2:26:04<2247:50:37, 8.10s/it, lr=1e-5, step_loss=0.36][RANK-0]: Step: [972], local_loss=0.03300777077674866, train_loss=0.08604905754327774, time_cost=1.5636098384857178
+
Steps: 0%| | 972/1000000 [2:26:04<2247:50:37, 8.10s/it, lr=1e-5, step_loss=0.033]
Steps: 0%| | 973/1000000 [2:26:12<2220:39:00, 8.00s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [973], local_loss=0.028644781559705734, train_loss=0.04121030867099762, time_cost=3.5325465202331543
+
Steps: 0%| | 973/1000000 [2:26:12<2220:39:00, 8.00s/it, lr=1e-5, step_loss=0.0286]
Steps: 0%| | 974/1000000 [2:26:21<2307:15:48, 8.31s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [974], local_loss=0.08187447488307953, train_loss=0.06483925879001617, time_cost=1.8367500305175781
+
Steps: 0%| | 974/1000000 [2:26:21<2307:15:48, 8.31s/it, lr=1e-5, step_loss=0.0819]
Steps: 0%| | 975/1000000 [2:26:28<2228:14:54, 8.03s/it, lr=1e-5, step_loss=0.0819][RANK-0]: Step: [975], local_loss=0.0395086295902729, train_loss=0.04912673309445381, time_cost=6.307555913925171
+
Steps: 0%| | 975/1000000 [2:26:28<2228:14:54, 8.03s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 976/1000000 [2:26:35<2123:58:30, 7.65s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [976], local_loss=0.041446372866630554, train_loss=0.1021086722612381, time_cost=2.04691219329834
+
Steps: 0%| | 976/1000000 [2:26:35<2123:58:30, 7.65s/it, lr=1e-5, step_loss=0.0414]
Steps: 0%| | 977/1000000 [2:26:41<1985:13:46, 7.15s/it, lr=1e-5, step_loss=0.0414][RANK-0]: Step: [977], local_loss=0.024197950959205627, train_loss=0.07033930718898773, time_cost=1.7401573657989502
+
Steps: 0%| | 977/1000000 [2:26:41<1985:13:46, 7.15s/it, lr=1e-5, step_loss=0.0242]
Steps: 0%| | 978/1000000 [2:26:54<2458:53:44, 8.86s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [978], local_loss=0.0260606836527586, train_loss=0.07236389815807343, time_cost=9.937843799591064
+
Steps: 0%| | 978/1000000 [2:26:54<2458:53:44, 8.86s/it, lr=1e-5, step_loss=0.0261]
Steps: 0%| | 979/1000000 [2:27:07<2850:08:18, 10.27s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [979], local_loss=0.1315014660358429, train_loss=0.08754446357488632, time_cost=5.893698453903198
+
Steps: 0%| | 979/1000000 [2:27:07<2850:08:18, 10.27s/it, lr=1e-5, step_loss=0.132]
Steps: 0%| | 980/1000000 [2:27:13<2508:13:28, 9.04s/it, lr=1e-5, step_loss=0.132][RANK-0]: Step: [980], local_loss=0.04143429175019264, train_loss=0.16776898503303528, time_cost=1.9424893856048584
+
Steps: 0%| | 980/1000000 [2:27:13<2508:13:28, 9.04s/it, lr=1e-5, step_loss=0.0414]
Steps: 0%| | 981/1000000 [2:27:19<2180:28:57, 7.86s/it, lr=1e-5, step_loss=0.0414][RANK-0]: Step: [981], local_loss=0.05369453877210617, train_loss=0.0568561926484108, time_cost=2.115323781967163
+
Steps: 0%| | 981/1000000 [2:27:19<2180:28:57, 7.86s/it, lr=1e-5, step_loss=0.0537]
Steps: 0%| | 982/1000000 [2:27:24<1988:43:53, 7.17s/it, lr=1e-5, step_loss=0.0537][RANK-0]: Step: [982], local_loss=0.030103830620646477, train_loss=0.04288925230503082, time_cost=1.4973175525665283
+
Steps: 0%| | 982/1000000 [2:27:24<1988:43:53, 7.17s/it, lr=1e-5, step_loss=0.0301]
Steps: 0%| | 983/1000000 [2:27:32<2042:16:50, 7.36s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [983], local_loss=0.06426110863685608, train_loss=0.09296516329050064, time_cost=1.7427141666412354
+
Steps: 0%| | 983/1000000 [2:27:32<2042:16:50, 7.36s/it, lr=1e-5, step_loss=0.0643]
Steps: 0%| | 984/1000000 [2:27:39<2036:41:13, 7.34s/it, lr=1e-5, step_loss=0.0643][RANK-0]: Step: [984], local_loss=0.04547236114740372, train_loss=0.07692830264568329, time_cost=3.1313676834106445
+
Steps: 0%| | 984/1000000 [2:27:39<2036:41:13, 7.34s/it, lr=1e-5, step_loss=0.0455]
Steps: 0%| | 985/1000000 [2:27:50<2341:55:08, 8.44s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [985], local_loss=0.032068174332380295, train_loss=10.681852340698242, time_cost=3.7526016235351562
+
Steps: 0%| | 985/1000000 [2:27:50<2341:55:08, 8.44s/it, lr=1e-5, step_loss=0.0321]
Steps: 0%| | 986/1000000 [2:27:55<2054:54:16, 7.40s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [986], local_loss=0.029823999851942062, train_loss=0.047062646597623825, time_cost=1.2967815399169922
+
Steps: 0%| | 986/1000000 [2:27:55<2054:54:16, 7.40s/it, lr=1e-5, step_loss=0.0298]
Steps: 0%| | 987/1000000 [2:28:06<2378:42:34, 8.57s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [987], local_loss=0.09183128923177719, train_loss=0.2369479238986969, time_cost=2.307896375656128
+
Steps: 0%| | 987/1000000 [2:28:06<2378:42:34, 8.57s/it, lr=1e-5, step_loss=0.0918]
Steps: 0%| | 988/1000000 [2:28:13<2229:24:07, 8.03s/it, lr=1e-5, step_loss=0.0918][RANK-0]: Step: [988], local_loss=0.04706159234046936, train_loss=0.12166398018598557, time_cost=5.306105375289917
+
Steps: 0%| | 988/1000000 [2:28:13<2229:24:07, 8.03s/it, lr=1e-5, step_loss=0.0471]
Steps: 0%| | 989/1000000 [2:28:27<2712:49:42, 9.78s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [989], local_loss=0.022736046463251114, train_loss=0.051002662628889084, time_cost=1.2957303524017334
+
Steps: 0%| | 989/1000000 [2:28:27<2712:49:42, 9.78s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 990/1000000 [2:28:43<3187:13:21, 11.49s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [990], local_loss=0.030873460695147514, train_loss=0.05311230942606926, time_cost=1.3094828128814697
+
Steps: 0%| | 990/1000000 [2:28:43<3187:13:21, 11.49s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 991/1000000 [2:28:52<3045:44:32, 10.98s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [991], local_loss=0.037615515291690826, train_loss=0.06344249844551086, time_cost=2.4163408279418945
+
Steps: 0%| | 991/1000000 [2:28:52<3045:44:32, 10.98s/it, lr=1e-5, step_loss=0.0376]
Steps: 0%| | 992/1000000 [2:29:08<3407:27:26, 12.28s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [992], local_loss=0.05062178149819374, train_loss=0.063860684633255, time_cost=6.8682122230529785
+
Steps: 0%| | 992/1000000 [2:29:08<3407:27:26, 12.28s/it, lr=1e-5, step_loss=0.0506]
Steps: 0%| | 993/1000000 [2:29:13<2813:57:18, 10.14s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [993], local_loss=0.026984412223100662, train_loss=0.056169938296079636, time_cost=2.202364206314087
+
Steps: 0%| | 993/1000000 [2:29:13<2813:57:18, 10.14s/it, lr=1e-5, step_loss=0.027]
Steps: 0%| | 994/1000000 [2:29:20<2563:06:37, 9.24s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [994], local_loss=0.1254383772611618, train_loss=0.07146851718425751, time_cost=3.2125704288482666
+
Steps: 0%| | 994/1000000 [2:29:20<2563:06:37, 9.24s/it, lr=1e-5, step_loss=0.125]
Steps: 0%| | 995/1000000 [2:29:25<2199:22:29, 7.93s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [995], local_loss=0.046446047723293304, train_loss=0.04563789814710617, time_cost=1.8976058959960938
+
Steps: 0%| | 995/1000000 [2:29:25<2199:22:29, 7.93s/it, lr=1e-5, step_loss=0.0464]
Steps: 0%| | 996/1000000 [2:29:30<1974:41:02, 7.12s/it, lr=1e-5, step_loss=0.0464][RANK-0]: Step: [996], local_loss=0.0919281542301178, train_loss=0.06933397799730301, time_cost=2.2213828563690186
+
Steps: 0%| | 996/1000000 [2:29:30<1974:41:02, 7.12s/it, lr=1e-5, step_loss=0.0919]
Steps: 0%| | 997/1000000 [2:29:39<2134:54:46, 7.69s/it, lr=1e-5, step_loss=0.0919][RANK-0]: Step: [997], local_loss=0.034138280898332596, train_loss=0.048201702535152435, time_cost=3.5133121013641357
+
Steps: 0%| | 997/1000000 [2:29:39<2134:54:46, 7.69s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 998/1000000 [2:29:43<1840:20:41, 6.63s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [998], local_loss=0.05058850720524788, train_loss=0.09408698230981827, time_cost=1.3915109634399414
+
Steps: 0%| | 998/1000000 [2:29:43<1840:20:41, 6.63s/it, lr=1e-5, step_loss=0.0506]
Steps: 0%| | 999/1000000 [2:29:59<2573:46:53, 9.27s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [999], local_loss=0.04236692935228348, train_loss=0.09908788651227951, time_cost=7.220247268676758
+
Steps: 0%| | 999/1000000 [2:29:59<2573:46:53, 9.27s/it, lr=1e-5, step_loss=0.0424]
Steps: 0%| | 1000/1000000 [2:30:13<2956:47:10, 10.66s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [1000], local_loss=0.02799859829246998, train_loss=0.041002076119184494, time_cost=1.2678732872009277
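Reading the per-step pair above: `local_loss` is rank 0's loss on its own micro-batch, while `train_loss` is the same quantity averaged across all data-parallel ranks, which is why `train_loss` can spike (e.g. step 873, train_loss=50.80) even when rank 0's local_loss stays small: another rank drew an outlier batch. A minimal sketch of how such a pair is commonly produced with Hugging Face Accelerate; this is an assumption about the script, not confirmed by the log, and the names are illustrative:

```python
# Sketch only: assumes a standard Accelerate training loop.
import torch
from accelerate import Accelerator

def log_step_losses(accelerator: Accelerator, loss: torch.Tensor):
    """Return (local_loss, train_loss) as reported in the log lines above."""
    local_loss = loss.detach().item()  # this rank's micro-batch loss
    # Cross-rank average: gather the scalar loss from every data-parallel
    # rank, then take the mean. A single rank hitting an outlier batch is
    # enough to spike this value while local_loss stays small.
    train_loss = accelerator.gather(loss.detach().reshape(1)).mean().item()
    return local_loss, train_loss
```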
+09/19/2024 01:40:05 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000
+09/19/2024 01:40:05 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 01:40:05,705] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
+  warnings.warn(
+[identical UserWarning emitted 8x, once per rank; duplicates removed]
+[2024-09-19 01:40:05,753] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 01:40:05,753] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 01:40:23,262] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:23,298] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 01:40:56,643] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:56,643] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:56,643] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:58,220] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:58,220] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:58,220] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:58,516] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:58,517] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:58,517] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:58,592] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:58,592] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:58,592] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:58,964] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:58,969] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:58,969] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:58,969] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:59,013] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:59,014] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:59,280] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:59,280] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:59,280] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 01:40:59,398] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 01:40:59,398] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 01:40:59,398] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 01:40:59 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/model/diffusion_pytorch_model.safetensors
+09/19/2024 01:42:06 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/scheduler.bin
+09/19/2024 01:42:06 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/sampler.bin
+09/19/2024 01:42:06 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000/random_states_0.pkl
+09/19/2024 01:42:06 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-1000
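The checkpoint-1000 block above follows the standard Accelerate flow: `save_state` hands the wrapped model to DeepSpeed, which writes `mp_rank_00_model_states.pt` plus one ZeRO `optim_states.pt` shard per rank, then Accelerate saves the scheduler, sampler and RNG states, and the script additionally saves the EMA and raw diffusers models. A minimal sketch of the call sequence that would produce this log; the function and argument names are illustrative assumptions, not taken from the actual training script:

```python
import os

# Sketch, assuming an Accelerate + DeepSpeed training script like the one
# logged above. `accelerator`, `model`, `ema_model`, `args` are placeholders.
def save_checkpoint(accelerator, model, ema_model, args, global_step: int) -> None:
    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
    # Writes the DeepSpeed model/optimizer shards, scheduler.bin,
    # sampler.bin and random_states_0.pkl seen in the log above.
    accelerator.save_state(save_path)
    if accelerator.is_main_process:
        # diffusers-style save: config.json + diffusion_pytorch_model.safetensors
        ema_model.save_pretrained(os.path.join(save_path, "model_ema"))
        accelerator.unwrap_model(model).save_pretrained(os.path.join(save_path, "model"))
```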
Steps: 0%| | 1000/1000000 [2:32:13<2956:47:10, 10.66s/it, lr=1e-5, step_loss=0.028]
[tqdm progress-bar residue for steps 1001-1053 collapsed: step 1001 reports 44.87s/it (the ~2-minute checkpoint save folded into the running average, which decays back to ~8-13 s/it over the following steps); local_loss stays in the ~0.02-0.15 band except for spikes at step 1003 (local_loss=0.317), 1010 (local_loss=0.311), 1015 (local_loss=0.345), 1021 (train_loss=3.83), 1032 (local_loss=0.227), 1035 (local_loss=1.00), 1044 (local_loss=64.25, train_loss=8.20), 1046 (local_loss=386.41, train_loss=48.35)]
Steps: 0%| | 1054/1000000 [2:40:20<1747:01:25, 6.30s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [1054], local_loss=0.10692666471004486, train_loss=0.12815773487091064, time_cost=2.0180022716522217
+
Steps: 0%| | 1054/1000000 [2:40:20<1747:01:25, 6.30s/it, lr=1e-5, step_loss=0.107]
Steps: 0%| | 1055/1000000 [2:40:25<1640:10:36, 5.91s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [1055], local_loss=0.04535834118723869, train_loss=0.05804920196533203, time_cost=2.0873448848724365
+
Steps: 0%| | 1055/1000000 [2:40:25<1640:10:36, 5.91s/it, lr=1e-5, step_loss=0.0454]
Steps: 0%| | 1056/1000000 [2:40:35<1991:48:32, 7.18s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [1056], local_loss=0.11092513799667358, train_loss=0.05010927468538284, time_cost=4.906656503677368
+
Steps: 0%| | 1056/1000000 [2:40:35<1991:48:32, 7.18s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 1057/1000000 [2:40:47<2352:32:17, 8.48s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [1057], local_loss=0.1216215044260025, train_loss=0.14063891768455505, time_cost=2.4135046005249023
+
Steps: 0%| | 1057/1000000 [2:40:47<2352:32:17, 8.48s/it, lr=1e-5, step_loss=0.122]
Steps: 0%| | 1058/1000000 [2:40:57<2487:02:22, 8.96s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [1058], local_loss=0.03182395175099373, train_loss=0.04507138580083847, time_cost=5.061346530914307
+
Steps: 0%| | 1058/1000000 [2:40:57<2487:02:22, 8.96s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 1059/1000000 [2:41:07<2593:50:08, 9.35s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [1059], local_loss=0.06930387765169144, train_loss=0.09593107551336288, time_cost=1.2270889282226562
+
Steps: 0%| | 1059/1000000 [2:41:07<2593:50:08, 9.35s/it, lr=1e-5, step_loss=0.0693]
Steps: 0%| | 1060/1000000 [2:41:12<2222:41:56, 8.01s/it, lr=1e-5, step_loss=0.0693][RANK-0]: Step: [1060], local_loss=0.034215446561574936, train_loss=0.0552157461643219, time_cost=2.0654470920562744
+
Steps: 0%| | 1060/1000000 [2:41:12<2222:41:56, 8.01s/it, lr=1e-5, step_loss=0.0342]
Steps: 0%| | 1061/1000000 [2:41:22<2355:13:23, 8.49s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [1061], local_loss=0.13053297996520996, train_loss=0.09426316618919373, time_cost=1.7537510395050049
+
Steps: 0%| | 1061/1000000 [2:41:22<2355:13:23, 8.49s/it, lr=1e-5, step_loss=0.131]
Steps: 0%| | 1062/1000000 [2:41:34<2726:38:05, 9.83s/it, lr=1e-5, step_loss=0.131][RANK-0]: Step: [1062], local_loss=19.47916030883789, train_loss=2.5278873443603516, time_cost=4.387477397918701
+
Steps: 0%| | 1062/1000000 [2:41:34<2726:38:05, 9.83s/it, lr=1e-5, step_loss=19.5]
Steps: 0%| | 1063/1000000 [2:41:46<2841:12:30, 10.24s/it, lr=1e-5, step_loss=19.5][RANK-0]: Step: [1063], local_loss=0.03651432693004608, train_loss=0.06601618230342865, time_cost=2.367155075073242
+
Steps: 0%| | 1063/1000000 [2:41:46<2841:12:30, 10.24s/it, lr=1e-5, step_loss=0.0365]
Steps: 0%| | 1064/1000000 [2:41:50<2371:30:17, 8.55s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [1064], local_loss=0.03244864195585251, train_loss=0.04987432062625885, time_cost=2.195988416671753
+
Steps: 0%| | 1064/1000000 [2:41:50<2371:30:17, 8.55s/it, lr=1e-5, step_loss=0.0324]
Steps: 0%| | 1065/1000000 [2:41:56<2114:40:02, 7.62s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [1065], local_loss=0.99825519323349, train_loss=0.18126454949378967, time_cost=2.1339290142059326
+
Steps: 0%| | 1065/1000000 [2:41:56<2114:40:02, 7.62s/it, lr=1e-5, step_loss=0.998]
Steps: 0%| | 1066/1000000 [2:42:05<2227:43:28, 8.03s/it, lr=1e-5, step_loss=0.998][RANK-0]: Step: [1066], local_loss=0.054008204489946365, train_loss=0.17964819073677063, time_cost=2.652059316635132
+
Steps: 0%| | 1066/1000000 [2:42:05<2227:43:28, 8.03s/it, lr=1e-5, step_loss=0.054]
Steps: 0%| | 1067/1000000 [2:42:14<2349:40:10, 8.47s/it, lr=1e-5, step_loss=0.054][RANK-0]: Step: [1067], local_loss=0.0802970752120018, train_loss=0.04261314868927002, time_cost=1.927396297454834
+
Steps: 0%| | 1067/1000000 [2:42:14<2349:40:10, 8.47s/it, lr=1e-5, step_loss=0.0803]
Steps: 0%| | 1068/1000000 [2:42:20<2092:35:42, 7.54s/it, lr=1e-5, step_loss=0.0803][RANK-0]: Step: [1068], local_loss=0.04095974564552307, train_loss=0.047709256410598755, time_cost=2.309001922607422
+
Steps: 0%| | 1068/1000000 [2:42:20<2092:35:42, 7.54s/it, lr=1e-5, step_loss=0.041]
Steps: 0%| | 1069/1000000 [2:42:25<1884:54:00, 6.79s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [1069], local_loss=0.05180079862475395, train_loss=0.05346539989113808, time_cost=1.3626971244812012
+
Steps: 0%| | 1069/1000000 [2:42:25<1884:54:00, 6.79s/it, lr=1e-5, step_loss=0.0518]
Steps: 0%| | 1070/1000000 [2:42:37<2339:56:41, 8.43s/it, lr=1e-5, step_loss=0.0518][RANK-0]: Step: [1070], local_loss=0.026887308806180954, train_loss=0.04733261093497276, time_cost=9.691197633743286
+
Steps: 0%| | 1070/1000000 [2:42:37<2339:56:41, 8.43s/it, lr=1e-5, step_loss=0.0269]
Steps: 0%| | 1071/1000000 [2:42:49<2617:18:49, 9.43s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [1071], local_loss=0.038639944046735764, train_loss=0.07543639838695526, time_cost=3.2498714923858643
+
Steps: 0%| | 1071/1000000 [2:42:49<2617:18:49, 9.43s/it, lr=1e-5, step_loss=0.0386]
Steps: 0%| | 1072/1000000 [2:42:54<2252:59:12, 8.12s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [1072], local_loss=0.04668574780225754, train_loss=0.11237774789333344, time_cost=2.0683186054229736
+
Steps: 0%| | 1072/1000000 [2:42:54<2252:59:12, 8.12s/it, lr=1e-5, step_loss=0.0467]
Steps: 0%| | 1073/1000000 [2:42:58<1932:41:38, 6.97s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [1073], local_loss=0.9757575988769531, train_loss=0.1697956770658493, time_cost=1.4962306022644043
+
Steps: 0%| | 1073/1000000 [2:42:58<1932:41:38, 6.97s/it, lr=1e-5, step_loss=0.976]
Steps: 0%| | 1074/1000000 [2:43:11<2465:21:38, 8.88s/it, lr=1e-5, step_loss=0.976][RANK-0]: Step: [1074], local_loss=0.036725517362356186, train_loss=0.04050682112574577, time_cost=4.376222372055054
+
Steps: 0%| | 1074/1000000 [2:43:11<2465:21:38, 8.88s/it, lr=1e-5, step_loss=0.0367]
Steps: 0%| | 1075/1000000 [2:43:16<2150:37:21, 7.75s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [1075], local_loss=0.050698719918727875, train_loss=0.04513787105679512, time_cost=2.3377859592437744
+
Steps: 0%| | 1075/1000000 [2:43:16<2150:37:21, 7.75s/it, lr=1e-5, step_loss=0.0507]
Steps: 0%| | 1076/1000000 [2:43:22<1929:26:24, 6.95s/it, lr=1e-5, step_loss=0.0507][RANK-0]: Step: [1076], local_loss=0.03304839879274368, train_loss=0.04636272042989731, time_cost=1.2233691215515137
+
Steps: 0%| | 1076/1000000 [2:43:22<1929:26:24, 6.95s/it, lr=1e-5, step_loss=0.033]
Steps: 0%| | 1077/1000000 [2:43:32<2222:23:53, 8.01s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [1077], local_loss=0.052550822496414185, train_loss=0.14102908968925476, time_cost=1.8168246746063232
+
Steps: 0%| | 1077/1000000 [2:43:32<2222:23:53, 8.01s/it, lr=1e-5, step_loss=0.0526]
Steps: 0%| | 1078/1000000 [2:43:42<2407:20:49, 8.68s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [1078], local_loss=0.13692639768123627, train_loss=0.05724629387259483, time_cost=1.4004948139190674
+
Steps: 0%| | 1078/1000000 [2:43:42<2407:20:49, 8.68s/it, lr=1e-5, step_loss=0.137]
Steps: 0%| | 1079/1000000 [2:43:56<2824:22:02, 10.18s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [1079], local_loss=0.051138266921043396, train_loss=0.06812995672225952, time_cost=4.17961573600769
+
Steps: 0%| | 1079/1000000 [2:43:56<2824:22:02, 10.18s/it, lr=1e-5, step_loss=0.0511]
Steps: 0%| | 1080/1000000 [2:44:03<2593:17:21, 9.35s/it, lr=1e-5, step_loss=0.0511][RANK-0]: Step: [1080], local_loss=0.049776364117860794, train_loss=0.059062421321868896, time_cost=1.924267292022705
+
Steps: 0%| | 1080/1000000 [2:44:03<2593:17:21, 9.35s/it, lr=1e-5, step_loss=0.0498]
Steps: 0%| | 1081/1000000 [2:44:14<2734:17:34, 9.85s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [1081], local_loss=0.04154498875141144, train_loss=0.04987545311450958, time_cost=3.5622808933258057
+
Steps: 0%| | 1081/1000000 [2:44:14<2734:17:34, 9.85s/it, lr=1e-5, step_loss=0.0415]
Steps: 0%| | 1082/1000000 [2:44:25<2785:44:03, 10.04s/it, lr=1e-5, step_loss=0.0415][RANK-0]: Step: [1082], local_loss=0.02592875063419342, train_loss=0.0741114467382431, time_cost=1.213632345199585
+
Steps: 0%| | 1082/1000000 [2:44:25<2785:44:03, 10.04s/it, lr=1e-5, step_loss=0.0259]
Steps: 0%| | 1083/1000000 [2:44:39<3097:38:29, 11.16s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [1083], local_loss=0.029310593381524086, train_loss=0.1060788482427597, time_cost=2.2564728260040283
+
Steps: 0%| | 1083/1000000 [2:44:39<3097:38:29, 11.16s/it, lr=1e-5, step_loss=0.0293]
Steps: 0%| | 1084/1000000 [2:44:53<3378:00:00, 12.17s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [1084], local_loss=0.1566556990146637, train_loss=0.13669852912425995, time_cost=4.793998956680298
+
Steps: 0%| | 1084/1000000 [2:44:53<3378:00:00, 12.17s/it, lr=1e-5, step_loss=0.157]
Steps: 0%| | 1085/1000000 [2:45:05<3316:54:01, 11.95s/it, lr=1e-5, step_loss=0.157][RANK-0]: Step: [1085], local_loss=0.04167041927576065, train_loss=0.04301849752664566, time_cost=6.437204360961914
+
Steps: 0%| | 1085/1000000 [2:45:05<3316:54:01, 11.95s/it, lr=1e-5, step_loss=0.0417]
Steps: 0%| | 1086/1000000 [2:45:14<3104:52:13, 11.19s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [1086], local_loss=0.02581234835088253, train_loss=0.050208061933517456, time_cost=2.8702809810638428
+
Steps: 0%| | 1086/1000000 [2:45:14<3104:52:13, 11.19s/it, lr=1e-5, step_loss=0.0258]
Steps: 0%| | 1087/1000000 [2:45:24<3034:59:05, 10.94s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [1087], local_loss=0.03848202899098396, train_loss=0.044211581349372864, time_cost=4.899207592010498
+
Steps: 0%| | 1087/1000000 [2:45:24<3034:59:05, 10.94s/it, lr=1e-5, step_loss=0.0385]
Steps: 0%| | 1088/1000000 [2:45:29<2495:54:05, 9.00s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [1088], local_loss=0.07273411750793457, train_loss=0.048457302153110504, time_cost=2.0739259719848633
+
Steps: 0%| | 1088/1000000 [2:45:29<2495:54:05, 9.00s/it, lr=1e-5, step_loss=0.0727]
Steps: 0%| | 1089/1000000 [2:45:40<2639:41:40, 9.51s/it, lr=1e-5, step_loss=0.0727][RANK-0]: Step: [1089], local_loss=0.038656000047922134, train_loss=0.07152226567268372, time_cost=1.7485644817352295
+
Steps: 0%| | 1089/1000000 [2:45:40<2639:41:40, 9.51s/it, lr=1e-5, step_loss=0.0387]
Steps: 0%| | 1090/1000000 [2:45:45<2328:01:45, 8.39s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [1090], local_loss=0.040325939655303955, train_loss=0.0454409085214138, time_cost=2.9564788341522217
+
Steps: 0%| | 1090/1000000 [2:45:45<2328:01:45, 8.39s/it, lr=1e-5, step_loss=0.0403]
Steps: 0%| | 1091/1000000 [2:45:51<2107:56:21, 7.60s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [1091], local_loss=0.06702239066362381, train_loss=0.10310304164886475, time_cost=1.8613440990447998
+
Steps: 0%| | 1091/1000000 [2:45:51<2107:56:21, 7.60s/it, lr=1e-5, step_loss=0.067]
Steps: 0%| | 1092/1000000 [2:46:02<2381:11:37, 8.58s/it, lr=1e-5, step_loss=0.067][RANK-0]: Step: [1092], local_loss=0.04134049266576767, train_loss=0.03989800065755844, time_cost=6.622002363204956
+
Steps: 0%| | 1092/1000000 [2:46:02<2381:11:37, 8.58s/it, lr=1e-5, step_loss=0.0413]
Steps: 0%| | 1093/1000000 [2:46:16<2814:46:29, 10.14s/it, lr=1e-5, step_loss=0.0413][RANK-0]: Step: [1093], local_loss=0.05562041699886322, train_loss=0.0528322197496891, time_cost=1.2323899269104004
+
Steps: 0%| | 1093/1000000 [2:46:16<2814:46:29, 10.14s/it, lr=1e-5, step_loss=0.0556]
Steps: 0%| | 1094/1000000 [2:46:29<3085:21:10, 11.12s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [1094], local_loss=0.038735195994377136, train_loss=0.06356069445610046, time_cost=5.18442440032959
+
Steps: 0%| | 1094/1000000 [2:46:29<3085:21:10, 11.12s/it, lr=1e-5, step_loss=0.0387]
Steps: 0%| | 1095/1000000 [2:46:43<3302:18:11, 11.90s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [1095], local_loss=0.03928295150399208, train_loss=0.05162280797958374, time_cost=4.03468656539917
+
Steps: 0%| | 1095/1000000 [2:46:43<3302:18:11, 11.90s/it, lr=1e-5, step_loss=0.0393]
Steps: 0%| | 1096/1000000 [2:46:47<2664:32:42, 9.60s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [1096], local_loss=0.12312574684619904, train_loss=0.06383244693279266, time_cost=3.147535562515259
+
Steps: 0%| | 1096/1000000 [2:46:47<2664:32:42, 9.60s/it, lr=1e-5, step_loss=0.123]
Steps: 0%| | 1097/1000000 [2:46:59<2862:46:14, 10.32s/it, lr=1e-5, step_loss=0.123][RANK-0]: Step: [1097], local_loss=0.10206327587366104, train_loss=0.06358520686626434, time_cost=5.7363996505737305
+
Steps: 0%| | 1097/1000000 [2:46:59<2862:46:14, 10.32s/it, lr=1e-5, step_loss=0.102]
Steps: 0%| | 1098/1000000 [2:47:04<2379:34:52, 8.58s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [1098], local_loss=0.23497755825519562, train_loss=0.06646811217069626, time_cost=1.513373613357544
+
Steps: 0%| | 1098/1000000 [2:47:04<2379:34:52, 8.58s/it, lr=1e-5, step_loss=0.235]
Steps: 0%| | 1099/1000000 [2:47:08<2061:54:15, 7.43s/it, lr=1e-5, step_loss=0.235][RANK-0]: Step: [1099], local_loss=0.06950481981039047, train_loss=0.17069090902805328, time_cost=1.760570764541626
+
Steps: 0%| | 1099/1000000 [2:47:08<2061:54:15, 7.43s/it, lr=1e-5, step_loss=0.0695]
Steps: 0%| | 1100/1000000 [2:47:13<1808:06:57, 6.52s/it, lr=1e-5, step_loss=0.0695][RANK-0]: Step: [1100], local_loss=0.9892072677612305, train_loss=0.1898740977048874, time_cost=1.2946748733520508
+
Steps: 0%| | 1100/1000000 [2:47:13<1808:06:57, 6.52s/it, lr=1e-5, step_loss=0.989]
Steps: 0%| | 1101/1000000 [2:47:23<2118:11:27, 7.63s/it, lr=1e-5, step_loss=0.989][RANK-0]: Step: [1101], local_loss=0.8965712785720825, train_loss=12.903219223022461, time_cost=1.7318472862243652
+
Steps: 0%| | 1101/1000000 [2:47:23<2118:11:27, 7.63s/it, lr=1e-5, step_loss=0.897]
Steps: 0%| | 1102/1000000 [2:47:33<2302:22:21, 8.30s/it, lr=1e-5, step_loss=0.897][RANK-0]: Step: [1102], local_loss=0.04012089595198631, train_loss=0.06875795125961304, time_cost=3.543902635574341
+
Steps: 0%| | 1102/1000000 [2:47:33<2302:22:21, 8.30s/it, lr=1e-5, step_loss=0.0401]
Steps: 0%| | 1103/1000000 [2:47:42<2374:31:02, 8.56s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [1103], local_loss=0.022451167926192284, train_loss=0.15784312784671783, time_cost=3.4653818607330322
+
Steps: 0%| | 1103/1000000 [2:47:42<2374:31:02, 8.56s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 1104/1000000 [2:47:49<2247:18:08, 8.10s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [1104], local_loss=0.04614514485001564, train_loss=0.07313509285449982, time_cost=3.36559796333313
+
Steps: 0%| | 1104/1000000 [2:47:49<2247:18:08, 8.10s/it, lr=1e-5, step_loss=0.0461]
Steps: 0%| | 1105/1000000 [2:48:03<2710:28:13, 9.77s/it, lr=1e-5, step_loss=0.0461][RANK-0]: Step: [1105], local_loss=0.02380489744246006, train_loss=0.04462464526295662, time_cost=9.95956039428711
+
Steps: 0%| | 1105/1000000 [2:48:03<2710:28:13, 9.77s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 1106/1000000 [2:48:08<2306:09:31, 8.31s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [1106], local_loss=0.08626700192689896, train_loss=0.04004337266087532, time_cost=2.0715880393981934
+
Steps: 0%| | 1106/1000000 [2:48:08<2306:09:31, 8.31s/it, lr=1e-5, step_loss=0.0863]
Steps: 0%| | 1107/1000000 [2:48:13<2074:49:30, 7.48s/it, lr=1e-5, step_loss=0.0863][RANK-0]: Step: [1107], local_loss=0.2506576180458069, train_loss=0.08011633902788162, time_cost=2.7920494079589844
+
Steps: 0%| | 1107/1000000 [2:48:13<2074:49:30, 7.48s/it, lr=1e-5, step_loss=0.251]
Steps: 0%| | 1108/1000000 [2:48:28<2656:39:25, 9.57s/it, lr=1e-5, step_loss=0.251][RANK-0]: Step: [1108], local_loss=0.24079465866088867, train_loss=0.08077869564294815, time_cost=5.584702014923096
+
Steps: 0%| | 1108/1000000 [2:48:28<2656:39:25, 9.57s/it, lr=1e-5, step_loss=0.241]
Steps: 0%| | 1109/1000000 [2:48:39<2770:55:43, 9.99s/it, lr=1e-5, step_loss=0.241][RANK-0]: Step: [1109], local_loss=0.03235815465450287, train_loss=0.0681842640042305, time_cost=2.5376181602478027
+
Steps: 0%| | 1109/1000000 [2:48:39<2770:55:43, 9.99s/it, lr=1e-5, step_loss=0.0324]
Steps: 0%| | 1110/1000000 [2:48:52<3086:29:00, 11.12s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [1110], local_loss=0.06311113387346268, train_loss=0.04878072813153267, time_cost=4.433205842971802
+
Steps: 0%| | 1110/1000000 [2:48:52<3086:29:00, 11.12s/it, lr=1e-5, step_loss=0.0631]
Steps: 0%| | 1111/1000000 [2:48:58<2602:57:55, 9.38s/it, lr=1e-5, step_loss=0.0631][RANK-0]: Step: [1111], local_loss=0.03899051994085312, train_loss=0.04381845146417618, time_cost=2.173839807510376
+
Steps: 0%| | 1111/1000000 [2:48:58<2602:57:55, 9.38s/it, lr=1e-5, step_loss=0.039]
Steps: 0%| | 1112/1000000 [2:49:05<2448:34:41, 8.82s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [1112], local_loss=0.05142902955412865, train_loss=0.045765653252601624, time_cost=1.2503812313079834
+
Steps: 0%| | 1112/1000000 [2:49:05<2448:34:41, 8.82s/it, lr=1e-5, step_loss=0.0514]
Steps: 0%| | 1113/1000000 [2:49:16<2644:59:52, 9.53s/it, lr=1e-5, step_loss=0.0514][RANK-0]: Step: [1113], local_loss=0.05833663046360016, train_loss=0.04697897285223007, time_cost=4.681075096130371
+
Steps: 0%| | 1113/1000000 [2:49:16<2644:59:52, 9.53s/it, lr=1e-5, step_loss=0.0583]
Steps: 0%| | 1114/1000000 [2:49:26<2629:35:52, 9.48s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [1114], local_loss=0.03657089173793793, train_loss=0.07163333892822266, time_cost=3.3386974334716797
+
Steps: 0%| | 1114/1000000 [2:49:26<2629:35:52, 9.48s/it, lr=1e-5, step_loss=0.0366]
Steps: 0%| | 1115/1000000 [2:49:39<2911:52:54, 10.49s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [1115], local_loss=0.05114319920539856, train_loss=0.05892660841345787, time_cost=4.897503614425659
+
Steps: 0%| | 1115/1000000 [2:49:39<2911:52:54, 10.49s/it, lr=1e-5, step_loss=0.0511]
Steps: 0%| | 1116/1000000 [2:49:44<2466:39:12, 8.89s/it, lr=1e-5, step_loss=0.0511][RANK-0]: Step: [1116], local_loss=0.020170705392956734, train_loss=0.040474146604537964, time_cost=2.147733688354492
+
Steps: 0%| | 1116/1000000 [2:49:44<2466:39:12, 8.89s/it, lr=1e-5, step_loss=0.0202]
Steps: 0%| | 1117/1000000 [2:49:57<2870:26:06, 10.35s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [1117], local_loss=0.05161147192120552, train_loss=0.05365792289376259, time_cost=2.3480093479156494
+
Steps: 0%| | 1117/1000000 [2:49:57<2870:26:06, 10.35s/it, lr=1e-5, step_loss=0.0516]
Steps: 0%| | 1118/1000000 [2:50:09<2941:01:16, 10.60s/it, lr=1e-5, step_loss=0.0516][RANK-0]: Step: [1118], local_loss=0.036478348076343536, train_loss=0.052185170352458954, time_cost=4.201987981796265
+
Steps: 0%| | 1118/1000000 [2:50:09<2941:01:16, 10.60s/it, lr=1e-5, step_loss=0.0365]
Steps: 0%| | 1119/1000000 [2:50:15<2586:44:19, 9.32s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [1119], local_loss=0.043808698654174805, train_loss=0.057968683540821075, time_cost=2.8064091205596924
+
Steps: 0%| | 1119/1000000 [2:50:15<2586:44:19, 9.32s/it, lr=1e-5, step_loss=0.0438]
Steps: 0%| | 1120/1000000 [2:50:25<2627:20:16, 9.47s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [1120], local_loss=0.03534620627760887, train_loss=0.034307148307561874, time_cost=3.813218355178833
+
Steps: 0%| | 1120/1000000 [2:50:25<2627:20:16, 9.47s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 1121/1000000 [2:50:39<3061:56:15, 11.04s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [1121], local_loss=0.10064869374036789, train_loss=0.05538029968738556, time_cost=2.9250760078430176
+
Steps: 0%| | 1121/1000000 [2:50:39<3061:56:15, 11.04s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 1122/1000000 [2:50:47<2750:53:18, 9.91s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [1122], local_loss=0.1372753530740738, train_loss=0.061119500547647476, time_cost=1.2222511768341064
+
Steps: 0%| | 1122/1000000 [2:50:47<2750:53:18, 9.91s/it, lr=1e-5, step_loss=0.137]
Steps: 0%| | 1123/1000000 [2:50:59<2947:38:28, 10.62s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [1123], local_loss=0.12180369347333908, train_loss=0.0580444261431694, time_cost=8.137125968933105
+
Steps: 0%| | 1123/1000000 [2:50:59<2947:38:28, 10.62s/it, lr=1e-5, step_loss=0.122]
Steps: 0%| | 1124/1000000 [2:51:08<2784:32:15, 10.04s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [1124], local_loss=0.4165710508823395, train_loss=0.1522403061389923, time_cost=2.509533166885376
+
Steps: 0%| | 1124/1000000 [2:51:08<2784:32:15, 10.04s/it, lr=1e-5, step_loss=0.417]
Steps: 0%| | 1125/1000000 [2:51:20<2992:58:17, 10.79s/it, lr=1e-5, step_loss=0.417][RANK-0]: Step: [1125], local_loss=0.057586658746004105, train_loss=0.05072756111621857, time_cost=5.042309522628784
+
Steps: 0%| | 1125/1000000 [2:51:20<2992:58:17, 10.79s/it, lr=1e-5, step_loss=0.0576]
Steps: 0%| | 1126/1000000 [2:51:31<3006:32:12, 10.84s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [1126], local_loss=0.030641702935099602, train_loss=0.07700007408857346, time_cost=6.387960433959961
+
Steps: 0%| | 1126/1000000 [2:51:31<3006:32:12, 10.84s/it, lr=1e-5, step_loss=0.0306]
Steps: 0%| | 1127/1000000 [2:51:43<3055:43:27, 11.01s/it, lr=1e-5, step_loss=0.0306][RANK-0]: Step: [1127], local_loss=0.03714333102107048, train_loss=0.04167434573173523, time_cost=4.19382381439209
+
Steps: 0%| | 1127/1000000 [2:51:43<3055:43:27, 11.01s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 1128/1000000 [2:51:47<2490:23:28, 8.98s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [1128], local_loss=0.22263196110725403, train_loss=0.100272037088871, time_cost=2.9317588806152344
+
Steps: 0%| | 1128/1000000 [2:51:47<2490:23:28, 8.98s/it, lr=1e-5, step_loss=0.223]
Steps: 0%| | 1129/1000000 [2:51:54<2348:13:19, 8.46s/it, lr=1e-5, step_loss=0.223][RANK-0]: Step: [1129], local_loss=0.07605419307947159, train_loss=0.0412740483880043, time_cost=1.6002836227416992
+
Steps: 0%| | 1129/1000000 [2:51:54<2348:13:19, 8.46s/it, lr=1e-5, step_loss=0.0761]
Steps: 0%| | 1130/1000000 [2:52:00<2111:05:52, 7.61s/it, lr=1e-5, step_loss=0.0761][RANK-0]: Step: [1130], local_loss=0.05885845050215721, train_loss=0.05584544688463211, time_cost=4.3943727016448975
+
Steps: 0%| | 1130/1000000 [2:52:00<2111:05:52, 7.61s/it, lr=1e-5, step_loss=0.0589]
Steps: 0%| | 1131/1000000 [2:52:11<2387:04:10, 8.60s/it, lr=1e-5, step_loss=0.0589][RANK-0]: Step: [1131], local_loss=0.07179436087608337, train_loss=0.08383150398731232, time_cost=7.316596746444702
+
Steps: 0%| | 1131/1000000 [2:52:11<2387:04:10, 8.60s/it, lr=1e-5, step_loss=0.0718]
Steps: 0%| | 1132/1000000 [2:52:16<2093:23:16, 7.54s/it, lr=1e-5, step_loss=0.0718][RANK-0]: Step: [1132], local_loss=0.056822068989276886, train_loss=0.05394074320793152, time_cost=1.5271234512329102
+
Steps: 0%| | 1132/1000000 [2:52:16<2093:23:16, 7.54s/it, lr=1e-5, step_loss=0.0568]
Steps: 0%| | 1133/1000000 [2:52:26<2342:28:25, 8.44s/it, lr=1e-5, step_loss=0.0568][RANK-0]: Step: [1133], local_loss=0.04212268441915512, train_loss=0.058293506503105164, time_cost=1.5150954723358154
+
Steps: 0%| | 1133/1000000 [2:52:26<2342:28:25, 8.44s/it, lr=1e-5, step_loss=0.0421]
Steps: 0%| | 1134/1000000 [2:52:31<1992:26:15, 7.18s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [1134], local_loss=0.032020121812820435, train_loss=0.3462024927139282, time_cost=1.9347984790802002
+
Steps: 0%| | 1134/1000000 [2:52:31<1992:26:15, 7.18s/it, lr=1e-5, step_loss=0.032]
Steps: 0%| | 1135/1000000 [2:52:44<2540:29:24, 9.16s/it, lr=1e-5, step_loss=0.032][RANK-0]: Step: [1135], local_loss=0.03332824259996414, train_loss=0.04489686340093613, time_cost=2.6021246910095215
+
Steps: 0%| | 1135/1000000 [2:52:44<2540:29:24, 9.16s/it, lr=1e-5, step_loss=0.0333]
Steps: 0%| | 1136/1000000 [2:52:52<2411:02:18, 8.69s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [1136], local_loss=0.04707200825214386, train_loss=0.05179252102971077, time_cost=3.605651378631592
+
Steps: 0%| | 1136/1000000 [2:52:52<2411:02:18, 8.69s/it, lr=1e-5, step_loss=0.0471]
Steps: 0%| | 1137/1000000 [2:53:00<2329:39:14, 8.40s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [1137], local_loss=0.029600420966744423, train_loss=0.08628539741039276, time_cost=4.038716554641724
+
Steps: 0%| | 1137/1000000 [2:53:00<2329:39:14, 8.40s/it, lr=1e-5, step_loss=0.0296]
Steps: 0%| | 1138/1000000 [2:53:07<2260:56:59, 8.15s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [1138], local_loss=0.059206776320934296, train_loss=0.10039080679416656, time_cost=2.184776544570923
+
Steps: 0%| | 1138/1000000 [2:53:07<2260:56:59, 8.15s/it, lr=1e-5, step_loss=0.0592]
Steps: 0%| | 1139/1000000 [2:53:22<2819:04:17, 10.16s/it, lr=1e-5, step_loss=0.0592][RANK-0]: Step: [1139], local_loss=0.026409786194562912, train_loss=0.04148497432470322, time_cost=6.320467233657837
+
Steps: 0%| | 1139/1000000 [2:53:22<2819:04:17, 10.16s/it, lr=1e-5, step_loss=0.0264]
Steps: 0%| | 1140/1000000 [2:53:33<2851:06:56, 10.28s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [1140], local_loss=0.006384860724210739, train_loss=0.04226849228143692, time_cost=7.836756944656372
+
Steps: 0%| | 1140/1000000 [2:53:33<2851:06:56, 10.28s/it, lr=1e-5, step_loss=0.00638]
Steps: 0%| | 1141/1000000 [2:53:46<3117:44:26, 11.24s/it, lr=1e-5, step_loss=0.00638][RANK-0]: Step: [1141], local_loss=0.05770997330546379, train_loss=0.1700742244720459, time_cost=4.653913259506226
+
Steps: 0%| | 1141/1000000 [2:53:46<3117:44:26, 11.24s/it, lr=1e-5, step_loss=0.0577]
Steps: 0%| | 1142/1000000 [2:53:54<2846:57:01, 10.26s/it, lr=1e-5, step_loss=0.0577][RANK-0]: Step: [1142], local_loss=0.04727325960993767, train_loss=0.046923886984586716, time_cost=1.3728289604187012
+
Steps: 0%| | 1142/1000000 [2:53:54<2846:57:01, 10.26s/it, lr=1e-5, step_loss=0.0473]
Steps: 0%| | 1143/1000000 [2:54:04<2852:28:32, 10.28s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [1143], local_loss=0.14561495184898376, train_loss=0.06397389620542526, time_cost=1.4464139938354492
+
Steps: 0%| | 1143/1000000 [2:54:04<2852:28:32, 10.28s/it, lr=1e-5, step_loss=0.146]
Steps: 0%| | 1144/1000000 [2:54:13<2751:06:25, 9.92s/it, lr=1e-5, step_loss=0.146][RANK-0]: Step: [1144], local_loss=0.0979490652680397, train_loss=0.0572538860142231, time_cost=1.5526931285858154
+
Steps: 0%| | 1144/1000000 [2:54:13<2751:06:25, 9.92s/it, lr=1e-5, step_loss=0.0979]
Steps: 0%| | 1145/1000000 [2:54:25<2923:37:25, 10.54s/it, lr=1e-5, step_loss=0.0979][RANK-0]: Step: [1145], local_loss=0.16310018301010132, train_loss=0.10383133590221405, time_cost=1.1964170932769775
+
Steps: 0%| | 1145/1000000 [2:54:25<2923:37:25, 10.54s/it, lr=1e-5, step_loss=0.163]
Steps: 0%| | 1146/1000000 [2:54:30<2449:34:22, 8.83s/it, lr=1e-5, step_loss=0.163][RANK-0]: Step: [1146], local_loss=0.3125917613506317, train_loss=0.08260840177536011, time_cost=2.3057901859283447
+
Steps: 0%| | 1146/1000000 [2:54:30<2449:34:22, 8.83s/it, lr=1e-5, step_loss=0.313]
Steps: 0%| | 1147/1000000 [2:54:39<2413:56:49, 8.70s/it, lr=1e-5, step_loss=0.313][RANK-0]: Step: [1147], local_loss=0.10179369151592255, train_loss=0.06359667330980301, time_cost=2.303798198699951
+
Steps: 0%| | 1147/1000000 [2:54:39<2413:56:49, 8.70s/it, lr=1e-5, step_loss=0.102]
Steps: 0%| | 1148/1000000 [2:54:49<2558:59:41, 9.22s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [1148], local_loss=0.03205189108848572, train_loss=0.03820549324154854, time_cost=2.2450051307678223
+
Steps: 0%| | 1148/1000000 [2:54:49<2558:59:41, 9.22s/it, lr=1e-5, step_loss=0.0321]
Steps: 0%| | 1149/1000000 [2:54:54<2236:13:39, 8.06s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [1149], local_loss=0.056754112243652344, train_loss=0.0726420059800148, time_cost=1.4223129749298096
+
Steps: 0%| | 1149/1000000 [2:54:54<2236:13:39, 8.06s/it, lr=1e-5, step_loss=0.0568]
Steps: 0%| | 1150/1000000 [2:55:09<2766:52:07, 9.97s/it, lr=1e-5, step_loss=0.0568][RANK-0]: Step: [1150], local_loss=0.051963530480861664, train_loss=0.06046069413423538, time_cost=6.758858919143677
+
Steps: 0%| | 1150/1000000 [2:55:09<2766:52:07, 9.97s/it, lr=1e-5, step_loss=0.052]
Steps: 0%| | 1151/1000000 [2:55:14<2350:49:13, 8.47s/it, lr=1e-5, step_loss=0.052][RANK-0]: Step: [1151], local_loss=0.014784565195441246, train_loss=0.0311698317527771, time_cost=2.6232056617736816
+
Steps: 0%| | 1151/1000000 [2:55:14<2350:49:13, 8.47s/it, lr=1e-5, step_loss=0.0148]
Steps: 0%| | 1152/1000000 [2:55:22<2288:06:42, 8.25s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [1152], local_loss=0.024513715878129005, train_loss=0.13991732895374298, time_cost=1.7914276123046875
+
Steps: 0%| | 1152/1000000 [2:55:22<2288:06:42, 8.25s/it, lr=1e-5, step_loss=0.0245]
Steps: 0%| | 1153/1000000 [2:55:33<2533:04:58, 9.13s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [1153], local_loss=0.0736052393913269, train_loss=0.12069977819919586, time_cost=3.9897003173828125
+
Steps: 0%| | 1153/1000000 [2:55:33<2533:04:58, 9.13s/it, lr=1e-5, step_loss=0.0736]
Steps: 0%| | 1154/1000000 [2:55:37<2143:29:20, 7.73s/it, lr=1e-5, step_loss=0.0736][RANK-0]: Step: [1154], local_loss=0.015905320644378662, train_loss=0.2802026569843292, time_cost=1.2401542663574219
+
Steps: 0%| | 1154/1000000 [2:55:37<2143:29:20, 7.73s/it, lr=1e-5, step_loss=0.0159]
Steps: 0%| | 1155/1000000 [2:55:47<2348:59:33, 8.47s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [1155], local_loss=0.0409075953066349, train_loss=0.044657185673713684, time_cost=1.2201378345489502
+
Steps: 0%| | 1155/1000000 [2:55:47<2348:59:33, 8.47s/it, lr=1e-5, step_loss=0.0409]
Steps: 0%| | 1156/1000000 [2:55:57<2422:37:10, 8.73s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [1156], local_loss=0.8117284178733826, train_loss=0.14298275113105774, time_cost=6.649739980697632
+
Steps: 0%| | 1156/1000000 [2:55:57<2422:37:10, 8.73s/it, lr=1e-5, step_loss=0.812]
Steps: 0%| | 1157/1000000 [2:56:04<2284:03:20, 8.23s/it, lr=1e-5, step_loss=0.812][RANK-0]: Step: [1157], local_loss=0.07262678444385529, train_loss=0.0658506453037262, time_cost=2.7878003120422363
+
Steps: 0%| | 1157/1000000 [2:56:04<2284:03:20, 8.23s/it, lr=1e-5, step_loss=0.0726]
Steps: 0%| | 1158/1000000 [2:56:09<2041:38:29, 7.36s/it, lr=1e-5, step_loss=0.0726][RANK-0]: Step: [1158], local_loss=0.018531935289502144, train_loss=0.053694166243076324, time_cost=2.3427460193634033
+
Steps: 0%| | 1158/1000000 [2:56:09<2041:38:29, 7.36s/it, lr=1e-5, step_loss=0.0185]
Steps: 0%| | 1159/1000000 [2:56:17<2042:28:09, 7.36s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [1159], local_loss=0.06005900353193283, train_loss=0.05185806006193161, time_cost=3.1639721393585205
+
Steps: 0%| | 1159/1000000 [2:56:17<2042:28:09, 7.36s/it, lr=1e-5, step_loss=0.0601]
Steps: 0%| | 1160/1000000 [2:56:26<2211:44:53, 7.97s/it, lr=1e-5, step_loss=0.0601][RANK-0]: Step: [1160], local_loss=0.04898059368133545, train_loss=0.055542074143886566, time_cost=1.2160606384277344
+
Steps: 0%| | 1160/1000000 [2:56:26<2211:44:53, 7.97s/it, lr=1e-5, step_loss=0.049]
Steps: 0%| | 1161/1000000 [2:56:33<2131:42:06, 7.68s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [1161], local_loss=0.03485816717147827, train_loss=0.05870373547077179, time_cost=3.0388588905334473
+
Steps: 0%| | 1161/1000000 [2:56:33<2131:42:06, 7.68s/it, lr=1e-5, step_loss=0.0349]
Steps: 0%| | 1162/1000000 [2:56:40<2059:38:40, 7.42s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [1162], local_loss=0.04884353280067444, train_loss=0.04376433044672012, time_cost=2.2860429286956787
+
Steps: 0%| | 1162/1000000 [2:56:40<2059:38:40, 7.42s/it, lr=1e-5, step_loss=0.0488]
Steps: 0%| | 1163/1000000 [2:56:55<2722:49:34, 9.81s/it, lr=1e-5, step_loss=0.0488][RANK-0]: Step: [1163], local_loss=0.051487457007169724, train_loss=0.10516086965799332, time_cost=12.201753854751587
+
Steps: 0%| | 1163/1000000 [2:56:55<2722:49:34, 9.81s/it, lr=1e-5, step_loss=0.0515]
Steps: 0%| | 1164/1000000 [2:57:06<2795:19:48, 10.07s/it, lr=1e-5, step_loss=0.0515][RANK-0]: Step: [1164], local_loss=0.055376529693603516, train_loss=0.04950787127017975, time_cost=2.263730525970459
+
Steps: 0%| | 1164/1000000 [2:57:06<2795:19:48, 10.07s/it, lr=1e-5, step_loss=0.0554]
Steps: 0%| | 1165/1000000 [2:57:10<2338:07:28, 8.43s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [1165], local_loss=0.026458339765667915, train_loss=0.056046582758426666, time_cost=1.7961061000823975
+
Steps: 0%| | 1165/1000000 [2:57:10<2338:07:28, 8.43s/it, lr=1e-5, step_loss=0.0265]
Steps: 0%| | 1166/1000000 [2:57:15<1990:55:20, 7.18s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [1166], local_loss=0.051370710134506226, train_loss=0.06956732273101807, time_cost=1.542614459991455
+
Steps: 0%| | 1166/1000000 [2:57:15<1990:55:20, 7.18s/it, lr=1e-5, step_loss=0.0514]
Steps: 0%| | 1167/1000000 [2:57:20<1841:41:08, 6.64s/it, lr=1e-5, step_loss=0.0514][RANK-0]: Step: [1167], local_loss=0.020906148478388786, train_loss=0.0312943235039711, time_cost=1.959214448928833
+
Steps: 0%| | 1167/1000000 [2:57:20<1841:41:08, 6.64s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 1168/1000000 [2:57:29<2042:07:36, 7.36s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [1168], local_loss=0.03038344345986843, train_loss=0.175482839345932, time_cost=3.543440103530884
+
Steps: 0%| | 1168/1000000 [2:57:29<2042:07:36, 7.36s/it, lr=1e-5, step_loss=0.0304]
Steps: 0%| | 1169/1000000 [2:57:44<2641:11:33, 9.52s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [1169], local_loss=0.035709042102098465, train_loss=0.07416713982820511, time_cost=11.886954307556152
+
Steps: 0%| | 1169/1000000 [2:57:44<2641:11:33, 9.52s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 1170/1000000 [2:57:53<2663:02:46, 9.60s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [1170], local_loss=0.042972322553396225, train_loss=0.04321544989943504, time_cost=3.554943084716797
+
Steps: 0%| | 1170/1000000 [2:57:53<2663:02:46, 9.60s/it, lr=1e-5, step_loss=0.043]
Steps: 0%| | 1171/1000000 [2:58:01<2464:47:03, 8.88s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [1171], local_loss=0.02425757236778736, train_loss=0.04087408632040024, time_cost=1.7616806030273438
+
Steps: 0%| | 1171/1000000 [2:58:01<2464:47:03, 8.88s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 1172/1000000 [2:58:06<2152:10:11, 7.76s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [1172], local_loss=0.046006325632333755, train_loss=0.0527227520942688, time_cost=2.3873353004455566
+
Steps: 0%| | 1172/1000000 [2:58:06<2152:10:11, 7.76s/it, lr=1e-5, step_loss=0.046]
Steps: 0%| | 1173/1000000 [2:58:11<1924:52:41, 6.94s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [1173], local_loss=0.031023982912302017, train_loss=0.038633935153484344, time_cost=2.380077362060547
+
Steps: 0%| | 1173/1000000 [2:58:11<1924:52:41, 6.94s/it, lr=1e-5, step_loss=0.031]
Steps: 0%| | 1174/1000000 [2:58:18<1971:29:12, 7.11s/it, lr=1e-5, step_loss=0.031][RANK-0]: Step: [1174], local_loss=0.027381109073758125, train_loss=0.04895938187837601, time_cost=1.5322582721710205
+
Steps: 0%| | 1174/1000000 [2:58:18<1971:29:12, 7.11s/it, lr=1e-5, step_loss=0.0274]
Steps: 0%| | 1175/1000000 [2:58:30<2391:33:14, 8.62s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [1175], local_loss=0.05424036085605621, train_loss=0.14762626588344574, time_cost=3.497901678085327
+
Steps: 0%| | 1175/1000000 [2:58:30<2391:33:14, 8.62s/it, lr=1e-5, step_loss=0.0542]
Steps: 0%| | 1176/1000000 [2:58:38<2280:16:02, 8.22s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [1176], local_loss=0.02773994393646717, train_loss=0.04475785046815872, time_cost=1.3034555912017822
+
Steps: 0%| | 1176/1000000 [2:58:38<2280:16:02, 8.22s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 1177/1000000 [2:58:51<2680:16:21, 9.66s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [1177], local_loss=0.04741702228784561, train_loss=0.10984165221452713, time_cost=3.3503262996673584
+
Steps: 0%| | 1177/1000000 [2:58:51<2680:16:21, 9.66s/it, lr=1e-5, step_loss=0.0474]
Steps: 0%| | 1178/1000000 [2:58:58<2479:31:40, 8.94s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [1178], local_loss=0.04032452404499054, train_loss=0.03457704558968544, time_cost=1.3111891746520996
+
Steps: 0%| | 1178/1000000 [2:58:58<2479:31:40, 8.94s/it, lr=1e-5, step_loss=0.0403]
Steps: 0%| | 1179/1000000 [2:59:05<2337:17:52, 8.42s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [1179], local_loss=0.037423789501190186, train_loss=0.160727858543396, time_cost=2.4608824253082275
+
Steps: 0%| | 1179/1000000 [2:59:05<2337:17:52, 8.42s/it, lr=1e-5, step_loss=0.0374]
Steps: 0%| | 1180/1000000 [2:59:20<2847:11:02, 10.26s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [1180], local_loss=0.041178520768880844, train_loss=0.035392552614212036, time_cost=5.661045551300049
+
Steps: 0%| | 1180/1000000 [2:59:20<2847:11:02, 10.26s/it, lr=1e-5, step_loss=0.0412]
Steps: 0%| | 1181/1000000 [2:59:33<3095:38:40, 11.16s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [1181], local_loss=0.028608497232198715, train_loss=0.0669151097536087, time_cost=9.586494207382202
+
Steps: 0%| | 1181/1000000 [2:59:33<3095:38:40, 11.16s/it, lr=1e-5, step_loss=0.0286]
Steps: 0%| | 1182/1000000 [2:59:42<2883:19:13, 10.39s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [1182], local_loss=0.1335945576429367, train_loss=0.059727542102336884, time_cost=1.288295030593872
+
Steps: 0%| | 1182/1000000 [2:59:42<2883:19:13, 10.39s/it, lr=1e-5, step_loss=0.134]
Steps: 0%| | 1183/1000000 [2:59:49<2601:12:18, 9.38s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [1183], local_loss=0.07335293292999268, train_loss=0.20284931361675262, time_cost=3.8192498683929443
+
Steps: 0%| | 1183/1000000 [2:59:49<2601:12:18, 9.38s/it, lr=1e-5, step_loss=0.0734]
Steps: 0%| | 1184/1000000 [3:00:00<2771:27:08, 9.99s/it, lr=1e-5, step_loss=0.0734][RANK-0]: Step: [1184], local_loss=0.07470155507326126, train_loss=24.304698944091797, time_cost=1.2225472927093506
+
Steps: 0%| | 1184/1000000 [3:00:00<2771:27:08, 9.99s/it, lr=1e-5, step_loss=0.0747]
Steps: 0%| | 1185/1000000 [3:00:09<2719:32:13, 9.80s/it, lr=1e-5, step_loss=0.0747][RANK-0]: Step: [1185], local_loss=0.06620974838733673, train_loss=0.0574851930141449, time_cost=1.2605555057525635
+
Steps: 0%| | 1185/1000000 [3:00:09<2719:32:13, 9.80s/it, lr=1e-5, step_loss=0.0662]
Steps: 0%| | 1186/1000000 [3:00:15<2328:10:44, 8.39s/it, lr=1e-5, step_loss=0.0662][RANK-0]: Step: [1186], local_loss=0.05411815270781517, train_loss=0.06492825597524643, time_cost=2.4113576412200928
+
Steps: 0%| | 1186/1000000 [3:00:15<2328:10:44, 8.39s/it, lr=1e-5, step_loss=0.0541]
Steps: 0%| | 1187/1000000 [3:00:25<2507:30:24, 9.04s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [1187], local_loss=0.018535656854510307, train_loss=0.05490449443459511, time_cost=2.426906108856201
+
Steps: 0%| | 1187/1000000 [3:00:25<2507:30:24, 9.04s/it, lr=1e-5, step_loss=0.0185]
Steps: 0%| | 1188/1000000 [3:00:30<2133:49:01, 7.69s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [1188], local_loss=0.04909280315041542, train_loss=0.17056497931480408, time_cost=1.8088102340698242
+
Steps: 0%| | 1188/1000000 [3:00:30<2133:49:01, 7.69s/it, lr=1e-5, step_loss=0.0491]
Steps: 0%| | 1189/1000000 [3:00:42<2532:10:24, 9.13s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [1189], local_loss=0.0496026836335659, train_loss=0.10408371686935425, time_cost=3.605177640914917
+
Steps: 0%| | 1189/1000000 [3:00:42<2532:10:24, 9.13s/it, lr=1e-5, step_loss=0.0496]
Steps: 0%| | 1190/1000000 [3:00:48<2237:31:03, 8.06s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [1190], local_loss=0.035527050495147705, train_loss=0.04958800971508026, time_cost=4.704083442687988
+
Steps: 0%| | 1190/1000000 [3:00:48<2237:31:03, 8.06s/it, lr=1e-5, step_loss=0.0355]
Steps: 0%| | 1191/1000000 [3:00:56<2285:03:12, 8.24s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [1191], local_loss=0.08622803539037704, train_loss=0.07402482628822327, time_cost=2.72450852394104
+
Steps: 0%| | 1191/1000000 [3:00:56<2285:03:12, 8.24s/it, lr=1e-5, step_loss=0.0862]
Steps: 0%| | 1192/1000000 [3:01:08<2613:11:36, 9.42s/it, lr=1e-5, step_loss=0.0862][RANK-0]: Step: [1192], local_loss=0.055666036903858185, train_loss=0.046282313764095306, time_cost=8.464775323867798
+
Steps: 0%| | 1192/1000000 [3:01:08<2613:11:36, 9.42s/it, lr=1e-5, step_loss=0.0557]
Steps: 0%| | 1193/1000000 [3:01:14<2263:34:58, 8.16s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [1193], local_loss=0.053181350231170654, train_loss=0.045633189380168915, time_cost=1.2020835876464844
+
Steps: 0%| | 1193/1000000 [3:01:14<2263:34:58, 8.16s/it, lr=1e-5, step_loss=0.0532]
Steps: 0%| | 1194/1000000 [3:01:18<1963:15:45, 7.08s/it, lr=1e-5, step_loss=0.0532][RANK-0]: Step: [1194], local_loss=0.15592917799949646, train_loss=0.06803283095359802, time_cost=1.5919551849365234
+
Steps: 0%| | 1194/1000000 [3:01:18<1963:15:45, 7.08s/it, lr=1e-5, step_loss=0.156]
Steps: 0%| | 1195/1000000 [3:01:31<2423:47:19, 8.74s/it, lr=1e-5, step_loss=0.156][RANK-0]: Step: [1195], local_loss=0.10071588307619095, train_loss=0.08253085613250732, time_cost=5.529223203659058
+
Steps: 0%| | 1195/1000000 [3:01:31<2423:47:19, 8.74s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 1196/1000000 [3:01:36<2116:53:03, 7.63s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [1196], local_loss=0.032956160604953766, train_loss=0.09190506488084793, time_cost=2.091757297515869
+
Steps: 0%| | 1196/1000000 [3:01:36<2116:53:03, 7.63s/it, lr=1e-5, step_loss=0.033]
Steps: 0%| | 1197/1000000 [3:01:42<1975:34:54, 7.12s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [1197], local_loss=0.04387989640235901, train_loss=0.10221985727548599, time_cost=2.1567015647888184
+
Steps: 0%| | 1197/1000000 [3:01:42<1975:34:54, 7.12s/it, lr=1e-5, step_loss=0.0439]
Steps: 0%| | 1198/1000000 [3:01:46<1718:58:55, 6.20s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [1198], local_loss=0.055391278117895126, train_loss=0.08601593226194382, time_cost=1.3553781509399414
+
Steps: 0%| | 1198/1000000 [3:01:46<1718:58:55, 6.20s/it, lr=1e-5, step_loss=0.0554]
Steps: 0%| | 1199/1000000 [3:02:02<2504:43:20, 9.03s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [1199], local_loss=0.05993528664112091, train_loss=0.17243660986423492, time_cost=1.2184457778930664
+
Steps: 0%| | 1199/1000000 [3:02:02<2504:43:20, 9.03s/it, lr=1e-5, step_loss=0.0599]
Steps: 0%| | 1200/1000000 [3:02:13<2675:30:27, 9.64s/it, lr=1e-5, step_loss=0.0599][RANK-0]: Step: [1200], local_loss=0.04185028374195099, train_loss=0.0614427849650383, time_cost=3.536745309829712
+
Steps: 0%| | 1200/1000000 [3:02:13<2675:30:27, 9.64s/it, lr=1e-5, step_loss=0.0419]
Steps: 0%| | 1201/1000000 [3:02:17<2234:00:51, 8.05s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [1201], local_loss=0.037704579532146454, train_loss=0.044179804623126984, time_cost=2.8232240676879883
+
Steps: 0%| | 1201/1000000 [3:02:17<2234:00:51, 8.05s/it, lr=1e-5, step_loss=0.0377]
Steps: 0%| | 1202/1000000 [3:02:30<2636:39:48, 9.50s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [1202], local_loss=0.03309168294072151, train_loss=0.08042788505554199, time_cost=4.661508083343506
+
Steps: 0%| | 1202/1000000 [3:02:30<2636:39:48, 9.50s/it, lr=1e-5, step_loss=0.0331]
Steps: 0%| | 1203/1000000 [3:02:43<2901:49:11, 10.46s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [1203], local_loss=0.03499352186918259, train_loss=0.03745594248175621, time_cost=6.509677886962891
+
Steps: 0%| | 1203/1000000 [3:02:43<2901:49:11, 10.46s/it, lr=1e-5, step_loss=0.035]
Steps: 0%| | 1204/1000000 [3:02:56<3155:22:12, 11.37s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [1204], local_loss=0.047586262226104736, train_loss=0.049714040011167526, time_cost=1.2212955951690674
+
Steps: 0%| | 1204/1000000 [3:02:56<3155:22:12, 11.37s/it, lr=1e-5, step_loss=0.0476]
Steps: 0%| | 1205/1000000 [3:03:02<2743:41:29, 9.89s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [1205], local_loss=0.03396732360124588, train_loss=0.04733945429325104, time_cost=2.198326587677002
+
Steps: 0%| | 1205/1000000 [3:03:02<2743:41:29, 9.89s/it, lr=1e-5, step_loss=0.034]
Steps: 0%| | 1206/1000000 [3:03:11<2628:02:53, 9.47s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [1206], local_loss=0.09193164855241776, train_loss=0.05826035141944885, time_cost=4.88648247718811
+
Steps: 0%| | 1206/1000000 [3:03:11<2628:02:53, 9.47s/it, lr=1e-5, step_loss=0.0919]
Steps: 0%| | 1207/1000000 [3:03:20<2610:46:51, 9.41s/it, lr=1e-5, step_loss=0.0919][RANK-0]: Step: [1207], local_loss=0.05010591074824333, train_loss=0.04333425313234329, time_cost=1.201409101486206
+
Steps: 0%| | 1207/1000000 [3:03:20<2610:46:51, 9.41s/it, lr=1e-5, step_loss=0.0501]
Steps: 0%| | 1208/1000000 [3:03:27<2374:10:49, 8.56s/it, lr=1e-5, step_loss=0.0501][RANK-0]: Step: [1208], local_loss=134.69577026367188, train_loss=16.874841690063477, time_cost=2.163356304168701
+
Steps: 0%| | 1208/1000000 [3:03:27<2374:10:49, 8.56s/it, lr=1e-5, step_loss=135]
Steps: 0%| | 1209/1000000 [3:03:41<2880:15:34, 10.38s/it, lr=1e-5, step_loss=135][RANK-0]: Step: [1209], local_loss=0.03513804078102112, train_loss=0.04308238625526428, time_cost=6.147336959838867
+
Steps: 0%| | 1209/1000000 [3:03:41<2880:15:34, 10.38s/it, lr=1e-5, step_loss=0.0351]
Steps: 0%| | 1210/1000000 [3:03:47<2480:11:40, 8.94s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [1210], local_loss=0.030439436435699463, train_loss=0.0363268218934536, time_cost=4.348071336746216
+
Steps: 0%| | 1210/1000000 [3:03:47<2480:11:40, 8.94s/it, lr=1e-5, step_loss=0.0304]
Steps: 0%| | 1211/1000000 [3:03:53<2226:46:28, 8.03s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [1211], local_loss=0.06458431482315063, train_loss=0.04745021462440491, time_cost=4.877954006195068
+
Steps: 0%| | 1211/1000000 [3:03:53<2226:46:28, 8.03s/it, lr=1e-5, step_loss=0.0646]
Steps: 0%| | 1212/1000000 [3:04:04<2482:38:08, 8.95s/it, lr=1e-5, step_loss=0.0646][RANK-0]: Step: [1212], local_loss=0.05850118771195412, train_loss=0.05821330100297928, time_cost=3.769134044647217
+
Steps: 0%| | 1212/1000000 [3:04:04<2482:38:08, 8.95s/it, lr=1e-5, step_loss=0.0585]
Steps: 0%| | 1213/1000000 [3:04:10<2238:02:21, 8.07s/it, lr=1e-5, step_loss=0.0585][RANK-0]: Step: [1213], local_loss=0.02106318436563015, train_loss=0.042166732251644135, time_cost=1.2135145664215088
+
Steps: 0%| | 1213/1000000 [3:04:10<2238:02:21, 8.07s/it, lr=1e-5, step_loss=0.0211]
Steps: 0%| | 1214/1000000 [3:04:15<1983:21:50, 7.15s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [1214], local_loss=0.04659535363316536, train_loss=0.04746931418776512, time_cost=2.6266067028045654
+
Steps: 0%| | 1214/1000000 [3:04:15<1983:21:50, 7.15s/it, lr=1e-5, step_loss=0.0466]
Steps: 0%| | 1215/1000000 [3:04:21<1878:32:38, 6.77s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [1215], local_loss=0.06640846282243729, train_loss=0.06927638500928879, time_cost=4.493384599685669
+
Steps: 0%| | 1215/1000000 [3:04:21<1878:32:38, 6.77s/it, lr=1e-5, step_loss=0.0664]
Steps: 0%| | 1216/1000000 [3:04:28<1921:37:00, 6.93s/it, lr=1e-5, step_loss=0.0664][RANK-0]: Step: [1216], local_loss=0.025312868878245354, train_loss=0.06952527165412903, time_cost=2.1921627521514893
+
Steps: 0%| | 1216/1000000 [3:04:28<1921:37:00, 6.93s/it, lr=1e-5, step_loss=0.0253]
Steps: 0%| | 1217/1000000 [3:04:34<1837:16:38, 6.62s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [1217], local_loss=0.015327866189181805, train_loss=0.031891871243715286, time_cost=3.9791197776794434
+
Steps: 0%| | 1217/1000000 [3:04:34<1837:16:38, 6.62s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 1218/1000000 [3:04:39<1700:10:11, 6.13s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [1218], local_loss=0.05389225482940674, train_loss=0.05381399393081665, time_cost=1.9171578884124756
+
Steps: 0%| | 1218/1000000 [3:04:39<1700:10:11, 6.13s/it, lr=1e-5, step_loss=0.0539]
Steps: 0%| | 1219/1000000 [3:04:50<2110:09:19, 7.61s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [1219], local_loss=0.026598161086440086, train_loss=0.037724584341049194, time_cost=1.3289520740509033
+
Steps: 0%| | 1219/1000000 [3:04:50<2110:09:19, 7.61s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 1220/1000000 [3:04:55<1881:41:22, 6.78s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [1220], local_loss=0.05633816868066788, train_loss=0.21513697504997253, time_cost=2.0551869869232178
+
Steps: 0%| | 1220/1000000 [3:04:55<1881:41:22, 6.78s/it, lr=1e-5, step_loss=0.0563]
Steps: 0%| | 1221/1000000 [3:05:01<1781:00:59, 6.42s/it, lr=1e-5, step_loss=0.0563][RANK-0]: Step: [1221], local_loss=0.057093191891908646, train_loss=8.83074951171875, time_cost=1.2351627349853516
+
Steps: 0%| | 1221/1000000 [3:05:01<1781:00:59, 6.42s/it, lr=1e-5, step_loss=0.0571]
Steps: 0%| | 1222/1000000 [3:05:07<1751:26:49, 6.31s/it, lr=1e-5, step_loss=0.0571][RANK-0]: Step: [1222], local_loss=0.03384392336010933, train_loss=0.05443796515464783, time_cost=1.3494560718536377
+
Steps: 0%| | 1222/1000000 [3:05:07<1751:26:49, 6.31s/it, lr=1e-5, step_loss=0.0338]
Steps: 0%| | 1223/1000000 [3:05:16<1968:31:53, 7.10s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [1223], local_loss=0.33845141530036926, train_loss=0.14407625794410706, time_cost=2.5370194911956787
+
Steps: 0%| | 1223/1000000 [3:05:16<1968:31:53, 7.10s/it, lr=1e-5, step_loss=0.338]
Steps: 0%| | 1224/1000000 [3:05:25<2160:44:12, 7.79s/it, lr=1e-5, step_loss=0.338][RANK-0]: Step: [1224], local_loss=0.9998711943626404, train_loss=0.24972069263458252, time_cost=1.94107985496521
+
Steps: 0%| | 1224/1000000 [3:05:25<2160:44:12, 7.79s/it, lr=1e-5, step_loss=1]
Steps: 0%| | 1225/1000000 [3:05:38<2569:35:48, 9.26s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [1225], local_loss=0.03241385146975517, train_loss=0.15645626187324524, time_cost=4.469071865081787
+
Steps: 0%| | 1225/1000000 [3:05:38<2569:35:48, 9.26s/it, lr=1e-5, step_loss=0.0324]
Steps: 0%| | 1226/1000000 [3:05:47<2568:21:54, 9.26s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [1226], local_loss=0.02621179260313511, train_loss=0.07604964822530746, time_cost=1.2902941703796387
+
Steps: 0%| | 1226/1000000 [3:05:47<2568:21:54, 9.26s/it, lr=1e-5, step_loss=0.0262]
Steps: 0%| | 1227/1000000 [3:05:52<2229:34:42, 8.04s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [1227], local_loss=0.07657904922962189, train_loss=0.09756740927696228, time_cost=1.2605071067810059
+
Steps: 0%| | 1227/1000000 [3:05:52<2229:34:42, 8.04s/it, lr=1e-5, step_loss=0.0766]
Steps: 0%| | 1228/1000000 [3:06:03<2498:56:31, 9.01s/it, lr=1e-5, step_loss=0.0766][RANK-0]: Step: [1228], local_loss=0.13249096274375916, train_loss=0.06224847584962845, time_cost=1.2150111198425293
+
Steps: 0%| | 1228/1000000 [3:06:03<2498:56:31, 9.01s/it, lr=1e-5, step_loss=0.132]
[Training log added by this diff — steps 1229-1453 of 1000000 collapsed. The raw file interleaves tqdm progress-bar redraws with per-step rank-0 records (carriage returns flattened, so every step appeared twice, separated by orphaned `+` diff markers); only the recoverable summary is kept here. Constant lr=1e-5 throughout; iteration time 5.9-13.7 s/it, with the displayed ETA fluctuating between ~1640 h and ~3805 h; wall clock advances from 3:06:18 to 3:39:17. local_loss is typically 0.02-0.35, with spikes near 1.0 at steps 1232, 1357, and 1447 and one extreme outlier of 107.4 at step 1241. train_loss is typically 0.03-0.24, with isolated outliers at steps 1238 (7.05), 1249 (14.35), 1327 (3.94), 1329 (21.13), 1337 (40.94), 1382 (20.20), and 1425 (3.02).]
+
Steps: 0%| | 1453/1000000 [3:39:17<2385:27:01, 8.60s/it, lr=1e-5, step_loss=0.0304]
Steps: 0%| | 1454/1000000 [3:39:23<2160:47:58, 7.79s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [1454], local_loss=0.04492620378732681, train_loss=0.07987399399280548, time_cost=4.962984561920166
+
Steps: 0%| | 1454/1000000 [3:39:23<2160:47:58, 7.79s/it, lr=1e-5, step_loss=0.0449]
Steps: 0%| | 1455/1000000 [3:39:28<1938:43:46, 6.99s/it, lr=1e-5, step_loss=0.0449][RANK-0]: Step: [1455], local_loss=0.028202207759022713, train_loss=0.06628366559743881, time_cost=2.6051366329193115
+
Steps: 0%| | 1455/1000000 [3:39:28<1938:43:46, 6.99s/it, lr=1e-5, step_loss=0.0282]
Steps: 0%| | 1456/1000000 [3:39:35<1942:58:58, 7.00s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [1456], local_loss=0.05765533819794655, train_loss=0.0764273852109909, time_cost=2.836604595184326
+
Steps: 0%| | 1456/1000000 [3:39:35<1942:58:58, 7.00s/it, lr=1e-5, step_loss=0.0577]
Steps: 0%| | 1457/1000000 [3:39:40<1753:04:42, 6.32s/it, lr=1e-5, step_loss=0.0577][RANK-0]: Step: [1457], local_loss=0.03651655837893486, train_loss=0.08683653175830841, time_cost=2.040912389755249
+
Steps: 0%| | 1457/1000000 [3:39:40<1753:04:42, 6.32s/it, lr=1e-5, step_loss=0.0365]
Steps: 0%| | 1458/1000000 [3:39:47<1828:46:02, 6.59s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [1458], local_loss=0.04598524048924446, train_loss=0.18205764889717102, time_cost=2.618346929550171
+
Steps: 0%| | 1458/1000000 [3:39:47<1828:46:02, 6.59s/it, lr=1e-5, step_loss=0.046]
Steps: 0%| | 1459/1000000 [3:40:02<2521:07:11, 9.09s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [1459], local_loss=0.16818514466285706, train_loss=0.06201557442545891, time_cost=6.423579931259155
+
Steps: 0%| | 1459/1000000 [3:40:02<2521:07:11, 9.09s/it, lr=1e-5, step_loss=0.168]
Steps: 0%| | 1460/1000000 [3:40:17<3066:20:41, 11.05s/it, lr=1e-5, step_loss=0.168][RANK-0]: Step: [1460], local_loss=0.035453956574201584, train_loss=0.08124959468841553, time_cost=11.488282918930054
+
Steps: 0%| | 1460/1000000 [3:40:17<3066:20:41, 11.05s/it, lr=1e-5, step_loss=0.0355]
Steps: 0%| | 1461/1000000 [3:40:31<3279:55:21, 11.82s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [1461], local_loss=0.036788079887628555, train_loss=0.03368358314037323, time_cost=4.5809714794158936
+
Steps: 0%| | 1461/1000000 [3:40:31<3279:55:21, 11.82s/it, lr=1e-5, step_loss=0.0368]
Steps: 0%| | 1462/1000000 [3:40:42<3208:10:26, 11.57s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [1462], local_loss=0.023599788546562195, train_loss=0.0454818531870842, time_cost=3.620828866958618
+
Steps: 0%| | 1462/1000000 [3:40:42<3208:10:26, 11.57s/it, lr=1e-5, step_loss=0.0236]
Steps: 0%| | 1463/1000000 [3:40:47<2652:42:13, 9.56s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [1463], local_loss=0.09634225070476532, train_loss=13.689752578735352, time_cost=1.8305835723876953
+
Steps: 0%| | 1463/1000000 [3:40:47<2652:42:13, 9.56s/it, lr=1e-5, step_loss=0.0963]
Steps: 0%| | 1464/1000000 [3:41:01<3076:11:24, 11.09s/it, lr=1e-5, step_loss=0.0963][RANK-0]: Step: [1464], local_loss=0.05833981931209564, train_loss=0.05652379244565964, time_cost=5.811505317687988
+
Steps: 0%| | 1464/1000000 [3:41:01<3076:11:24, 11.09s/it, lr=1e-5, step_loss=0.0583]
Steps: 0%| | 1465/1000000 [3:41:09<2776:54:48, 10.01s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [1465], local_loss=0.02360893227159977, train_loss=0.031064726412296295, time_cost=1.8185005187988281
+
Steps: 0%| | 1465/1000000 [3:41:09<2776:54:48, 10.01s/it, lr=1e-5, step_loss=0.0236]
Steps: 0%| | 1466/1000000 [3:41:14<2401:08:43, 8.66s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [1466], local_loss=0.024001112207770348, train_loss=0.06039776653051376, time_cost=2.8190062046051025
+
Steps: 0%| | 1466/1000000 [3:41:14<2401:08:43, 8.66s/it, lr=1e-5, step_loss=0.024]
Steps: 0%| | 1467/1000000 [3:41:20<2176:12:28, 7.85s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [1467], local_loss=0.04967193305492401, train_loss=0.050749048590660095, time_cost=1.9232532978057861
+
Steps: 0%| | 1467/1000000 [3:41:20<2176:12:28, 7.85s/it, lr=1e-5, step_loss=0.0497]
Steps: 0%| | 1468/1000000 [3:41:31<2438:02:48, 8.79s/it, lr=1e-5, step_loss=0.0497][RANK-0]: Step: [1468], local_loss=0.0235447995364666, train_loss=0.04443574696779251, time_cost=8.046607971191406
+
Steps: 0%| | 1468/1000000 [3:41:31<2438:02:48, 8.79s/it, lr=1e-5, step_loss=0.0235]
Steps: 0%| | 1469/1000000 [3:41:38<2284:00:26, 8.23s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [1469], local_loss=0.03325783461332321, train_loss=0.04056185856461525, time_cost=1.62013840675354
+
Steps: 0%| | 1469/1000000 [3:41:38<2284:00:26, 8.23s/it, lr=1e-5, step_loss=0.0333]
Steps: 0%| | 1470/1000000 [3:41:53<2818:28:16, 10.16s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [1470], local_loss=0.013138060458004475, train_loss=0.1436765491962433, time_cost=1.3376338481903076
+
Steps: 0%| | 1470/1000000 [3:41:53<2818:28:16, 10.16s/it, lr=1e-5, step_loss=0.0131]
Steps: 0%| | 1471/1000000 [3:41:58<2399:07:45, 8.65s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [1471], local_loss=0.02562083676457405, train_loss=0.03939051181077957, time_cost=1.2731409072875977
+
Steps: 0%| | 1471/1000000 [3:41:58<2399:07:45, 8.65s/it, lr=1e-5, step_loss=0.0256]
Steps: 0%| | 1472/1000000 [3:42:09<2602:07:37, 9.38s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [1472], local_loss=0.0734732374548912, train_loss=0.10075925290584564, time_cost=4.430983781814575
+
Steps: 0%| | 1472/1000000 [3:42:09<2602:07:37, 9.38s/it, lr=1e-5, step_loss=0.0735]
Steps: 0%| | 1473/1000000 [3:42:15<2303:09:05, 8.30s/it, lr=1e-5, step_loss=0.0735][RANK-0]: Step: [1473], local_loss=0.02855251170694828, train_loss=0.0505470409989357, time_cost=1.5224087238311768
+
Steps: 0%| | 1473/1000000 [3:42:15<2303:09:05, 8.30s/it, lr=1e-5, step_loss=0.0286]
Steps: 0%| | 1474/1000000 [3:42:24<2343:30:31, 8.45s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [1474], local_loss=0.04459771886467934, train_loss=0.07984377443790436, time_cost=2.525437593460083
+
Steps: 0%| | 1474/1000000 [3:42:24<2343:30:31, 8.45s/it, lr=1e-5, step_loss=0.0446]
Steps: 0%| | 1475/1000000 [3:42:34<2486:05:19, 8.96s/it, lr=1e-5, step_loss=0.0446][RANK-0]: Step: [1475], local_loss=0.04619299992918968, train_loss=0.17317168414592743, time_cost=4.660659313201904
+
Steps: 0%| | 1475/1000000 [3:42:34<2486:05:19, 8.96s/it, lr=1e-5, step_loss=0.0462]
Steps: 0%| | 1476/1000000 [3:42:41<2313:48:06, 8.34s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [1476], local_loss=0.047583773732185364, train_loss=0.07522429525852203, time_cost=5.078205823898315
+
Steps: 0%| | 1476/1000000 [3:42:41<2313:48:06, 8.34s/it, lr=1e-5, step_loss=0.0476]
Steps: 0%| | 1477/1000000 [3:42:45<1996:58:32, 7.20s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [1477], local_loss=0.04391855746507645, train_loss=0.0429866760969162, time_cost=2.192575693130493
+
Steps: 0%| | 1477/1000000 [3:42:45<1996:58:32, 7.20s/it, lr=1e-5, step_loss=0.0439]
Steps: 0%| | 1478/1000000 [3:42:54<2120:48:02, 7.65s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [1478], local_loss=0.18278191983699799, train_loss=0.06970914453268051, time_cost=1.8988678455352783
+
Steps: 0%| | 1478/1000000 [3:42:54<2120:48:02, 7.65s/it, lr=1e-5, step_loss=0.183]
Steps: 0%| | 1479/1000000 [3:43:03<2244:13:56, 8.09s/it, lr=1e-5, step_loss=0.183][RANK-0]: Step: [1479], local_loss=0.059391897171735764, train_loss=0.03805924579501152, time_cost=1.4919464588165283
+
Steps: 0%| | 1479/1000000 [3:43:03<2244:13:56, 8.09s/it, lr=1e-5, step_loss=0.0594]
Steps: 0%| | 1480/1000000 [3:43:11<2207:30:50, 7.96s/it, lr=1e-5, step_loss=0.0594][RANK-0]: Step: [1480], local_loss=0.03569591045379639, train_loss=0.0510784275829792, time_cost=1.2024729251861572
+
Steps: 0%| | 1480/1000000 [3:43:11<2207:30:50, 7.96s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 1481/1000000 [3:43:17<2022:33:20, 7.29s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [1481], local_loss=0.023042818531394005, train_loss=0.04930403083562851, time_cost=1.3656513690948486
+
Steps: 0%| | 1481/1000000 [3:43:17<2022:33:20, 7.29s/it, lr=1e-5, step_loss=0.023]
Steps: 0%| | 1482/1000000 [3:43:25<2094:56:15, 7.55s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [1482], local_loss=0.04529556259512901, train_loss=0.05691671371459961, time_cost=2.36020565032959
+
Steps: 0%| | 1482/1000000 [3:43:25<2094:56:15, 7.55s/it, lr=1e-5, step_loss=0.0453]
Steps: 0%| | 1483/1000000 [3:43:38<2582:39:21, 9.31s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [1483], local_loss=0.05545184388756752, train_loss=0.07895219326019287, time_cost=7.266521215438843
+
Steps: 0%| | 1483/1000000 [3:43:38<2582:39:21, 9.31s/it, lr=1e-5, step_loss=0.0555]
Steps: 0%| | 1484/1000000 [3:43:49<2706:24:09, 9.76s/it, lr=1e-5, step_loss=0.0555][RANK-0]: Step: [1484], local_loss=0.0401262491941452, train_loss=0.08027403056621552, time_cost=1.9709393978118896
+
Steps: 0%| | 1484/1000000 [3:43:49<2706:24:09, 9.76s/it, lr=1e-5, step_loss=0.0401]
Steps: 0%| | 1485/1000000 [3:44:02<2988:53:34, 10.78s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [1485], local_loss=0.04229775816202164, train_loss=0.038832955062389374, time_cost=4.212770700454712
+
Steps: 0%| | 1485/1000000 [3:44:02<2988:53:34, 10.78s/it, lr=1e-5, step_loss=0.0423]
Steps: 0%| | 1486/1000000 [3:44:15<3164:04:04, 11.41s/it, lr=1e-5, step_loss=0.0423][RANK-0]: Step: [1486], local_loss=0.07518672943115234, train_loss=0.10309465229511261, time_cost=3.7685327529907227
+
Steps: 0%| | 1486/1000000 [3:44:15<3164:04:04, 11.41s/it, lr=1e-5, step_loss=0.0752]
Steps: 0%| | 1487/1000000 [3:44:24<2983:13:29, 10.76s/it, lr=1e-5, step_loss=0.0752][RANK-0]: Step: [1487], local_loss=0.039089009165763855, train_loss=0.07247753441333771, time_cost=3.4328200817108154
+
Steps: 0%| | 1487/1000000 [3:44:24<2983:13:29, 10.76s/it, lr=1e-5, step_loss=0.0391]
Steps: 0%| | 1488/1000000 [3:44:31<2676:00:28, 9.65s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [1488], local_loss=0.021413013339042664, train_loss=0.16380718350410461, time_cost=1.438324213027954
+
Steps: 0%| | 1488/1000000 [3:44:31<2676:00:28, 9.65s/it, lr=1e-5, step_loss=0.0214]
Steps: 0%| | 1489/1000000 [3:44:38<2463:28:21, 8.88s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [1489], local_loss=0.02617025189101696, train_loss=0.03010427951812744, time_cost=2.1692962646484375
+
Steps: 0%| | 1489/1000000 [3:44:38<2463:28:21, 8.88s/it, lr=1e-5, step_loss=0.0262]
Steps: 0%| | 1490/1000000 [3:44:44<2198:07:41, 7.93s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [1490], local_loss=0.04431692138314247, train_loss=0.05464440584182739, time_cost=2.977891445159912
+
Steps: 0%| | 1490/1000000 [3:44:44<2198:07:41, 7.93s/it, lr=1e-5, step_loss=0.0443]
Steps: 0%| | 1491/1000000 [3:44:52<2232:00:56, 8.05s/it, lr=1e-5, step_loss=0.0443][RANK-0]: Step: [1491], local_loss=0.04632442444562912, train_loss=10.641195297241211, time_cost=1.6769883632659912
+
Steps: 0%| | 1491/1000000 [3:44:52<2232:00:56, 8.05s/it, lr=1e-5, step_loss=0.0463]
Steps: 0%| | 1492/1000000 [3:45:06<2660:53:31, 9.59s/it, lr=1e-5, step_loss=0.0463][RANK-0]: Step: [1492], local_loss=0.05773632600903511, train_loss=0.04560856893658638, time_cost=4.345928430557251
+
Steps: 0%| | 1492/1000000 [3:45:06<2660:53:31, 9.59s/it, lr=1e-5, step_loss=0.0577]
Steps: 0%| | 1493/1000000 [3:45:11<2291:31:18, 8.26s/it, lr=1e-5, step_loss=0.0577][RANK-0]: Step: [1493], local_loss=0.026576844975352287, train_loss=0.178567573428154, time_cost=2.586557626724243
+
Steps: 0%| | 1493/1000000 [3:45:11<2291:31:18, 8.26s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 1494/1000000 [3:45:20<2354:58:17, 8.49s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [1494], local_loss=0.030330680310726166, train_loss=0.030765609815716743, time_cost=2.1854279041290283
+
Steps: 0%| | 1494/1000000 [3:45:20<2354:58:17, 8.49s/it, lr=1e-5, step_loss=0.0303]
Steps: 0%| | 1495/1000000 [3:45:27<2242:17:27, 8.08s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [1495], local_loss=0.05618198215961456, train_loss=0.06261852383613586, time_cost=1.7016031742095947
+
Steps: 0%| | 1495/1000000 [3:45:27<2242:17:27, 8.08s/it, lr=1e-5, step_loss=0.0562]
Steps: 0%| | 1496/1000000 [3:45:41<2722:30:41, 9.82s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [1496], local_loss=0.035862017422914505, train_loss=0.04866497963666916, time_cost=2.4488933086395264
+
Steps: 0%| | 1496/1000000 [3:45:41<2722:30:41, 9.82s/it, lr=1e-5, step_loss=0.0359]
Steps: 0%| | 1497/1000000 [3:45:47<2399:42:24, 8.65s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [1497], local_loss=0.031070489436388016, train_loss=0.065670445561409, time_cost=4.1224799156188965
+
Steps: 0%| | 1497/1000000 [3:45:47<2399:42:24, 8.65s/it, lr=1e-5, step_loss=0.0311]
Steps: 0%| | 1498/1000000 [3:45:52<2153:18:12, 7.76s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [1498], local_loss=0.05597858130931854, train_loss=0.06121167540550232, time_cost=2.9300897121429443
+
Steps: 0%| | 1498/1000000 [3:45:52<2153:18:12, 7.76s/it, lr=1e-5, step_loss=0.056]
Steps: 0%| | 1499/1000000 [3:46:03<2409:31:45, 8.69s/it, lr=1e-5, step_loss=0.056][RANK-0]: Step: [1499], local_loss=0.17463694512844086, train_loss=0.09143417328596115, time_cost=3.302670478820801
+
Steps: 0%| | 1499/1000000 [3:46:03<2409:31:45, 8.69s/it, lr=1e-5, step_loss=0.175]
Steps: 0%| | 1500/1000000 [3:46:12<2439:34:10, 8.80s/it, lr=1e-5, step_loss=0.175][RANK-0]: Step: [1500], local_loss=0.02793874964118004, train_loss=0.03644730895757675, time_cost=3.14489483833313
+
Steps: 0%| | 1500/1000000 [3:46:12<2439:34:10, 8.80s/it, lr=1e-5, step_loss=0.0279]
Steps: 0%| | 1501/1000000 [3:46:18<2203:41:27, 7.95s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [1501], local_loss=0.04794275015592575, train_loss=0.06235755607485771, time_cost=4.471774578094482
+
Steps: 0%| | 1501/1000000 [3:46:18<2203:41:27, 7.95s/it, lr=1e-5, step_loss=0.0479]
Steps: 0%| | 1502/1000000 [3:46:23<1943:03:19, 7.01s/it, lr=1e-5, step_loss=0.0479][RANK-0]: Step: [1502], local_loss=0.12442349642515182, train_loss=0.05511637032032013, time_cost=2.2756729125976562
+
Steps: 0%| | 1502/1000000 [3:46:23<1943:03:19, 7.01s/it, lr=1e-5, step_loss=0.124]
Steps: 0%| | 1503/1000000 [3:46:36<2442:54:19, 8.81s/it, lr=1e-5, step_loss=0.124][RANK-0]: Step: [1503], local_loss=0.0258333720266819, train_loss=0.10607141256332397, time_cost=3.644352436065674
+
Steps: 0%| | 1503/1000000 [3:46:36<2442:54:19, 8.81s/it, lr=1e-5, step_loss=0.0258]
Steps: 0%| | 1504/1000000 [3:46:50<2870:14:40, 10.35s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [1504], local_loss=0.04130510985851288, train_loss=0.07512069493532181, time_cost=4.628422737121582
+
Steps: 0%| | 1504/1000000 [3:46:50<2870:14:40, 10.35s/it, lr=1e-5, step_loss=0.0413]
Steps: 0%| | 1505/1000000 [3:47:05<3240:51:50, 11.68s/it, lr=1e-5, step_loss=0.0413][RANK-0]: Step: [1505], local_loss=0.05187337100505829, train_loss=0.09754344820976257, time_cost=5.997148513793945
+
Steps: 0%| | 1505/1000000 [3:47:05<3240:51:50, 11.68s/it, lr=1e-5, step_loss=0.0519]
Steps: 0%| | 1506/1000000 [3:47:14<3001:34:07, 10.82s/it, lr=1e-5, step_loss=0.0519][RANK-0]: Step: [1506], local_loss=0.035234976559877396, train_loss=0.029637880623340607, time_cost=3.130117177963257
+
Steps: 0%| | 1506/1000000 [3:47:14<3001:34:07, 10.82s/it, lr=1e-5, step_loss=0.0352]
Steps: 0%| | 1507/1000000 [3:47:22<2827:56:08, 10.20s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [1507], local_loss=0.06584112346172333, train_loss=0.03769709914922714, time_cost=1.7006688117980957
+
Steps: 0%| | 1507/1000000 [3:47:22<2827:56:08, 10.20s/it, lr=1e-5, step_loss=0.0658]
Steps: 0%| | 1508/1000000 [3:47:30<2598:26:56, 9.37s/it, lr=1e-5, step_loss=0.0658][RANK-0]: Step: [1508], local_loss=0.056230396032333374, train_loss=0.04372960701584816, time_cost=1.8745040893554688
+
Steps: 0%| | 1508/1000000 [3:47:30<2598:26:56, 9.37s/it, lr=1e-5, step_loss=0.0562]
Steps: 0%| | 1509/1000000 [3:47:35<2249:57:24, 8.11s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [1509], local_loss=0.07361602783203125, train_loss=0.06849321722984314, time_cost=1.218480110168457
+
Steps: 0%| | 1509/1000000 [3:47:35<2249:57:24, 8.11s/it, lr=1e-5, step_loss=0.0736]
Steps: 0%| | 1510/1000000 [3:47:39<1920:24:45, 6.92s/it, lr=1e-5, step_loss=0.0736][RANK-0]: Step: [1510], local_loss=0.043748971074819565, train_loss=0.04506600648164749, time_cost=1.4647681713104248
+
Steps: 0%| | 1510/1000000 [3:47:39<1920:24:45, 6.92s/it, lr=1e-5, step_loss=0.0437]
Steps: 0%| | 1511/1000000 [3:47:45<1822:21:51, 6.57s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [1511], local_loss=0.025586187839508057, train_loss=0.046302422881126404, time_cost=1.3983979225158691
+
Steps: 0%| | 1511/1000000 [3:47:45<1822:21:51, 6.57s/it, lr=1e-5, step_loss=0.0256]
Steps: 0%| | 1512/1000000 [3:47:59<2456:50:36, 8.86s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [1512], local_loss=0.03818539157509804, train_loss=0.15990474820137024, time_cost=2.818840980529785
+
Steps: 0%| | 1512/1000000 [3:47:59<2456:50:36, 8.86s/it, lr=1e-5, step_loss=0.0382]
Steps: 0%| | 1513/1000000 [3:48:15<3029:35:22, 10.92s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [1513], local_loss=0.01453362312167883, train_loss=6.0619306564331055, time_cost=7.31719183921814
+
Steps: 0%| | 1513/1000000 [3:48:15<3029:35:22, 10.92s/it, lr=1e-5, step_loss=0.0145]
Steps: 0%| | 1514/1000000 [3:48:27<3171:54:22, 11.44s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [1514], local_loss=0.027715537697076797, train_loss=0.05013709515333176, time_cost=3.703519105911255
+
Steps: 0%| | 1514/1000000 [3:48:27<3171:54:22, 11.44s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 1515/1000000 [3:48:39<3162:35:46, 11.40s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [1515], local_loss=0.06133468076586723, train_loss=0.06555299460887909, time_cost=1.9834802150726318
+
Steps: 0%| | 1515/1000000 [3:48:39<3162:35:46, 11.40s/it, lr=1e-5, step_loss=0.0613]
Steps: 0%| | 1516/1000000 [3:48:45<2713:31:01, 9.78s/it, lr=1e-5, step_loss=0.0613][RANK-0]: Step: [1516], local_loss=0.027100956067442894, train_loss=0.027458462864160538, time_cost=1.290123701095581
+
Steps: 0%| | 1516/1000000 [3:48:45<2713:31:01, 9.78s/it, lr=1e-5, step_loss=0.0271]
Steps: 0%| | 1517/1000000 [3:48:52<2529:31:55, 9.12s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [1517], local_loss=0.04396318271756172, train_loss=0.03674376383423805, time_cost=3.5041580200195312
+
Steps: 0%| | 1517/1000000 [3:48:52<2529:31:55, 9.12s/it, lr=1e-5, step_loss=0.044]
Steps: 0%| | 1518/1000000 [3:48:59<2336:38:18, 8.42s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [1518], local_loss=0.3382887542247772, train_loss=0.11176639795303345, time_cost=2.9729537963867188
+
Steps: 0%| | 1518/1000000 [3:48:59<2336:38:18, 8.42s/it, lr=1e-5, step_loss=0.338]
Steps: 0%| | 1519/1000000 [3:49:05<2131:29:37, 7.69s/it, lr=1e-5, step_loss=0.338][RANK-0]: Step: [1519], local_loss=0.036916252225637436, train_loss=0.02800053358078003, time_cost=1.761373519897461
+
Steps: 0%| | 1519/1000000 [3:49:05<2131:29:37, 7.69s/it, lr=1e-5, step_loss=0.0369]
Steps: 0%| | 1520/1000000 [3:49:18<2536:09:45, 9.14s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [1520], local_loss=0.05075913667678833, train_loss=0.08505956083536148, time_cost=8.8534836769104
+
Steps: 0%| | 1520/1000000 [3:49:18<2536:09:45, 9.14s/it, lr=1e-5, step_loss=0.0508]
Steps: 0%| | 1521/1000000 [3:49:26<2453:08:02, 8.84s/it, lr=1e-5, step_loss=0.0508][RANK-0]: Step: [1521], local_loss=0.03871230408549309, train_loss=0.05437072366476059, time_cost=1.4270801544189453
+
Steps: 0%| | 1521/1000000 [3:49:26<2453:08:02, 8.84s/it, lr=1e-5, step_loss=0.0387]
Steps: 0%| | 1522/1000000 [3:49:33<2317:08:57, 8.35s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [1522], local_loss=0.035166285932064056, train_loss=0.10078796744346619, time_cost=5.496824026107788
+
Steps: 0%| | 1522/1000000 [3:49:33<2317:08:57, 8.35s/it, lr=1e-5, step_loss=0.0352]
Steps: 0%| | 1523/1000000 [3:49:38<2049:33:58, 7.39s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [1523], local_loss=0.031181177124381065, train_loss=0.1453579217195511, time_cost=3.8920793533325195
+
Steps: 0%| | 1523/1000000 [3:49:38<2049:33:58, 7.39s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 1524/1000000 [3:49:49<2354:53:20, 8.49s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [1524], local_loss=0.035833690315485, train_loss=0.05680131912231445, time_cost=3.2753312587738037
+
Steps: 0%| | 1524/1000000 [3:49:49<2354:53:20, 8.49s/it, lr=1e-5, step_loss=0.0358]
Steps: 0%| | 1525/1000000 [3:49:58<2360:04:25, 8.51s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [1525], local_loss=0.14725624024868011, train_loss=0.07211077213287354, time_cost=2.6283788681030273
+
Steps: 0%| | 1525/1000000 [3:49:58<2360:04:25, 8.51s/it, lr=1e-5, step_loss=0.147]
Steps: 0%| | 1526/1000000 [3:50:04<2141:27:16, 7.72s/it, lr=1e-5, step_loss=0.147][RANK-0]: Step: [1526], local_loss=0.03128473460674286, train_loss=0.07894094288349152, time_cost=4.445664882659912
+
Steps: 0%| | 1526/1000000 [3:50:04<2141:27:16, 7.72s/it, lr=1e-5, step_loss=0.0313]
Steps: 0%| | 1527/1000000 [3:50:15<2427:55:06, 8.75s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [1527], local_loss=0.198488250374794, train_loss=0.05468904972076416, time_cost=2.2688231468200684
+
Steps: 0%| | 1527/1000000 [3:50:15<2427:55:06, 8.75s/it, lr=1e-5, step_loss=0.198]
Steps: 0%| | 1528/1000000 [3:50:19<2071:09:06, 7.47s/it, lr=1e-5, step_loss=0.198][RANK-0]: Step: [1528], local_loss=0.03729777783155441, train_loss=38.52643585205078, time_cost=1.5131173133850098
+
Steps: 0%| | 1528/1000000 [3:50:19<2071:09:06, 7.47s/it, lr=1e-5, step_loss=0.0373]
Steps: 0%| | 1529/1000000 [3:50:27<2058:15:03, 7.42s/it, lr=1e-5, step_loss=0.0373][RANK-0]: Step: [1529], local_loss=0.01993776299059391, train_loss=0.05984167754650116, time_cost=1.4140934944152832
+
Steps: 0%| | 1529/1000000 [3:50:27<2058:15:03, 7.42s/it, lr=1e-5, step_loss=0.0199]
Steps: 0%| | 1530/1000000 [3:50:40<2531:07:47, 9.13s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [1530], local_loss=0.048536039888858795, train_loss=0.049923382699489594, time_cost=1.2138729095458984
+
Steps: 0%| | 1530/1000000 [3:50:40<2531:07:47, 9.13s/it, lr=1e-5, step_loss=0.0485]
Steps: 0%| | 1531/1000000 [3:50:45<2178:15:20, 7.85s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [1531], local_loss=0.026905333623290062, train_loss=0.03780758008360863, time_cost=2.2760181427001953
+
Steps: 0%| | 1531/1000000 [3:50:45<2178:15:20, 7.85s/it, lr=1e-5, step_loss=0.0269]
Steps: 0%| | 1532/1000000 [3:51:00<2837:01:34, 10.23s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [1532], local_loss=0.04085109755396843, train_loss=0.07859581708908081, time_cost=3.5368363857269287
+
Steps: 0%| | 1532/1000000 [3:51:00<2837:01:34, 10.23s/it, lr=1e-5, step_loss=0.0409]
Steps: 0%| | 1533/1000000 [3:51:07<2523:38:16, 9.10s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [1533], local_loss=0.03218960762023926, train_loss=0.15226580202579498, time_cost=2.329373359680176
+
Steps: 0%| | 1533/1000000 [3:51:07<2523:38:16, 9.10s/it, lr=1e-5, step_loss=0.0322]
Steps: 0%| | 1534/1000000 [3:51:20<2873:29:49, 10.36s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [1534], local_loss=0.02846260741353035, train_loss=0.0279727540910244, time_cost=1.2296476364135742
+
Steps: 0%| | 1534/1000000 [3:51:20<2873:29:49, 10.36s/it, lr=1e-5, step_loss=0.0285]
Steps: 0%| | 1535/1000000 [3:51:33<3110:49:38, 11.22s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [1535], local_loss=0.024145759642124176, train_loss=0.07086187601089478, time_cost=5.3539793491363525
+
Steps: 0%| | 1535/1000000 [3:51:33<3110:49:38, 11.22s/it, lr=1e-5, step_loss=0.0241]
Steps: 0%| | 1536/1000000 [3:51:44<3073:50:35, 11.08s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [1536], local_loss=0.055539947003126144, train_loss=0.06331079453229904, time_cost=4.026515483856201
+
Steps: 0%| | 1536/1000000 [3:51:44<3073:50:35, 11.08s/it, lr=1e-5, step_loss=0.0555]
Steps: 0%| | 1537/1000000 [3:51:52<2798:51:43, 10.09s/it, lr=1e-5, step_loss=0.0555][RANK-0]: Step: [1537], local_loss=0.2235010713338852, train_loss=0.08588237315416336, time_cost=2.9397921562194824
+
Steps: 0%| | 1537/1000000 [3:51:52<2798:51:43, 10.09s/it, lr=1e-5, step_loss=0.224]
Steps: 0%| | 1538/1000000 [3:52:05<3050:12:28, 11.00s/it, lr=1e-5, step_loss=0.224][RANK-0]: Step: [1538], local_loss=0.026115402579307556, train_loss=0.04791697859764099, time_cost=3.69781756401062
+
Steps: 0%| | 1538/1000000 [3:52:05<3050:12:28, 11.00s/it, lr=1e-5, step_loss=0.0261]
Steps: 0%| | 1539/1000000 [3:52:15<2980:29:56, 10.75s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [1539], local_loss=0.02281530573964119, train_loss=0.03781792148947716, time_cost=1.927765130996704
+
Steps: 0%| | 1539/1000000 [3:52:15<2980:29:56, 10.75s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 1540/1000000 [3:52:22<2683:44:43, 9.68s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [1540], local_loss=0.037948720157146454, train_loss=0.03501768410205841, time_cost=2.456648349761963
+
Steps: 0%| | 1540/1000000 [3:52:22<2683:44:43, 9.68s/it, lr=1e-5, step_loss=0.0379]
Steps: 0%| | 1541/1000000 [3:52:33<2773:07:13, 10.00s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [1541], local_loss=0.026395611464977264, train_loss=0.09969247877597809, time_cost=3.5873804092407227
+
Steps: 0%| | 1541/1000000 [3:52:33<2773:07:13, 10.00s/it, lr=1e-5, step_loss=0.0264]
Steps: 0%| | 1542/1000000 [3:52:45<2906:15:00, 10.48s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [1542], local_loss=0.05523885041475296, train_loss=0.05211244896054268, time_cost=1.227785587310791
+
Steps: 0%| | 1542/1000000 [3:52:45<2906:15:00, 10.48s/it, lr=1e-5, step_loss=0.0552]
Steps: 0%| | 1543/1000000 [3:52:59<3243:36:01, 11.70s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [1543], local_loss=0.02232881635427475, train_loss=0.04496920108795166, time_cost=5.323687553405762
+
Steps: 0%| | 1543/1000000 [3:52:59<3243:36:01, 11.70s/it, lr=1e-5, step_loss=0.0223]
Steps: 0%| | 1544/1000000 [3:53:05<2740:56:04, 9.88s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [1544], local_loss=0.05299455299973488, train_loss=0.19080357253551483, time_cost=2.0446271896362305
+
Steps: 0%| | 1544/1000000 [3:53:05<2740:56:04, 9.88s/it, lr=1e-5, step_loss=0.053]
Steps: 0%| | 1545/1000000 [3:53:16<2857:53:09, 10.30s/it, lr=1e-5, step_loss=0.053][RANK-0]: Step: [1545], local_loss=0.02603161707520485, train_loss=0.08573213964700699, time_cost=8.062614917755127
+
Steps: 0%| | 1545/1000000 [3:53:16<2857:53:09, 10.30s/it, lr=1e-5, step_loss=0.026]
Steps: 0%| | 1546/1000000 [3:53:29<3083:06:53, 11.12s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [1546], local_loss=0.20166848599910736, train_loss=0.10127011686563492, time_cost=1.2404799461364746
+
Steps: 0%| | 1546/1000000 [3:53:29<3083:06:53, 11.12s/it, lr=1e-5, step_loss=0.202]
Steps: 0%| | 1547/1000000 [3:53:34<2600:38:10, 9.38s/it, lr=1e-5, step_loss=0.202][RANK-0]: Step: [1547], local_loss=0.021161876618862152, train_loss=0.02477174997329712, time_cost=2.0123326778411865
+
Steps: 0%| | 1547/1000000 [3:53:34<2600:38:10, 9.38s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 1548/1000000 [3:53:40<2261:01:23, 8.15s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [1548], local_loss=0.02787012979388237, train_loss=0.05923715978860855, time_cost=2.0128426551818848
+
Steps: 0%| | 1548/1000000 [3:53:40<2261:01:23, 8.15s/it, lr=1e-5, step_loss=0.0279]
Steps: 0%| | 1549/1000000 [3:53:48<2240:36:19, 8.08s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [1549], local_loss=0.07223102450370789, train_loss=0.2781749963760376, time_cost=1.2230205535888672
+
Steps: 0%| | 1549/1000000 [3:53:48<2240:36:19, 8.08s/it, lr=1e-5, step_loss=0.0722]
Steps: 0%| | 1550/1000000 [3:53:52<1944:50:58, 7.01s/it, lr=1e-5, step_loss=0.0722][RANK-0]: Step: [1550], local_loss=0.01643342338502407, train_loss=0.046322502195835114, time_cost=1.2064685821533203
+
Steps: 0%| | 1550/1000000 [3:53:52<1944:50:58, 7.01s/it, lr=1e-5, step_loss=0.0164]
Steps: 0%| | 1551/1000000 [3:53:58<1829:45:34, 6.60s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [1551], local_loss=0.0292540080845356, train_loss=0.06499995291233063, time_cost=4.566867351531982
+
Steps: 0%| | 1551/1000000 [3:53:58<1829:45:34, 6.60s/it, lr=1e-5, step_loss=0.0293]
Steps: 0%| | 1552/1000000 [3:54:04<1754:02:36, 6.32s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [1552], local_loss=0.059942297637462616, train_loss=0.096779964864254, time_cost=1.2010350227355957
+
Steps: 0%| | 1552/1000000 [3:54:04<1754:02:36, 6.32s/it, lr=1e-5, step_loss=0.0599]
Steps: 0%| | 1553/1000000 [3:54:10<1804:17:47, 6.51s/it, lr=1e-5, step_loss=0.0599][RANK-0]: Step: [1553], local_loss=0.023285675793886185, train_loss=0.05251483619213104, time_cost=2.3327789306640625
+
Steps: 0%| | 1553/1000000 [3:54:10<1804:17:47, 6.51s/it, lr=1e-5, step_loss=0.0233]
Steps: 0%| | 1554/1000000 [3:54:22<2198:51:59, 7.93s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [1554], local_loss=0.03900332376360893, train_loss=0.0465751513838768, time_cost=3.2517125606536865
+
Steps: 0%| | 1554/1000000 [3:54:22<2198:51:59, 7.93s/it, lr=1e-5, step_loss=0.039]
Steps: 0%| | 1555/1000000 [3:54:31<2300:15:06, 8.29s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [1555], local_loss=0.13343848288059235, train_loss=0.16755607724189758, time_cost=1.228363275527954
+
Steps: 0%| | 1555/1000000 [3:54:31<2300:15:06, 8.29s/it, lr=1e-5, step_loss=0.133]
Steps: 0%| | 1556/1000000 [3:54:37<2084:33:27, 7.52s/it, lr=1e-5, step_loss=0.133][RANK-0]: Step: [1556], local_loss=0.0223690178245306, train_loss=0.05593295022845268, time_cost=1.5379819869995117
+
Steps: 0%| | 1556/1000000 [3:54:37<2084:33:27, 7.52s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 1557/1000000 [3:54:41<1830:22:55, 6.60s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [1557], local_loss=0.03826005011796951, train_loss=0.04957875981926918, time_cost=1.2457385063171387
+
Steps: 0%| | 1557/1000000 [3:54:41<1830:22:55, 6.60s/it, lr=1e-5, step_loss=0.0383]
Steps: 0%| | 1558/1000000 [3:54:52<2234:59:23, 8.06s/it, lr=1e-5, step_loss=0.0383][RANK-0]: Step: [1558], local_loss=0.053313981741666794, train_loss=0.04570429027080536, time_cost=1.2372236251831055
+
Steps: 0%| | 1558/1000000 [3:54:52<2234:59:23, 8.06s/it, lr=1e-5, step_loss=0.0533]
Steps: 0%| | 1559/1000000 [3:55:06<2654:18:55, 9.57s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [1559], local_loss=0.06597153842449188, train_loss=0.16893450915813446, time_cost=6.289929389953613
+
Steps: 0%| | 1559/1000000 [3:55:06<2654:18:55, 9.57s/it, lr=1e-5, step_loss=0.066]
Steps: 0%| | 1560/1000000 [3:55:12<2393:10:55, 8.63s/it, lr=1e-5, step_loss=0.066][RANK-0]: Step: [1560], local_loss=0.040779609233140945, train_loss=0.05796333774924278, time_cost=4.6790101528167725
+
Steps: 0%| | 1560/1000000 [3:55:12<2393:10:55, 8.63s/it, lr=1e-5, step_loss=0.0408]
Steps: 0%| | 1561/1000000 [3:55:23<2595:16:12, 9.36s/it, lr=1e-5, step_loss=0.0408][RANK-0]: Step: [1561], local_loss=0.031614646315574646, train_loss=0.03571490943431854, time_cost=3.6333415508270264
+
Steps: 0%| | 1561/1000000 [3:55:23<2595:16:12, 9.36s/it, lr=1e-5, step_loss=0.0316]
Steps: 0%| | 1562/1000000 [3:55:28<2224:18:22, 8.02s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [1562], local_loss=0.2793225347995758, train_loss=0.0900043472647667, time_cost=2.427722454071045
+
Steps: 0%| | 1562/1000000 [3:55:28<2224:18:22, 8.02s/it, lr=1e-5, step_loss=0.279]
Steps: 0%| | 1563/1000000 [3:55:37<2314:53:11, 8.35s/it, lr=1e-5, step_loss=0.279][RANK-0]: Step: [1563], local_loss=0.04578927904367447, train_loss=0.06265625357627869, time_cost=3.1296961307525635
+
Steps: 0%| | 1563/1000000 [3:55:37<2314:53:11, 8.35s/it, lr=1e-5, step_loss=0.0458]
Steps: 0%| | 1564/1000000 [3:55:51<2804:04:04, 10.11s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [1564], local_loss=0.027509188279509544, train_loss=0.03696172684431076, time_cost=2.284212589263916
+
Steps: 0%| | 1564/1000000 [3:55:51<2804:04:04, 10.11s/it, lr=1e-5, step_loss=0.0275]
Steps: 0%| | 1565/1000000 [3:55:58<2503:17:08, 9.03s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [1565], local_loss=0.08821149170398712, train_loss=0.06285425275564194, time_cost=3.045684576034546
+
Steps: 0%| | 1565/1000000 [3:55:58<2503:17:08, 9.03s/it, lr=1e-5, step_loss=0.0882]
Steps: 0%| | 1566/1000000 [3:56:03<2183:30:57, 7.87s/it, lr=1e-5, step_loss=0.0882][RANK-0]: Step: [1566], local_loss=0.055579859763383865, train_loss=0.22102168202400208, time_cost=2.4154410362243652
+
Steps: 0%| | 1566/1000000 [3:56:03<2183:30:57, 7.87s/it, lr=1e-5, step_loss=0.0556]
Steps: 0%| | 1567/1000000 [3:56:07<1897:04:41, 6.84s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [1567], local_loss=0.0322505384683609, train_loss=0.08784276247024536, time_cost=1.7232141494750977
+
Steps: 0%| | 1567/1000000 [3:56:07<1897:04:41, 6.84s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 1568/1000000 [3:56:20<2356:37:43, 8.50s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [1568], local_loss=0.022660214453935623, train_loss=0.06913848966360092, time_cost=10.519511938095093
+
Steps: 0%| | 1568/1000000 [3:56:20<2356:37:43, 8.50s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 1569/1000000 [3:56:31<2586:04:03, 9.32s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [1569], local_loss=0.04442734643816948, train_loss=0.09368207305669785, time_cost=1.8202362060546875
+
Steps: 0%| | 1569/1000000 [3:56:31<2586:04:03, 9.32s/it, lr=1e-5, step_loss=0.0444]
Steps: 0%| | 1570/1000000 [3:56:47<3120:37:17, 11.25s/it, lr=1e-5, step_loss=0.0444][RANK-0]: Step: [1570], local_loss=0.03308922052383423, train_loss=0.03579577058553696, time_cost=8.060088872909546
+
Steps: 0%| | 1570/1000000 [3:56:47<3120:37:17, 11.25s/it, lr=1e-5, step_loss=0.0331]
Steps: 0%| | 1571/1000000 [3:56:58<3095:50:13, 11.16s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [1571], local_loss=0.138038769364357, train_loss=0.08074513077735901, time_cost=1.4612579345703125
+
Steps: 0%| | 1571/1000000 [3:56:58<3095:50:13, 11.16s/it, lr=1e-5, step_loss=0.138]
Steps: 0%| | 1572/1000000 [3:57:07<2904:52:17, 10.47s/it, lr=1e-5, step_loss=0.138][RANK-0]: Step: [1572], local_loss=0.08821910619735718, train_loss=0.07499995827674866, time_cost=3.013638496398926
+
Steps: 0%| | 1572/1000000 [3:57:07<2904:52:17, 10.47s/it, lr=1e-5, step_loss=0.0882]
Steps: 0%| | 1573/1000000 [3:57:12<2470:33:11, 8.91s/it, lr=1e-5, step_loss=0.0882][RANK-0]: Step: [1573], local_loss=0.02882295288145542, train_loss=0.09691682457923889, time_cost=1.448427677154541
+
Steps: 0%| | 1573/1000000 [3:57:12<2470:33:11, 8.91s/it, lr=1e-5, step_loss=0.0288]
Steps: 0%| | 1574/1000000 [3:57:25<2843:40:42, 10.25s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [1574], local_loss=0.017935432493686676, train_loss=0.061028409749269485, time_cost=5.640366077423096
+
Steps: 0%| | 1574/1000000 [3:57:25<2843:40:42, 10.25s/it, lr=1e-5, step_loss=0.0179]
Steps: 0%| | 1575/1000000 [3:57:37<2945:46:07, 10.62s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [1575], local_loss=0.028799893334507942, train_loss=0.06492157280445099, time_cost=2.379080295562744
+
Steps: 0%| | 1575/1000000 [3:57:37<2945:46:07, 10.62s/it, lr=1e-5, step_loss=0.0288]
Steps: 0%| | 1576/1000000 [3:57:46<2850:13:40, 10.28s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [1576], local_loss=0.051778726279735565, train_loss=0.042804569005966187, time_cost=1.236353874206543
+
Steps: 0%| | 1576/1000000 [3:57:46<2850:13:40, 10.28s/it, lr=1e-5, step_loss=0.0518]
Steps: 0%| | 1577/1000000 [3:57:54<2665:02:22, 9.61s/it, lr=1e-5, step_loss=0.0518][RANK-0]: Step: [1577], local_loss=0.17336729168891907, train_loss=0.06022927165031433, time_cost=1.3080480098724365
+
Steps: 0%| | 1577/1000000 [3:57:54<2665:02:22, 9.61s/it, lr=1e-5, step_loss=0.173]
Steps: 0%| | 1578/1000000 [3:58:01<2438:15:44, 8.79s/it, lr=1e-5, step_loss=0.173][RANK-0]: Step: [1578], local_loss=0.024459626525640488, train_loss=0.06534816324710846, time_cost=2.491454601287842
+
Steps: 0%| | 1578/1000000 [3:58:01<2438:15:44, 8.79s/it, lr=1e-5, step_loss=0.0245]
Steps: 0%| | 1579/1000000 [3:58:10<2474:11:51, 8.92s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [1579], local_loss=0.057332396507263184, train_loss=0.07347332686185837, time_cost=3.672377109527588
+
Steps: 0%| | 1579/1000000 [3:58:10<2474:11:51, 8.92s/it, lr=1e-5, step_loss=0.0573]
Steps: 0%| | 1580/1000000 [3:58:15<2102:16:39, 7.58s/it, lr=1e-5, step_loss=0.0573][RANK-0]: Step: [1580], local_loss=0.059868309646844864, train_loss=0.08424582332372665, time_cost=1.2196166515350342
+
Steps: 0%| | 1580/1000000 [3:58:15<2102:16:39, 7.58s/it, lr=1e-5, step_loss=0.0599]
Steps: 0%| | 1581/1000000 [3:58:20<1893:40:30, 6.83s/it, lr=1e-5, step_loss=0.0599][RANK-0]: Step: [1581], local_loss=0.27599143981933594, train_loss=0.08520478010177612, time_cost=2.6020493507385254
+
Steps: 0%| | 1581/1000000 [3:58:20<1893:40:30, 6.83s/it, lr=1e-5, step_loss=0.276]
Steps: 0%| | 1582/1000000 [3:58:25<1788:07:56, 6.45s/it, lr=1e-5, step_loss=0.276][RANK-0]: Step: [1582], local_loss=0.07050434499979019, train_loss=0.10382112860679626, time_cost=3.0596606731414795
+
Steps: 0%| | 1582/1000000 [3:58:25<1788:07:56, 6.45s/it, lr=1e-5, step_loss=0.0705]
Steps: 0%| | 1583/1000000 [3:58:37<2225:29:38, 8.02s/it, lr=1e-5, step_loss=0.0705][RANK-0]: Step: [1583], local_loss=0.03917314112186432, train_loss=0.1691719889640808, time_cost=4.2621564865112305
+
Steps: 0%| | 1583/1000000 [3:58:37<2225:29:38, 8.02s/it, lr=1e-5, step_loss=0.0392]
Steps: 0%| | 1584/1000000 [3:58:42<2001:28:59, 7.22s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [1584], local_loss=0.04362304136157036, train_loss=0.039187319576740265, time_cost=2.3074865341186523
+
Steps: 0%| | 1584/1000000 [3:58:42<2001:28:59, 7.22s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 1585/1000000 [3:58:48<1902:31:47, 6.86s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [1585], local_loss=0.03320460021495819, train_loss=0.24181926250457764, time_cost=1.785853385925293
+
Steps: 0%| | 1585/1000000 [3:58:48<1902:31:47, 6.86s/it, lr=1e-5, step_loss=0.0332]
Steps: 0%| | 1586/1000000 [3:58:54<1822:53:16, 6.57s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [1586], local_loss=0.32792168855667114, train_loss=0.08653882145881653, time_cost=1.748995304107666
+
Steps: 0%| | 1586/1000000 [3:58:54<1822:53:16, 6.57s/it, lr=1e-5, step_loss=0.328]
Steps: 0%| | 1587/1000000 [3:59:00<1716:25:37, 6.19s/it, lr=1e-5, step_loss=0.328][RANK-0]: Step: [1587], local_loss=0.030946675688028336, train_loss=0.08507397770881653, time_cost=2.2734367847442627
+
Steps: 0%| | 1587/1000000 [3:59:00<1716:25:37, 6.19s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 1588/1000000 [3:59:06<1696:23:05, 6.12s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [1588], local_loss=0.07962793856859207, train_loss=0.04030440375208855, time_cost=4.5195019245147705
+
Steps: 0%| | 1588/1000000 [3:59:06<1696:23:05, 6.12s/it, lr=1e-5, step_loss=0.0796]
Steps: 0%| | 1589/1000000 [3:59:14<1920:44:50, 6.93s/it, lr=1e-5, step_loss=0.0796][RANK-0]: Step: [1589], local_loss=0.09741201996803284, train_loss=0.047461479902267456, time_cost=1.726243495941162
+
Steps: 0%| | 1589/1000000 [3:59:14<1920:44:50, 6.93s/it, lr=1e-5, step_loss=0.0974]
Steps: 0%| | 1590/1000000 [3:59:21<1849:23:50, 6.67s/it, lr=1e-5, step_loss=0.0974][RANK-0]: Step: [1590], local_loss=0.23586633801460266, train_loss=0.060331400483846664, time_cost=2.623676300048828
+
Steps: 0%| | 1590/1000000 [3:59:21<1849:23:50, 6.67s/it, lr=1e-5, step_loss=0.236]
Steps: 0%| | 1591/1000000 [3:59:34<2407:59:24, 8.68s/it, lr=1e-5, step_loss=0.236][RANK-0]: Step: [1591], local_loss=0.3456921875476837, train_loss=0.15604673326015472, time_cost=9.176746368408203
+
Steps: 0%| | 1591/1000000 [3:59:34<2407:59:24, 8.68s/it, lr=1e-5, step_loss=0.346]
Steps: 0%| | 1592/1000000 [3:59:43<2459:29:27, 8.87s/it, lr=1e-5, step_loss=0.346][RANK-0]: Step: [1592], local_loss=0.14814548194408417, train_loss=0.04463174194097519, time_cost=4.330984354019165
+
Steps: 0%| | 1592/1000000 [3:59:43<2459:29:27, 8.87s/it, lr=1e-5, step_loss=0.148]
Steps: 0%| | 1593/1000000 [3:59:55<2720:05:52, 9.81s/it, lr=1e-5, step_loss=0.148][RANK-0]: Step: [1593], local_loss=0.04481979086995125, train_loss=0.11972326785326004, time_cost=2.24082088470459
+
Steps: 0%| | 1593/1000000 [3:59:55<2720:05:52, 9.81s/it, lr=1e-5, step_loss=0.0448]
Steps: 0%| | 1594/1000000 [4:00:06<2791:37:30, 10.07s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [1594], local_loss=0.02042665146291256, train_loss=0.04536464065313339, time_cost=2.18291974067688
+
Steps: 0%| | 1594/1000000 [4:00:06<2791:37:30, 10.07s/it, lr=1e-5, step_loss=0.0204]
Steps: 0%| | 1595/1000000 [4:00:16<2811:18:28, 10.14s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [1595], local_loss=0.028838787227869034, train_loss=0.04187040030956268, time_cost=9.093592405319214
+
Steps: 0%| | 1595/1000000 [4:00:16<2811:18:28, 10.14s/it, lr=1e-5, step_loss=0.0288]
Steps: 0%| | 1596/1000000 [4:00:21<2336:53:12, 8.43s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [1596], local_loss=0.04636763408780098, train_loss=0.14458858966827393, time_cost=1.5092103481292725
+
Steps: 0%| | 1596/1000000 [4:00:21<2336:53:12, 8.43s/it, lr=1e-5, step_loss=0.0464]
Steps: 0%| | 1597/1000000 [4:00:28<2278:49:19, 8.22s/it, lr=1e-5, step_loss=0.0464][RANK-0]: Step: [1597], local_loss=0.09661591053009033, train_loss=0.08671797066926956, time_cost=1.9170799255371094
+
Steps: 0%| | 1597/1000000 [4:00:28<2278:49:19, 8.22s/it, lr=1e-5, step_loss=0.0966]
Steps: 0%| | 1598/1000000 [4:00:42<2696:46:40, 9.72s/it, lr=1e-5, step_loss=0.0966][RANK-0]: Step: [1598], local_loss=0.05694805830717087, train_loss=0.061126843094825745, time_cost=5.575053453445435
+
Steps: 0%| | 1598/1000000 [4:00:42<2696:46:40, 9.72s/it, lr=1e-5, step_loss=0.0569]
Steps: 0%| | 1599/1000000 [4:00:49<2485:38:20, 8.96s/it, lr=1e-5, step_loss=0.0569][RANK-0]: Step: [1599], local_loss=0.04541236162185669, train_loss=0.0452742874622345, time_cost=1.5635993480682373
+
Steps: 0%| | 1599/1000000 [4:00:49<2485:38:20, 8.96s/it, lr=1e-5, step_loss=0.0454]
Steps: 0%| | 1600/1000000 [4:00:59<2574:30:39, 9.28s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [1600], local_loss=0.04296058043837547, train_loss=0.03914854675531387, time_cost=8.515574932098389
+
Steps: 0%| | 1600/1000000 [4:00:59<2574:30:39, 9.28s/it, lr=1e-5, step_loss=0.043]
Steps: 0%| | 1601/1000000 [4:01:03<2149:30:44, 7.75s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [1601], local_loss=0.025644056499004364, train_loss=0.04053325951099396, time_cost=1.3600752353668213
+
Steps: 0%| | 1601/1000000 [4:01:03<2149:30:44, 7.75s/it, lr=1e-5, step_loss=0.0256]
Steps: 0%| | 1602/1000000 [4:01:19<2818:02:28, 10.16s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [1602], local_loss=0.04682868719100952, train_loss=0.054142966866493225, time_cost=6.283989667892456
+
Steps: 0%| | 1602/1000000 [4:01:19<2818:02:28, 10.16s/it, lr=1e-5, step_loss=0.0468]
Steps: 0%| | 1603/1000000 [4:01:27<2694:02:48, 9.71s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [1603], local_loss=0.023449476808309555, train_loss=0.16695845127105713, time_cost=2.8003716468811035
+
Steps: 0%| | 1603/1000000 [4:01:27<2694:02:48, 9.71s/it, lr=1e-5, step_loss=0.0234]
Steps: 0%| | 1604/1000000 [4:01:35<2511:44:07, 9.06s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [1604], local_loss=0.026991380378603935, train_loss=0.039145953953266144, time_cost=1.304764986038208
+
Steps: 0%| | 1604/1000000 [4:01:35<2511:44:07, 9.06s/it, lr=1e-5, step_loss=0.027]
Steps: 0%| | 1605/1000000 [4:01:40<2183:58:01, 7.87s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [1605], local_loss=0.14222462475299835, train_loss=0.05738705396652222, time_cost=3.8730835914611816
+
Steps: 0%| | 1605/1000000 [4:01:40<2183:58:01, 7.87s/it, lr=1e-5, step_loss=0.142]
Steps: 0%| | 1606/1000000 [4:01:55<2743:16:13, 9.89s/it, lr=1e-5, step_loss=0.142][RANK-0]: Step: [1606], local_loss=0.06093293055891991, train_loss=0.05267279967665672, time_cost=7.412042856216431
+
Steps: 0%| | 1606/1000000 [4:01:55<2743:16:13, 9.89s/it, lr=1e-5, step_loss=0.0609]
Steps: 0%| | 1607/1000000 [4:01:59<2292:48:25, 8.27s/it, lr=1e-5, step_loss=0.0609][RANK-0]: Step: [1607], local_loss=0.020344628021121025, train_loss=0.03001629002392292, time_cost=1.5455472469329834
+
Steps: 0%| | 1607/1000000 [4:01:59<2292:48:25, 8.27s/it, lr=1e-5, step_loss=0.0203]
Steps: 0%| | 1608/1000000 [4:02:12<2639:16:22, 9.52s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [1608], local_loss=0.06683649867773056, train_loss=0.059455059468746185, time_cost=4.30759334564209
+
Steps: 0%| | 1608/1000000 [4:02:12<2639:16:22, 9.52s/it, lr=1e-5, step_loss=0.0668]
Steps: 0%| | 1609/1000000 [4:02:22<2713:59:47, 9.79s/it, lr=1e-5, step_loss=0.0668][RANK-0]: Step: [1609], local_loss=0.045875221490859985, train_loss=0.0495719239115715, time_cost=1.2952759265899658
+
Steps: 0%| | 1609/1000000 [4:02:22<2713:59:47, 9.79s/it, lr=1e-5, step_loss=0.0459]
Steps: 0%| | 1610/1000000 [4:02:27<2317:17:55, 8.36s/it, lr=1e-5, step_loss=0.0459][RANK-0]: Step: [1610], local_loss=0.02551286295056343, train_loss=0.05070221424102783, time_cost=1.2716596126556396
+
Steps: 0%| | 1610/1000000 [4:02:27<2317:17:55, 8.36s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 1611/1000000 [4:02:42<2902:57:14, 10.47s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [1611], local_loss=0.0357668399810791, train_loss=0.04354311525821686, time_cost=1.3037474155426025
+
Steps: 0%| | 1611/1000000 [4:02:42<2902:57:14, 10.47s/it, lr=1e-5, step_loss=0.0358]
Steps: 0%| | 1612/1000000 [4:02:56<3130:30:11, 11.29s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [1612], local_loss=0.11719942837953568, train_loss=0.045476559549570084, time_cost=10.945226907730103
+
Steps: 0%| | 1612/1000000 [4:02:56<3130:30:11, 11.29s/it, lr=1e-5, step_loss=0.117]
Steps: 0%| | 1613/1000000 [4:03:06<3095:47:08, 11.16s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [1613], local_loss=0.06615888327360153, train_loss=0.04049616679549217, time_cost=1.989598035812378
+
Steps: 0%| | 1613/1000000 [4:03:06<3095:47:08, 11.16s/it, lr=1e-5, step_loss=0.0662]
Steps: 0%| | 1614/1000000 [4:03:13<2696:27:46, 9.72s/it, lr=1e-5, step_loss=0.0662][RANK-0]: Step: [1614], local_loss=0.0321989580988884, train_loss=0.03821234777569771, time_cost=4.546740293502808
+
Steps: 0%| | 1614/1000000 [4:03:13<2696:27:46, 9.72s/it, lr=1e-5, step_loss=0.0322]
Steps: 0%| | 1615/1000000 [4:03:19<2419:15:10, 8.72s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [1615], local_loss=0.03745833411812782, train_loss=0.0769488513469696, time_cost=1.739682912826538
+
Steps: 0%| | 1615/1000000 [4:03:19<2419:15:10, 8.72s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 1616/1000000 [4:03:34<2931:29:03, 10.57s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [1616], local_loss=0.021958325058221817, train_loss=0.06342929601669312, time_cost=1.2785253524780273
+
Steps: 0%| | 1616/1000000 [4:03:34<2931:29:03, 10.57s/it, lr=1e-5, step_loss=0.022]
Steps: 0%| | 1617/1000000 [4:03:47<3126:12:12, 11.27s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [1617], local_loss=0.04595348238945007, train_loss=0.04114267975091934, time_cost=4.148663282394409
+
Steps: 0%| | 1617/1000000 [4:03:47<3126:12:12, 11.27s/it, lr=1e-5, step_loss=0.046]
Steps: 0%| | 1618/1000000 [4:03:54<2736:31:25, 9.87s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [1618], local_loss=0.020411690697073936, train_loss=0.04805563762784004, time_cost=2.9893057346343994
+
Steps: 0%| | 1618/1000000 [4:03:54<2736:31:25, 9.87s/it, lr=1e-5, step_loss=0.0204]
Steps: 0%| | 1619/1000000 [4:04:07<3001:05:08, 10.82s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [1619], local_loss=0.028546327725052834, train_loss=0.05612483620643616, time_cost=3.283966064453125
+
Steps: 0%| | 1619/1000000 [4:04:07<3001:05:08, 10.82s/it, lr=1e-5, step_loss=0.0285]
Steps: 0%| | 1620/1000000 [4:04:11<2484:19:10, 8.96s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [1620], local_loss=0.06529688835144043, train_loss=0.050165485590696335, time_cost=2.281787157058716
+
Steps: 0%| | 1620/1000000 [4:04:11<2484:19:10, 8.96s/it, lr=1e-5, step_loss=0.0653]
Steps: 0%| | 1621/1000000 [4:04:23<2701:26:03, 9.74s/it, lr=1e-5, step_loss=0.0653][RANK-0]: Step: [1621], local_loss=0.045765649527311325, train_loss=0.0800831988453865, time_cost=2.860703229904175
+
Steps: 0%| | 1621/1000000 [4:04:23<2701:26:03, 9.74s/it, lr=1e-5, step_loss=0.0458]
Steps: 0%| | 1622/1000000 [4:04:32<2642:11:08, 9.53s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [1622], local_loss=0.029945334419608116, train_loss=0.08520165830850601, time_cost=2.6251466274261475
+
Steps: 0%| | 1622/1000000 [4:04:32<2642:11:08, 9.53s/it, lr=1e-5, step_loss=0.0299]
Steps: 0%| | 1623/1000000 [4:04:43<2760:51:05, 9.96s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [1623], local_loss=0.020590439438819885, train_loss=0.15134288370609283, time_cost=4.570483207702637
+
Steps: 0%| | 1623/1000000 [4:04:43<2760:51:05, 9.96s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 1624/1000000 [4:04:50<2519:43:48, 9.09s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [1624], local_loss=0.060366079211235046, train_loss=0.04619549587368965, time_cost=1.2881169319152832
+
Steps: 0%| | 1624/1000000 [4:04:50<2519:43:48, 9.09s/it, lr=1e-5, step_loss=0.0604]
Steps: 0%| | 1625/1000000 [4:04:55<2181:01:51, 7.86s/it, lr=1e-5, step_loss=0.0604][RANK-0]: Step: [1625], local_loss=0.14048895239830017, train_loss=0.05075672268867493, time_cost=2.9782183170318604
+
Steps: 0%| | 1625/1000000 [4:04:55<2181:01:51, 7.86s/it, lr=1e-5, step_loss=0.14]
Steps: 0%| | 1626/1000000 [4:05:06<2463:29:45, 8.88s/it, lr=1e-5, step_loss=0.14][RANK-0]: Step: [1626], local_loss=0.073857881128788, train_loss=0.04651399701833725, time_cost=1.8868060111999512
+
Steps: 0%| | 1626/1000000 [4:05:06<2463:29:45, 8.88s/it, lr=1e-5, step_loss=0.0739]
Steps: 0%| | 1627/1000000 [4:05:19<2832:04:05, 10.21s/it, lr=1e-5, step_loss=0.0739][RANK-0]: Step: [1627], local_loss=0.016963083297014236, train_loss=0.04032765328884125, time_cost=1.270362377166748
+
Steps: 0%| | 1627/1000000 [4:05:19<2832:04:05, 10.21s/it, lr=1e-5, step_loss=0.017]
[training-log residue, condensed: steps 1628–1852 of 1,000,000 at lr=1e-5; step_loss mostly in the 0.01–0.10 range with occasional spikes (step 1659: local_loss≈133.85; train_loss spikes ≈24.8 at step 1756, ≈15.7 at step 1785, ≈11.9 at step 1827, ≈7.1 at step 1816); throughput ~7–12 s/it, elapsed 4:05:28 → 4:40:22; duplicate tqdm progress-bar redraws and stray "+" diff lines elided]
+
Steps: 0%| | 1852/1000000 [4:40:22<2799:45:59, 10.10s/it, lr=1e-5, step_loss=0.039]
Steps: 0%| | 1853/1000000 [4:40:27<2320:11:55, 8.37s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [1853], local_loss=0.046771980822086334, train_loss=0.050758976489305496, time_cost=1.2502241134643555
+
Steps: 0%| | 1853/1000000 [4:40:27<2320:11:55, 8.37s/it, lr=1e-5, step_loss=0.0468]
Steps: 0%| | 1854/1000000 [4:40:31<1992:46:48, 7.19s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [1854], local_loss=0.01698077842593193, train_loss=0.04175765812397003, time_cost=1.5005214214324951
+
Steps: 0%| | 1854/1000000 [4:40:31<1992:46:48, 7.19s/it, lr=1e-5, step_loss=0.017]
Steps: 0%| | 1855/1000000 [4:40:38<1921:18:41, 6.93s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [1855], local_loss=0.041843120008707047, train_loss=0.05087912082672119, time_cost=2.6391162872314453
+
Steps: 0%| | 1855/1000000 [4:40:38<1921:18:41, 6.93s/it, lr=1e-5, step_loss=0.0418]
Steps: 0%| | 1856/1000000 [4:40:48<2212:05:33, 7.98s/it, lr=1e-5, step_loss=0.0418][RANK-0]: Step: [1856], local_loss=0.08294441550970078, train_loss=0.06239979714155197, time_cost=1.8279829025268555
+
Steps: 0%| | 1856/1000000 [4:40:48<2212:05:33, 7.98s/it, lr=1e-5, step_loss=0.0829]
Steps: 0%| | 1857/1000000 [4:40:53<1968:29:06, 7.10s/it, lr=1e-5, step_loss=0.0829][RANK-0]: Step: [1857], local_loss=0.06734685599803925, train_loss=0.05390486121177673, time_cost=2.045822858810425
+
Steps: 0%| | 1857/1000000 [4:40:53<1968:29:06, 7.10s/it, lr=1e-5, step_loss=0.0673]
Steps: 0%| | 1858/1000000 [4:41:02<2120:31:25, 7.65s/it, lr=1e-5, step_loss=0.0673][RANK-0]: Step: [1858], local_loss=0.1712508499622345, train_loss=0.05458509922027588, time_cost=2.8899857997894287
+
Steps: 0%| | 1858/1000000 [4:41:02<2120:31:25, 7.65s/it, lr=1e-5, step_loss=0.171]
Steps: 0%| | 1859/1000000 [4:41:13<2399:00:34, 8.65s/it, lr=1e-5, step_loss=0.171][RANK-0]: Step: [1859], local_loss=0.024772807955741882, train_loss=0.05560789257287979, time_cost=1.3578734397888184
+
Steps: 0%| | 1859/1000000 [4:41:13<2399:00:34, 8.65s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 1860/1000000 [4:41:17<2035:07:17, 7.34s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [1860], local_loss=0.02122664451599121, train_loss=0.10919728130102158, time_cost=1.5947976112365723
+
Steps: 0%| | 1860/1000000 [4:41:17<2035:07:17, 7.34s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 1861/1000000 [4:41:25<2079:07:28, 7.50s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [1861], local_loss=0.030504411086440086, train_loss=0.0546417310833931, time_cost=2.4081060886383057
+
Steps: 0%| | 1861/1000000 [4:41:25<2079:07:28, 7.50s/it, lr=1e-5, step_loss=0.0305]
Steps: 0%| | 1862/1000000 [4:41:34<2181:32:47, 7.87s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [1862], local_loss=0.058100856840610504, train_loss=0.045088768005371094, time_cost=1.7995314598083496
+
Steps: 0%| | 1862/1000000 [4:41:34<2181:32:47, 7.87s/it, lr=1e-5, step_loss=0.0581]
Steps: 0%| | 1863/1000000 [4:41:39<1988:14:48, 7.17s/it, lr=1e-5, step_loss=0.0581][RANK-0]: Step: [1863], local_loss=0.013776625506579876, train_loss=0.06874072551727295, time_cost=2.8136825561523438
+
Steps: 0%| | 1863/1000000 [4:41:39<1988:14:48, 7.17s/it, lr=1e-5, step_loss=0.0138]
Steps: 0%| | 1864/1000000 [4:41:46<1970:14:59, 7.11s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [1864], local_loss=0.01611119881272316, train_loss=0.04335939139127731, time_cost=3.1077487468719482
+
Steps: 0%| | 1864/1000000 [4:41:46<1970:14:59, 7.11s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 1865/1000000 [4:41:58<2330:10:34, 8.40s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [1865], local_loss=0.032383378595113754, train_loss=15.601344108581543, time_cost=3.218799352645874
+
Steps: 0%| | 1865/1000000 [4:41:58<2330:10:34, 8.40s/it, lr=1e-5, step_loss=0.0324]
Steps: 0%| | 1866/1000000 [4:42:04<2187:34:35, 7.89s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [1866], local_loss=0.01795951835811138, train_loss=0.03414186090230942, time_cost=1.9732036590576172
+
Steps: 0%| | 1866/1000000 [4:42:04<2187:34:35, 7.89s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 1867/1000000 [4:42:10<1953:58:59, 7.05s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [1867], local_loss=0.04911471903324127, train_loss=0.05606094375252724, time_cost=2.1023688316345215
+
Steps: 0%| | 1867/1000000 [4:42:10<1953:58:59, 7.05s/it, lr=1e-5, step_loss=0.0491]
Steps: 0%| | 1868/1000000 [4:42:21<2279:49:14, 8.22s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [1868], local_loss=0.049844708293676376, train_loss=0.05572120100259781, time_cost=2.4032368659973145
+
Steps: 0%| | 1868/1000000 [4:42:21<2279:49:14, 8.22s/it, lr=1e-5, step_loss=0.0498]
Steps: 0%| | 1869/1000000 [4:42:35<2833:56:47, 10.22s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [1869], local_loss=0.02255006693303585, train_loss=0.02703443542122841, time_cost=6.5195348262786865
+
Steps: 0%| | 1869/1000000 [4:42:35<2833:56:47, 10.22s/it, lr=1e-5, step_loss=0.0226]
Steps: 0%| | 1870/1000000 [4:42:41<2410:56:15, 8.70s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [1870], local_loss=0.027202285826206207, train_loss=0.03649136424064636, time_cost=2.3286068439483643
+
Steps: 0%| | 1870/1000000 [4:42:41<2410:56:15, 8.70s/it, lr=1e-5, step_loss=0.0272]
Steps: 0%| | 1871/1000000 [4:42:51<2560:22:11, 9.23s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [1871], local_loss=0.020789798349142075, train_loss=0.03310977295041084, time_cost=1.647282361984253
+
Steps: 0%| | 1871/1000000 [4:42:51<2560:22:11, 9.23s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 1872/1000000 [4:42:56<2169:32:00, 7.82s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [1872], local_loss=0.03386680409312248, train_loss=0.03820125758647919, time_cost=2.08355975151062
+
Steps: 0%| | 1872/1000000 [4:42:56<2169:32:00, 7.82s/it, lr=1e-5, step_loss=0.0339]
Steps: 0%| | 1873/1000000 [4:43:05<2264:38:02, 8.17s/it, lr=1e-5, step_loss=0.0339][RANK-0]: Step: [1873], local_loss=0.06577908992767334, train_loss=0.05829629674553871, time_cost=2.908149480819702
+
Steps: 0%| | 1873/1000000 [4:43:05<2264:38:02, 8.17s/it, lr=1e-5, step_loss=0.0658]
Steps: 0%| | 1874/1000000 [4:43:19<2775:08:17, 10.01s/it, lr=1e-5, step_loss=0.0658][RANK-0]: Step: [1874], local_loss=0.07945748418569565, train_loss=0.0895409807562828, time_cost=3.15433406829834
+
Steps: 0%| | 1874/1000000 [4:43:19<2775:08:17, 10.01s/it, lr=1e-5, step_loss=0.0795]
Steps: 0%| | 1875/1000000 [4:43:32<2997:27:55, 10.81s/it, lr=1e-5, step_loss=0.0795][RANK-0]: Step: [1875], local_loss=0.011716223321855068, train_loss=0.23747074604034424, time_cost=1.2208890914916992
+
Steps: 0%| | 1875/1000000 [4:43:32<2997:27:55, 10.81s/it, lr=1e-5, step_loss=0.0117]
Steps: 0%| | 1876/1000000 [4:43:39<2723:40:37, 9.82s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [1876], local_loss=0.08888880163431168, train_loss=0.04441926255822182, time_cost=2.0410315990448
+
Steps: 0%| | 1876/1000000 [4:43:39<2723:40:37, 9.82s/it, lr=1e-5, step_loss=0.0889]
Steps: 0%| | 1877/1000000 [4:43:44<2322:13:19, 8.38s/it, lr=1e-5, step_loss=0.0889][RANK-0]: Step: [1877], local_loss=0.0207584910094738, train_loss=0.05217989534139633, time_cost=2.1328165531158447
+
Steps: 0%| | 1877/1000000 [4:43:44<2322:13:19, 8.38s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 1878/1000000 [4:43:57<2699:28:01, 9.74s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [1878], local_loss=0.023481369018554688, train_loss=0.0392463281750679, time_cost=4.6163694858551025
+
Steps: 0%| | 1878/1000000 [4:43:57<2699:28:01, 9.74s/it, lr=1e-5, step_loss=0.0235]
Steps: 0%| | 1879/1000000 [4:44:11<3033:40:36, 10.94s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [1879], local_loss=0.015261021442711353, train_loss=0.14910900592803955, time_cost=5.0743536949157715
+
Steps: 0%| | 1879/1000000 [4:44:11<3033:40:36, 10.94s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 1880/1000000 [4:44:24<3219:31:58, 11.61s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [1880], local_loss=0.022724784910678864, train_loss=0.03189876675605774, time_cost=1.9142603874206543
+
Steps: 0%| | 1880/1000000 [4:44:24<3219:31:58, 11.61s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 1881/1000000 [4:44:30<2760:50:40, 9.96s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [1881], local_loss=0.022371981292963028, train_loss=0.03812815994024277, time_cost=1.9669957160949707
+
Steps: 0%| | 1881/1000000 [4:44:30<2760:50:40, 9.96s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 1882/1000000 [4:44:44<3085:58:31, 11.13s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [1882], local_loss=0.03534955531358719, train_loss=0.2538034915924072, time_cost=4.89413595199585
+
Steps: 0%| | 1882/1000000 [4:44:44<3085:58:31, 11.13s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 1883/1000000 [4:44:54<2970:43:24, 10.71s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [1883], local_loss=0.06546377390623093, train_loss=0.04858655855059624, time_cost=3.3417325019836426
+
Steps: 0%| | 1883/1000000 [4:44:54<2970:43:24, 10.71s/it, lr=1e-5, step_loss=0.0655]
Steps: 0%| | 1884/1000000 [4:44:58<2454:01:08, 8.85s/it, lr=1e-5, step_loss=0.0655][RANK-0]: Step: [1884], local_loss=0.014018230140209198, train_loss=0.04373226687312126, time_cost=1.2625536918640137
+
Steps: 0%| | 1884/1000000 [4:44:58<2454:01:08, 8.85s/it, lr=1e-5, step_loss=0.014]
Steps: 0%| | 1885/1000000 [4:45:03<2091:04:25, 7.54s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [1885], local_loss=0.39367321133613586, train_loss=0.07599541544914246, time_cost=1.4452729225158691
+
Steps: 0%| | 1885/1000000 [4:45:03<2091:04:25, 7.54s/it, lr=1e-5, step_loss=0.394]
Steps: 0%| | 1886/1000000 [4:45:07<1797:12:04, 6.48s/it, lr=1e-5, step_loss=0.394][RANK-0]: Step: [1886], local_loss=0.026701554656028748, train_loss=0.15040084719657898, time_cost=1.28529953956604
+
Steps: 0%| | 1886/1000000 [4:45:07<1797:12:04, 6.48s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 1887/1000000 [4:45:12<1730:09:39, 6.24s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [1887], local_loss=0.020527781918644905, train_loss=0.07508520036935806, time_cost=2.4254322052001953
+
Steps: 0%| | 1887/1000000 [4:45:12<1730:09:39, 6.24s/it, lr=1e-5, step_loss=0.0205]
Steps: 0%| | 1888/1000000 [4:45:24<2155:34:40, 7.77s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [1888], local_loss=0.03862549364566803, train_loss=0.03426095098257065, time_cost=2.568657398223877
+
Steps: 0%| | 1888/1000000 [4:45:24<2155:34:40, 7.77s/it, lr=1e-5, step_loss=0.0386]
Steps: 0%| | 1889/1000000 [4:45:37<2616:11:28, 9.44s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [1889], local_loss=0.19669483602046967, train_loss=48.03112030029297, time_cost=3.2502975463867188
+
Steps: 0%| | 1889/1000000 [4:45:37<2616:11:28, 9.44s/it, lr=1e-5, step_loss=0.197]
Steps: 0%| | 1890/1000000 [4:45:45<2475:53:08, 8.93s/it, lr=1e-5, step_loss=0.197][RANK-0]: Step: [1890], local_loss=0.034090183675289154, train_loss=0.05806460231542587, time_cost=2.151319742202759
+
Steps: 0%| | 1890/1000000 [4:45:45<2475:53:08, 8.93s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 1891/1000000 [4:45:50<2151:29:18, 7.76s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [1891], local_loss=0.04192022234201431, train_loss=0.0868186354637146, time_cost=2.283048391342163
+
Steps: 0%| | 1891/1000000 [4:45:50<2151:29:18, 7.76s/it, lr=1e-5, step_loss=0.0419]
Steps: 0%| | 1892/1000000 [4:45:55<1977:43:04, 7.13s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [1892], local_loss=0.02843387983739376, train_loss=0.04339074343442917, time_cost=2.6442553997039795
+
Steps: 0%| | 1892/1000000 [4:45:55<1977:43:04, 7.13s/it, lr=1e-5, step_loss=0.0284]
Steps: 0%| | 1893/1000000 [4:46:10<2606:54:55, 9.40s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [1893], local_loss=0.025123799219727516, train_loss=0.053950414061546326, time_cost=5.9155333042144775
+
Steps: 0%| | 1893/1000000 [4:46:10<2606:54:55, 9.40s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 1894/1000000 [4:46:22<2818:32:38, 10.17s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [1894], local_loss=0.018932675942778587, train_loss=0.02444894053041935, time_cost=2.8146941661834717
+
Steps: 0%| | 1894/1000000 [4:46:22<2818:32:38, 10.17s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 1895/1000000 [4:46:33<2864:04:22, 10.33s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [1895], local_loss=0.015785982832312584, train_loss=0.06436958909034729, time_cost=1.4146792888641357
+
Steps: 0%| | 1895/1000000 [4:46:33<2864:04:22, 10.33s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 1896/1000000 [4:46:44<2924:49:13, 10.55s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [1896], local_loss=0.9928371906280518, train_loss=0.1644030064344406, time_cost=1.2436156272888184
+
Steps: 0%| | 1896/1000000 [4:46:44<2924:49:13, 10.55s/it, lr=1e-5, step_loss=0.993]
Steps: 0%| | 1897/1000000 [4:46:50<2535:31:07, 9.15s/it, lr=1e-5, step_loss=0.993][RANK-0]: Step: [1897], local_loss=0.0372362844645977, train_loss=0.03755588084459305, time_cost=1.1864774227142334
+
Steps: 0%| | 1897/1000000 [4:46:50<2535:31:07, 9.15s/it, lr=1e-5, step_loss=0.0372]
Steps: 0%| | 1898/1000000 [4:46:55<2229:35:55, 8.04s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [1898], local_loss=0.027026651427149773, train_loss=0.04241202771663666, time_cost=1.224290132522583
+
Steps: 0%| | 1898/1000000 [4:46:55<2229:35:55, 8.04s/it, lr=1e-5, step_loss=0.027]
Steps: 0%| | 1899/1000000 [4:46:59<1907:31:33, 6.88s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [1899], local_loss=0.026014460250735283, train_loss=0.06081506609916687, time_cost=1.407060146331787
+
Steps: 0%| | 1899/1000000 [4:46:59<1907:31:33, 6.88s/it, lr=1e-5, step_loss=0.026]
Steps: 0%| | 1900/1000000 [4:47:15<2613:39:58, 9.43s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [1900], local_loss=0.01926354505121708, train_loss=0.05663159489631653, time_cost=8.512940883636475
+
Steps: 0%| | 1900/1000000 [4:47:15<2613:39:58, 9.43s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 1901/1000000 [4:47:22<2427:37:46, 8.76s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [1901], local_loss=0.02245909906923771, train_loss=0.04010787233710289, time_cost=3.198103189468384
+
Steps: 0%| | 1901/1000000 [4:47:22<2427:37:46, 8.76s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 1902/1000000 [4:47:33<2615:36:15, 9.43s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [1902], local_loss=0.025535643100738525, train_loss=0.06919550895690918, time_cost=2.4410572052001953
+
Steps: 0%| | 1902/1000000 [4:47:33<2615:36:15, 9.43s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 1903/1000000 [4:47:47<2985:48:52, 10.77s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [1903], local_loss=0.014342740178108215, train_loss=0.02989846281707287, time_cost=4.566165208816528
+
Steps: 0%| | 1903/1000000 [4:47:47<2985:48:52, 10.77s/it, lr=1e-5, step_loss=0.0143]
Steps: 0%| | 1904/1000000 [4:47:52<2556:54:28, 9.22s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [1904], local_loss=0.02382108010351658, train_loss=0.049982473254203796, time_cost=1.8164288997650146
+
Steps: 0%| | 1904/1000000 [4:47:52<2556:54:28, 9.22s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 1905/1000000 [4:48:01<2489:54:44, 8.98s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [1905], local_loss=0.02642870508134365, train_loss=0.036070823669433594, time_cost=2.6070010662078857
+
Steps: 0%| | 1905/1000000 [4:48:01<2489:54:44, 8.98s/it, lr=1e-5, step_loss=0.0264]
Steps: 0%| | 1906/1000000 [4:48:09<2394:41:12, 8.64s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [1906], local_loss=0.016888441517949104, train_loss=0.026272037997841835, time_cost=3.5627706050872803
+
Steps: 0%| | 1906/1000000 [4:48:09<2394:41:12, 8.64s/it, lr=1e-5, step_loss=0.0169]
Steps: 0%| | 1907/1000000 [4:48:19<2560:31:05, 9.24s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [1907], local_loss=0.04224411025643349, train_loss=0.04815388843417168, time_cost=1.3411126136779785
+
Steps: 0%| | 1907/1000000 [4:48:19<2560:31:05, 9.24s/it, lr=1e-5, step_loss=0.0422]
Steps: 0%| | 1908/1000000 [4:48:33<2941:22:43, 10.61s/it, lr=1e-5, step_loss=0.0422][RANK-0]: Step: [1908], local_loss=0.022532761096954346, train_loss=0.0413774736225605, time_cost=1.2357330322265625
+
Steps: 0%| | 1908/1000000 [4:48:33<2941:22:43, 10.61s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 1909/1000000 [4:48:40<2633:25:15, 9.50s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [1909], local_loss=0.9941415786743164, train_loss=0.17713886499404907, time_cost=3.249235153198242
+
Steps: 0%| | 1909/1000000 [4:48:40<2633:25:15, 9.50s/it, lr=1e-5, step_loss=0.994]
Steps: 0%| | 1910/1000000 [4:48:49<2580:21:45, 9.31s/it, lr=1e-5, step_loss=0.994][RANK-0]: Step: [1910], local_loss=0.024322647601366043, train_loss=0.09677599370479584, time_cost=2.704057455062866
+
Steps: 0%| | 1910/1000000 [4:48:49<2580:21:45, 9.31s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 1911/1000000 [4:48:59<2648:25:25, 9.55s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [1911], local_loss=0.04015869274735451, train_loss=0.09638708829879761, time_cost=4.786952495574951
+
Steps: 0%| | 1911/1000000 [4:48:59<2648:25:25, 9.55s/it, lr=1e-5, step_loss=0.0402]
Steps: 0%| | 1912/1000000 [4:49:04<2290:01:37, 8.26s/it, lr=1e-5, step_loss=0.0402][RANK-0]: Step: [1912], local_loss=0.06343396753072739, train_loss=0.04991370812058449, time_cost=2.0311429500579834
+
Steps: 0%| | 1912/1000000 [4:49:04<2290:01:37, 8.26s/it, lr=1e-5, step_loss=0.0634]
Steps: 0%| | 1913/1000000 [4:49:19<2791:20:30, 10.07s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [1913], local_loss=0.04308147355914116, train_loss=0.08341032266616821, time_cost=1.5968773365020752
+
Steps: 0%| | 1913/1000000 [4:49:19<2791:20:30, 10.07s/it, lr=1e-5, step_loss=0.0431]
Steps: 0%| | 1914/1000000 [4:49:29<2827:42:13, 10.20s/it, lr=1e-5, step_loss=0.0431][RANK-0]: Step: [1914], local_loss=0.08878502994775772, train_loss=0.05014181137084961, time_cost=4.687033414840698
+
Steps: 0%| | 1914/1000000 [4:49:29<2827:42:13, 10.20s/it, lr=1e-5, step_loss=0.0888]
Steps: 0%| | 1915/1000000 [4:49:39<2808:31:48, 10.13s/it, lr=1e-5, step_loss=0.0888][RANK-0]: Step: [1915], local_loss=0.03395187854766846, train_loss=0.05381970480084419, time_cost=4.086189031600952
+
Steps: 0%| | 1915/1000000 [4:49:39<2808:31:48, 10.13s/it, lr=1e-5, step_loss=0.034]
Steps: 0%| | 1916/1000000 [4:49:54<3175:17:16, 11.45s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [1916], local_loss=0.050076913088560104, train_loss=0.05985706299543381, time_cost=6.0300328731536865
+
Steps: 0%| | 1916/1000000 [4:49:54<3175:17:16, 11.45s/it, lr=1e-5, step_loss=0.0501]
Steps: 0%| | 1917/1000000 [4:49:59<2640:40:53, 9.52s/it, lr=1e-5, step_loss=0.0501][RANK-0]: Step: [1917], local_loss=0.02654368430376053, train_loss=0.07128994911909103, time_cost=1.442690134048462
+
Steps: 0%| | 1917/1000000 [4:49:59<2640:40:53, 9.52s/it, lr=1e-5, step_loss=0.0265]
Steps: 0%| | 1918/1000000 [4:50:08<2653:34:59, 9.57s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [1918], local_loss=0.07518766820430756, train_loss=0.10203185677528381, time_cost=1.246145486831665
+
Steps: 0%| | 1918/1000000 [4:50:08<2653:34:59, 9.57s/it, lr=1e-5, step_loss=0.0752]
Steps: 0%| | 1919/1000000 [4:50:13<2266:14:57, 8.17s/it, lr=1e-5, step_loss=0.0752][RANK-0]: Step: [1919], local_loss=0.06717183440923691, train_loss=0.0611969493329525, time_cost=1.8724853992462158
+
Steps: 0%| | 1919/1000000 [4:50:13<2266:14:57, 8.17s/it, lr=1e-5, step_loss=0.0672]
Steps: 0%| | 1920/1000000 [4:50:20<2172:30:00, 7.84s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [1920], local_loss=0.30964910984039307, train_loss=0.09816280007362366, time_cost=3.261754274368286
+
Steps: 0%| | 1920/1000000 [4:50:20<2172:30:00, 7.84s/it, lr=1e-5, step_loss=0.31]
Steps: 0%| | 1921/1000000 [4:50:33<2617:18:37, 9.44s/it, lr=1e-5, step_loss=0.31][RANK-0]: Step: [1921], local_loss=0.040410637855529785, train_loss=0.04423845559358597, time_cost=3.908090591430664
+
Steps: 0%| | 1921/1000000 [4:50:33<2617:18:37, 9.44s/it, lr=1e-5, step_loss=0.0404]
Steps: 0%| | 1922/1000000 [4:50:38<2252:57:19, 8.13s/it, lr=1e-5, step_loss=0.0404][RANK-0]: Step: [1922], local_loss=0.022893026471138, train_loss=0.03969695419073105, time_cost=2.1931331157684326
+
Steps: 0%| | 1922/1000000 [4:50:38<2252:57:19, 8.13s/it, lr=1e-5, step_loss=0.0229]
Steps: 0%| | 1923/1000000 [4:50:46<2205:11:00, 7.95s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [1923], local_loss=0.04482129588723183, train_loss=0.08496061712503433, time_cost=2.4476401805877686
+
Steps: 0%| | 1923/1000000 [4:50:46<2205:11:00, 7.95s/it, lr=1e-5, step_loss=0.0448]
Steps: 0%| | 1924/1000000 [4:50:57<2471:27:50, 8.91s/it, lr=1e-5, step_loss=0.0448][RANK-0]: Step: [1924], local_loss=0.04201601818203926, train_loss=0.07487515360116959, time_cost=1.3330638408660889
+
Steps: 0%| | 1924/1000000 [4:50:57<2471:27:50, 8.91s/it, lr=1e-5, step_loss=0.042]
Steps: 0%| | 1925/1000000 [4:51:08<2647:03:50, 9.55s/it, lr=1e-5, step_loss=0.042][RANK-0]: Step: [1925], local_loss=0.05615930259227753, train_loss=0.04557017982006073, time_cost=1.7029879093170166
+
Steps: 0%| | 1925/1000000 [4:51:08<2647:03:50, 9.55s/it, lr=1e-5, step_loss=0.0562]
Steps: 0%| | 1926/1000000 [4:51:12<2213:12:39, 7.98s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [1926], local_loss=0.13976390659809113, train_loss=0.061184708029031754, time_cost=1.3361546993255615
+
Steps: 0%| | 1926/1000000 [4:51:12<2213:12:39, 7.98s/it, lr=1e-5, step_loss=0.14]
Steps: 0%| | 1927/1000000 [4:51:28<2825:50:29, 10.19s/it, lr=1e-5, step_loss=0.14][RANK-0]: Step: [1927], local_loss=0.14172162115573883, train_loss=0.08002035319805145, time_cost=4.108681917190552
+
Steps: 0%| | 1927/1000000 [4:51:28<2825:50:29, 10.19s/it, lr=1e-5, step_loss=0.142]
Steps: 0%| | 1928/1000000 [4:51:35<2607:50:17, 9.41s/it, lr=1e-5, step_loss=0.142][RANK-0]: Step: [1928], local_loss=0.02774813398718834, train_loss=0.08569062501192093, time_cost=1.3615443706512451
+
Steps: 0%| | 1928/1000000 [4:51:35<2607:50:17, 9.41s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 1929/1000000 [4:51:43<2429:16:19, 8.76s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [1929], local_loss=0.07811231911182404, train_loss=0.05005375295877457, time_cost=1.7168254852294922
+
Steps: 0%| | 1929/1000000 [4:51:43<2429:16:19, 8.76s/it, lr=1e-5, step_loss=0.0781]
Steps: 0%| | 1930/1000000 [4:51:52<2511:00:49, 9.06s/it, lr=1e-5, step_loss=0.0781][RANK-0]: Step: [1930], local_loss=0.02521122433245182, train_loss=4.010493755340576, time_cost=1.2389473915100098
+
Steps: 0%| | 1930/1000000 [4:51:52<2511:00:49, 9.06s/it, lr=1e-5, step_loss=0.0252]
Steps: 0%| | 1931/1000000 [4:52:00<2356:29:05, 8.50s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [1931], local_loss=0.01877295970916748, train_loss=0.07155530154705048, time_cost=5.67462682723999
+
Steps: 0%| | 1931/1000000 [4:52:00<2356:29:05, 8.50s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 1932/1000000 [4:52:05<2128:09:31, 7.68s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [1932], local_loss=0.04103123024106026, train_loss=0.0804714560508728, time_cost=1.7395172119140625
+
Steps: 0%| | 1932/1000000 [4:52:05<2128:09:31, 7.68s/it, lr=1e-5, step_loss=0.041]
Steps: 0%| | 1933/1000000 [4:52:10<1904:25:15, 6.87s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [1933], local_loss=0.0505845844745636, train_loss=0.04048841446638107, time_cost=1.874356985092163
+
Steps: 0%| | 1933/1000000 [4:52:10<1904:25:15, 6.87s/it, lr=1e-5, step_loss=0.0506]
Steps: 0%| | 1934/1000000 [4:52:18<1990:15:38, 7.18s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [1934], local_loss=0.025835638865828514, train_loss=0.07801264524459839, time_cost=2.939074754714966
+
Steps: 0%| | 1934/1000000 [4:52:18<1990:15:38, 7.18s/it, lr=1e-5, step_loss=0.0258]
Steps: 0%| | 1935/1000000 [4:52:23<1787:39:42, 6.45s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [1935], local_loss=0.1523483395576477, train_loss=0.07567164301872253, time_cost=1.7382559776306152
+
Steps: 0%| | 1935/1000000 [4:52:23<1787:39:42, 6.45s/it, lr=1e-5, step_loss=0.152]
Steps: 0%| | 1936/1000000 [4:52:28<1679:53:11, 6.06s/it, lr=1e-5, step_loss=0.152][RANK-0]: Step: [1936], local_loss=0.0272495299577713, train_loss=0.042457662522792816, time_cost=2.6107194423675537
+
Steps: 0%| | 1936/1000000 [4:52:28<1679:53:11, 6.06s/it, lr=1e-5, step_loss=0.0272]
Steps: 0%| | 1937/1000000 [4:52:36<1839:26:59, 6.63s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [1937], local_loss=0.044524963945150375, train_loss=0.08711977303028107, time_cost=7.050095558166504
+
Steps: 0%| | 1937/1000000 [4:52:36<1839:26:59, 6.63s/it, lr=1e-5, step_loss=0.0445]
Steps: 0%| | 1938/1000000 [4:52:47<2202:38:28, 7.94s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [1938], local_loss=0.07580048590898514, train_loss=0.16084139049053192, time_cost=1.981943130493164
+
Steps: 0%| | 1938/1000000 [4:52:47<2202:38:28, 7.94s/it, lr=1e-5, step_loss=0.0758]
Steps: 0%| | 1939/1000000 [4:52:55<2191:54:17, 7.91s/it, lr=1e-5, step_loss=0.0758][RANK-0]: Step: [1939], local_loss=0.0211852565407753, train_loss=0.03079236112535, time_cost=3.8168022632598877
+
Steps: 0%| | 1939/1000000 [4:52:55<2191:54:17, 7.91s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 1940/1000000 [4:53:08<2608:47:58, 9.41s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [1940], local_loss=0.043568480759859085, train_loss=0.05479782447218895, time_cost=1.208693027496338
+
Steps: 0%| | 1940/1000000 [4:53:08<2608:47:58, 9.41s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 1941/1000000 [4:53:19<2733:44:00, 9.86s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [1941], local_loss=0.025678541511297226, train_loss=0.02895703911781311, time_cost=3.8167154788970947
+
Steps: 0%| | 1941/1000000 [4:53:19<2733:44:00, 9.86s/it, lr=1e-5, step_loss=0.0257]
Steps: 0%| | 1942/1000000 [4:53:25<2427:14:50, 8.76s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [1942], local_loss=0.030874991789460182, train_loss=0.04108276590704918, time_cost=2.64225435256958
+
Steps: 0%| | 1942/1000000 [4:53:25<2427:14:50, 8.76s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 1943/1000000 [4:53:36<2631:22:49, 9.49s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [1943], local_loss=0.033915095031261444, train_loss=0.07937057316303253, time_cost=3.6527469158172607
+
Steps: 0%| | 1943/1000000 [4:53:36<2631:22:49, 9.49s/it, lr=1e-5, step_loss=0.0339]
Steps: 0%| | 1944/1000000 [4:53:41<2265:57:14, 8.17s/it, lr=1e-5, step_loss=0.0339][RANK-0]: Step: [1944], local_loss=0.020751945674419403, train_loss=0.05138188600540161, time_cost=2.1796441078186035
+
Steps: 0%| | 1944/1000000 [4:53:41<2265:57:14, 8.17s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 1945/1000000 [4:53:46<2002:48:02, 7.22s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [1945], local_loss=0.05451272055506706, train_loss=0.06315819919109344, time_cost=2.5245697498321533
+
Steps: 0%| | 1945/1000000 [4:53:46<2002:48:02, 7.22s/it, lr=1e-5, step_loss=0.0545]
Steps: 0%| | 1946/1000000 [4:53:59<2501:05:25, 9.02s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [1946], local_loss=0.03259355574846268, train_loss=0.0523996576666832, time_cost=1.219238519668579
+
Steps: 0%| | 1946/1000000 [4:53:59<2501:05:25, 9.02s/it, lr=1e-5, step_loss=0.0326]
Steps: 0%| | 1947/1000000 [4:54:10<2647:10:50, 9.55s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [1947], local_loss=0.03397023305296898, train_loss=0.03269872069358826, time_cost=3.910771131515503
+
Steps: 0%| | 1947/1000000 [4:54:10<2647:10:50, 9.55s/it, lr=1e-5, step_loss=0.034]
Steps: 0%| | 1948/1000000 [4:54:24<3028:55:22, 10.93s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [1948], local_loss=0.06615650653839111, train_loss=0.04747025668621063, time_cost=4.437661170959473
+
Steps: 0%| | 1948/1000000 [4:54:24<3028:55:22, 10.93s/it, lr=1e-5, step_loss=0.0662]
Steps: 0%| | 1949/1000000 [4:54:37<3173:06:09, 11.45s/it, lr=1e-5, step_loss=0.0662][RANK-0]: Step: [1949], local_loss=0.06132087484002113, train_loss=0.07296763360500336, time_cost=4.008387565612793
+
Steps: 0%| | 1949/1000000 [4:54:37<3173:06:09, 11.45s/it, lr=1e-5, step_loss=0.0613]
Steps: 0%| | 1950/1000000 [4:54:50<3263:35:32, 11.77s/it, lr=1e-5, step_loss=0.0613][RANK-0]: Step: [1950], local_loss=0.023860573768615723, train_loss=0.032621629536151886, time_cost=2.984463691711426
+
Steps: 0%| | 1950/1000000 [4:54:50<3263:35:32, 11.77s/it, lr=1e-5, step_loss=0.0239]
Steps: 0%| | 1951/1000000 [4:54:56<2823:44:42, 10.19s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [1951], local_loss=0.025283081457018852, train_loss=0.030265551060438156, time_cost=1.4386656284332275
+
Steps: 0%| | 1951/1000000 [4:54:56<2823:44:42, 10.19s/it, lr=1e-5, step_loss=0.0253]
Steps: 0%| | 1952/1000000 [4:55:02<2441:50:27, 8.81s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [1952], local_loss=0.02511497586965561, train_loss=0.05002400651574135, time_cost=4.667725563049316
+
Steps: 0%| | 1952/1000000 [4:55:02<2441:50:27, 8.81s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 1953/1000000 [4:55:09<2295:56:59, 8.28s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [1953], local_loss=0.07333581149578094, train_loss=1.5870857238769531, time_cost=2.6901986598968506
+
Steps: 0%| | 1953/1000000 [4:55:09<2295:56:59, 8.28s/it, lr=1e-5, step_loss=0.0733]
Steps: 0%| | 1954/1000000 [4:55:13<1974:01:23, 7.12s/it, lr=1e-5, step_loss=0.0733][RANK-0]: Step: [1954], local_loss=0.13478632271289825, train_loss=0.04999479651451111, time_cost=1.4773671627044678
+
Steps: 0%| | 1954/1000000 [4:55:13<1974:01:23, 7.12s/it, lr=1e-5, step_loss=0.135]
Steps: 0%| | 1955/1000000 [4:55:24<2277:10:11, 8.21s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [1955], local_loss=0.11857399344444275, train_loss=23.149656295776367, time_cost=2.638662815093994
+
Steps: 0%| | 1955/1000000 [4:55:24<2277:10:11, 8.21s/it, lr=1e-5, step_loss=0.119]
Steps: 0%| | 1956/1000000 [4:55:35<2527:23:58, 9.12s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [1956], local_loss=0.20387007296085358, train_loss=0.04648126661777496, time_cost=7.852171421051025
+
Steps: 0%| | 1956/1000000 [4:55:35<2527:23:58, 9.12s/it, lr=1e-5, step_loss=0.204]
Steps: 0%| | 1957/1000000 [4:55:49<2963:39:25, 10.69s/it, lr=1e-5, step_loss=0.204][RANK-0]: Step: [1957], local_loss=0.060305044054985046, train_loss=0.08656598627567291, time_cost=5.542353868484497
+
Steps: 0%| | 1957/1000000 [4:55:49<2963:39:25, 10.69s/it, lr=1e-5, step_loss=0.0603]
Steps: 0%| | 1958/1000000 [4:55:55<2504:40:28, 9.03s/it, lr=1e-5, step_loss=0.0603][RANK-0]: Step: [1958], local_loss=0.013693802990019321, train_loss=0.03393812105059624, time_cost=1.6932380199432373
+
Steps: 0%| | 1958/1000000 [4:55:55<2504:40:28, 9.03s/it, lr=1e-5, step_loss=0.0137]
Steps: 0%| | 1959/1000000 [4:56:06<2657:57:52, 9.59s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [1959], local_loss=0.019143879413604736, train_loss=0.06403738260269165, time_cost=1.7166252136230469
+
Steps: 0%| | 1959/1000000 [4:56:06<2657:57:52, 9.59s/it, lr=1e-5, step_loss=0.0191]
Steps: 0%| | 1960/1000000 [4:56:12<2400:17:09, 8.66s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [1960], local_loss=0.016175294294953346, train_loss=0.17572051286697388, time_cost=1.2235445976257324
+
Steps: 0%| | 1960/1000000 [4:56:12<2400:17:09, 8.66s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 1961/1000000 [4:56:23<2628:14:36, 9.48s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [1961], local_loss=0.05218033865094185, train_loss=0.05490301549434662, time_cost=1.1995086669921875
+
Steps: 0%| | 1961/1000000 [4:56:23<2628:14:36, 9.48s/it, lr=1e-5, step_loss=0.0522]
Steps: 0%| | 1962/1000000 [4:56:32<2593:13:34, 9.35s/it, lr=1e-5, step_loss=0.0522][RANK-0]: Step: [1962], local_loss=0.12526902556419373, train_loss=0.04069218784570694, time_cost=1.5729129314422607
+
Steps: 0%| | 1962/1000000 [4:56:32<2593:13:34, 9.35s/it, lr=1e-5, step_loss=0.125]
Steps: 0%| | 1963/1000000 [4:56:41<2558:08:45, 9.23s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [1963], local_loss=0.1902226209640503, train_loss=0.049047160893678665, time_cost=2.751516580581665
+
Steps: 0%| | 1963/1000000 [4:56:41<2558:08:45, 9.23s/it, lr=1e-5, step_loss=0.19]
Steps: 0%| | 1964/1000000 [4:56:54<2875:46:43, 10.37s/it, lr=1e-5, step_loss=0.19][RANK-0]: Step: [1964], local_loss=0.2240709513425827, train_loss=0.12213876843452454, time_cost=1.2266359329223633
+
Steps: 0%| | 1964/1000000 [4:56:54<2875:46:43, 10.37s/it, lr=1e-5, step_loss=0.224]
Steps: 0%| | 1965/1000000 [4:57:02<2613:29:04, 9.43s/it, lr=1e-5, step_loss=0.224][RANK-0]: Step: [1965], local_loss=0.018964793533086777, train_loss=0.03563340753316879, time_cost=2.6557559967041016
+
Steps: 0%| | 1965/1000000 [4:57:02<2613:29:04, 9.43s/it, lr=1e-5, step_loss=0.019]
Steps: 0%| | 1966/1000000 [4:57:08<2317:38:36, 8.36s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [1966], local_loss=0.04124636575579643, train_loss=0.06852997839450836, time_cost=2.799330234527588
+
Steps: 0%| | 1966/1000000 [4:57:08<2317:38:36, 8.36s/it, lr=1e-5, step_loss=0.0412]
Steps: 0%| | 1967/1000000 [4:57:19<2541:05:18, 9.17s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [1967], local_loss=0.11718307435512543, train_loss=0.08709955215454102, time_cost=2.8646626472473145
+
Steps: 0%| | 1967/1000000 [4:57:19<2541:05:18, 9.17s/it, lr=1e-5, step_loss=0.117]
Steps: 0%| | 1968/1000000 [4:57:35<3122:36:02, 11.26s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [1968], local_loss=0.029585106298327446, train_loss=0.04132010042667389, time_cost=7.69018030166626
+
Steps: 0%| | 1968/1000000 [4:57:35<3122:36:02, 11.26s/it, lr=1e-5, step_loss=0.0296]
Steps: 0%| | 1969/1000000 [4:57:47<3221:54:12, 11.62s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [1969], local_loss=0.03226067125797272, train_loss=0.08960774540901184, time_cost=3.425187587738037
+
Steps: 0%| | 1969/1000000 [4:57:47<3221:54:12, 11.62s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 1970/1000000 [4:57:58<3169:04:55, 11.43s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [1970], local_loss=0.03750110790133476, train_loss=0.05247064307332039, time_cost=1.330439805984497
+
Steps: 0%| | 1970/1000000 [4:57:58<3169:04:55, 11.43s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 1971/1000000 [4:58:11<3317:01:49, 11.96s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [1971], local_loss=0.02814469486474991, train_loss=0.06525807082653046, time_cost=9.779847145080566
+
Steps: 0%| | 1971/1000000 [4:58:11<3317:01:49, 11.96s/it, lr=1e-5, step_loss=0.0281]
Steps: 0%| | 1972/1000000 [4:58:16<2685:06:49, 9.69s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [1972], local_loss=0.028816433623433113, train_loss=0.027278319001197815, time_cost=1.641505241394043
+
Steps: 0%| | 1972/1000000 [4:58:16<2685:06:49, 9.69s/it, lr=1e-5, step_loss=0.0288]
Steps: 0%| | 1973/1000000 [4:58:23<2509:52:14, 9.05s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [1973], local_loss=0.027386270463466644, train_loss=0.17105862498283386, time_cost=3.8059566020965576
+
Steps: 0%| | 1973/1000000 [4:58:23<2509:52:14, 9.05s/it, lr=1e-5, step_loss=0.0274]
Steps: 0%| | 1974/1000000 [4:58:32<2474:54:51, 8.93s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [1974], local_loss=0.05017475783824921, train_loss=17.042509078979492, time_cost=3.396259069442749
+
Steps: 0%| | 1974/1000000 [4:58:32<2474:54:51, 8.93s/it, lr=1e-5, step_loss=0.0502]
Steps: 0%| | 1975/1000000 [4:58:44<2741:54:31, 9.89s/it, lr=1e-5, step_loss=0.0502][RANK-0]: Step: [1975], local_loss=0.040768519043922424, train_loss=0.025754272937774658, time_cost=2.466398239135742
+
Steps: 0%| | 1975/1000000 [4:58:44<2741:54:31, 9.89s/it, lr=1e-5, step_loss=0.0408]
Steps: 0%| | 1976/1000000 [4:58:49<2341:47:43, 8.45s/it, lr=1e-5, step_loss=0.0408][RANK-0]: Step: [1976], local_loss=0.03001956269145012, train_loss=0.08676926046609879, time_cost=2.117929697036743
+
Steps: 0%| | 1976/1000000 [4:58:49<2341:47:43, 8.45s/it, lr=1e-5, step_loss=0.03]
Steps: 0%| | 1977/1000000 [4:58:58<2339:28:01, 8.44s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [1977], local_loss=0.01998504437506199, train_loss=0.06909119337797165, time_cost=4.349599123001099
+
Steps: 0%| | 1977/1000000 [4:58:58<2339:28:01, 8.44s/it, lr=1e-5, step_loss=0.02]
Steps: 0%| | 1978/1000000 [4:59:06<2293:02:42, 8.27s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [1978], local_loss=0.035112857818603516, train_loss=0.11177396774291992, time_cost=3.791098117828369
+
Steps: 0%| | 1978/1000000 [4:59:06<2293:02:42, 8.27s/it, lr=1e-5, step_loss=0.0351]
Steps: 0%| | 1979/1000000 [4:59:18<2647:01:44, 9.55s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [1979], local_loss=0.025470063090324402, train_loss=0.08037247508764267, time_cost=9.770132541656494
+
Steps: 0%| | 1979/1000000 [4:59:18<2647:01:44, 9.55s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 1980/1000000 [4:59:28<2673:25:59, 9.64s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [1980], local_loss=0.03950871527194977, train_loss=0.03089488297700882, time_cost=2.1055729389190674
+
Steps: 0%| | 1980/1000000 [4:59:28<2673:25:59, 9.64s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 1981/1000000 [4:59:39<2801:24:42, 10.11s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [1981], local_loss=0.05394367128610611, train_loss=0.03343013674020767, time_cost=3.9054880142211914
+
Steps: 0%| | 1981/1000000 [4:59:39<2801:24:42, 10.11s/it, lr=1e-5, step_loss=0.0539]
Steps: 0%| | 1982/1000000 [4:59:45<2425:49:05, 8.75s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [1982], local_loss=0.030331488698720932, train_loss=0.04216568171977997, time_cost=3.6163532733917236
+
Steps: 0%| | 1982/1000000 [4:59:45<2425:49:05, 8.75s/it, lr=1e-5, step_loss=0.0303]
Steps: 0%| | 1983/1000000 [4:59:53<2414:18:20, 8.71s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [1983], local_loss=0.021762527525424957, train_loss=0.045092739164829254, time_cost=2.25246000289917
+
Steps: 0%| | 1983/1000000 [4:59:53<2414:18:20, 8.71s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 1984/1000000 [5:00:04<2574:28:18, 9.29s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [1984], local_loss=0.11140421777963638, train_loss=0.04667333513498306, time_cost=2.7813937664031982
+
Steps: 0%| | 1984/1000000 [5:00:04<2574:28:18, 9.29s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 1985/1000000 [5:00:15<2718:23:27, 9.81s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [1985], local_loss=0.02271549589931965, train_loss=0.09101849794387817, time_cost=4.609668493270874
+
Steps: 0%| | 1985/1000000 [5:00:15<2718:23:27, 9.81s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 1986/1000000 [5:00:29<3072:38:29, 11.08s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [1986], local_loss=0.031291671097278595, train_loss=0.1517050862312317, time_cost=2.4608893394470215
+
Steps: 0%| | 1986/1000000 [5:00:29<3072:38:29, 11.08s/it, lr=1e-5, step_loss=0.0313]
Steps: 0%| | 1987/1000000 [5:00:38<2902:48:55, 10.47s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [1987], local_loss=0.018866421654820442, train_loss=0.15480780601501465, time_cost=2.967097759246826
+
Steps: 0%| | 1987/1000000 [5:00:38<2902:48:55, 10.47s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 1988/1000000 [5:00:49<2976:29:35, 10.74s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [1988], local_loss=0.03520144522190094, train_loss=0.04993346333503723, time_cost=3.6968977451324463
+
Steps: 0%| | 1988/1000000 [5:00:49<2976:29:35, 10.74s/it, lr=1e-5, step_loss=0.0352]
Steps: 0%| | 1989/1000000 [5:00:55<2560:28:41, 9.24s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [1989], local_loss=0.07837165147066116, train_loss=0.03173092380166054, time_cost=1.9626679420471191
+
Steps: 0%| | 1989/1000000 [5:00:55<2560:28:41, 9.24s/it, lr=1e-5, step_loss=0.0784]
Steps: 0%| | 1990/1000000 [5:01:06<2661:31:49, 9.60s/it, lr=1e-5, step_loss=0.0784][RANK-0]: Step: [1990], local_loss=0.08559215813875198, train_loss=18.787120819091797, time_cost=5.054277420043945
+
Steps: 0%| | 1990/1000000 [5:01:06<2661:31:49, 9.60s/it, lr=1e-5, step_loss=0.0856]
Steps: 0%| | 1991/1000000 [5:01:14<2588:45:25, 9.34s/it, lr=1e-5, step_loss=0.0856][RANK-0]: Step: [1991], local_loss=0.025919340550899506, train_loss=0.038187816739082336, time_cost=1.2061631679534912
+
Steps: 0%| | 1991/1000000 [5:01:14<2588:45:25, 9.34s/it, lr=1e-5, step_loss=0.0259]
Steps: 0%| | 1992/1000000 [5:01:23<2506:09:39, 9.04s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [1992], local_loss=0.07667088508605957, train_loss=0.08015772700309753, time_cost=2.264723539352417
+
Steps: 0%| | 1992/1000000 [5:01:23<2506:09:39, 9.04s/it, lr=1e-5, step_loss=0.0767]
Steps: 0%| | 1993/1000000 [5:01:29<2244:55:19, 8.10s/it, lr=1e-5, step_loss=0.0767][RANK-0]: Step: [1993], local_loss=0.02088772878050804, train_loss=0.04415493458509445, time_cost=1.7599282264709473
+
Steps: 0%| | 1993/1000000 [5:01:29<2244:55:19, 8.10s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 1994/1000000 [5:01:34<1997:43:44, 7.21s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [1994], local_loss=0.02775430493056774, train_loss=0.224502831697464, time_cost=2.643580198287964
+
Steps: 0%| | 1994/1000000 [5:01:34<1997:43:44, 7.21s/it, lr=1e-5, step_loss=0.0278]
Steps: 0%| | 1995/1000000 [5:01:42<2061:14:55, 7.44s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [1995], local_loss=0.19900846481323242, train_loss=0.07071440666913986, time_cost=1.3457262516021729
+
Steps: 0%| | 1995/1000000 [5:01:42<2061:14:55, 7.44s/it, lr=1e-5, step_loss=0.199]
Steps: 0%| | 1996/1000000 [5:01:53<2375:55:19, 8.57s/it, lr=1e-5, step_loss=0.199][RANK-0]: Step: [1996], local_loss=0.07768549025058746, train_loss=0.15342004597187042, time_cost=9.544614553451538
+
Steps: 0%| | 1996/1000000 [5:01:53<2375:55:19, 8.57s/it, lr=1e-5, step_loss=0.0777]
Steps: 0%| | 1997/1000000 [5:01:59<2157:51:54, 7.78s/it, lr=1e-5, step_loss=0.0777][RANK-0]: Step: [1997], local_loss=0.017294403165578842, train_loss=0.03746272996068001, time_cost=1.4635696411132812
+
Steps: 0%| | 1997/1000000 [5:01:59<2157:51:54, 7.78s/it, lr=1e-5, step_loss=0.0173]
Steps: 0%| | 1998/1000000 [5:02:05<2022:07:44, 7.29s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [1998], local_loss=0.1305907517671585, train_loss=0.2347458451986313, time_cost=1.372708797454834
+
Steps: 0%| | 1998/1000000 [5:02:05<2022:07:44, 7.29s/it, lr=1e-5, step_loss=0.131]
Steps: 0%| | 1999/1000000 [5:02:16<2299:13:11, 8.29s/it, lr=1e-5, step_loss=0.131][RANK-0]: Step: [1999], local_loss=0.0656050592660904, train_loss=0.08100365102291107, time_cost=4.356781482696533
+
Steps: 0%| | 1999/1000000 [5:02:16<2299:13:11, 8.29s/it, lr=1e-5, step_loss=0.0656]
Steps: 0%| | 2000/1000000 [5:02:31<2907:41:49, 10.49s/it, lr=1e-5, step_loss=0.0656][RANK-0]: Step: [2000], local_loss=0.0645354688167572, train_loss=0.04893460497260094, time_cost=1.3292787075042725
+09/19/2024 04:12:24 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000
+09/19/2024 04:12:24 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 04:12:24,311] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-19 04:12:24,341] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 04:12:24,342] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 04:12:45,113] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 04:12:45,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 04:13:18,564] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:18,564] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:18,565] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:19,333] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:19,334] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:19,334] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:19,746] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:19,746] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:19,747] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:19,867] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:19,868] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:19,868] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:20,755] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:20,755] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:20,755] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:20,991] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:20,991] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:20,991] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:21,022] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:21,064] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:21,064] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 04:13:21,133] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 04:13:21,133] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 04:13:21,134] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 04:13:21 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/model/diffusion_pytorch_model.safetensors
+09/19/2024 04:14:44 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/scheduler.bin
+09/19/2024 04:14:44 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/sampler.bin
+09/19/2024 04:14:44 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000/random_states_0.pkl
+09/19/2024 04:14:44 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-2000
+
Steps: 0%| | 2000/1000000 [5:04:51<2907:41:49, 10.49s/it, lr=1e-5, step_loss=0.0645]
[... steps 2001-2022 trimmed: the roughly 2 min 20 s checkpoint save inflates the tqdm rate estimate to 51.65 s/it at step 2001, decaying back under 11 s/it by step 2010; lr=1e-5, step_loss in the 0.0115-0.171 range ...]
Steps: 0%| | 2023/1000000 [5:08:42<2648:34:08, 9.55s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [2023], local_loss=0.018890678882598877, train_loss=0.08224207907915115, time_cost=3.2019004821777344
+
Steps: 0%| | 2023/1000000 [5:08:42<2648:34:08, 9.55s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 2024/1000000 [5:08:59<3227:40:15, 11.64s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [2024], local_loss=0.015736347064375877, train_loss=0.056860700249671936, time_cost=7.574483156204224
+
Steps: 0%| | 2024/1000000 [5:08:59<3227:40:15, 11.64s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 2025/1000000 [5:09:14<3547:57:22, 12.80s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [2025], local_loss=0.019109616056084633, train_loss=0.0312703512609005, time_cost=6.675321340560913
+
Steps: 0%| | 2025/1000000 [5:09:15<3547:57:22, 12.80s/it, lr=1e-5, step_loss=0.0191]
Steps: 0%| | 2026/1000000 [5:09:19<2898:12:05, 10.45s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [2026], local_loss=0.040006909519433975, train_loss=0.03363266587257385, time_cost=2.3283286094665527
+
Steps: 0%| | 2026/1000000 [5:09:19<2898:12:05, 10.45s/it, lr=1e-5, step_loss=0.04]
Steps: 0%| | 2027/1000000 [5:09:31<3010:18:07, 10.86s/it, lr=1e-5, step_loss=0.04][RANK-0]: Step: [2027], local_loss=0.07206334918737411, train_loss=0.07288917154073715, time_cost=5.129953384399414
+
Steps: 0%| | 2027/1000000 [5:09:31<3010:18:07, 10.86s/it, lr=1e-5, step_loss=0.0721]
Steps: 0%| | 2028/1000000 [5:09:44<3143:47:54, 11.34s/it, lr=1e-5, step_loss=0.0721][RANK-0]: Step: [2028], local_loss=0.02600010670721531, train_loss=0.1686127632856369, time_cost=9.950172901153564
+
Steps: 0%| | 2028/1000000 [5:09:44<3143:47:54, 11.34s/it, lr=1e-5, step_loss=0.026]
Steps: 0%| | 2029/1000000 [5:09:51<2801:58:21, 10.11s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [2029], local_loss=0.034904349595308304, train_loss=0.10668734461069107, time_cost=1.2899224758148193
+
Steps: 0%| | 2029/1000000 [5:09:51<2801:58:21, 10.11s/it, lr=1e-5, step_loss=0.0349]
Steps: 0%| | 2030/1000000 [5:10:00<2698:37:24, 9.73s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [2030], local_loss=0.054137662053108215, train_loss=0.05083470791578293, time_cost=2.8623931407928467
+
Steps: 0%| | 2030/1000000 [5:10:00<2698:37:24, 9.73s/it, lr=1e-5, step_loss=0.0541]
Steps: 0%| | 2031/1000000 [5:10:09<2656:59:24, 9.58s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [2031], local_loss=0.9865362644195557, train_loss=0.18219350278377533, time_cost=2.743939161300659
+
Steps: 0%| | 2031/1000000 [5:10:09<2656:59:24, 9.58s/it, lr=1e-5, step_loss=0.987]
Steps: 0%| | 2032/1000000 [5:10:24<3111:38:43, 11.22s/it, lr=1e-5, step_loss=0.987][RANK-0]: Step: [2032], local_loss=0.03708773851394653, train_loss=0.037286315113306046, time_cost=6.185055732727051
+
Steps: 0%| | 2032/1000000 [5:10:24<3111:38:43, 11.22s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 2033/1000000 [5:10:33<2874:59:28, 10.37s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [2033], local_loss=0.023439176380634308, train_loss=0.028369279578328133, time_cost=3.783127546310425
+
Steps: 0%| | 2033/1000000 [5:10:33<2874:59:28, 10.37s/it, lr=1e-5, step_loss=0.0234]
Steps: 0%| | 2034/1000000 [5:10:39<2514:18:50, 9.07s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [2034], local_loss=0.06336581707000732, train_loss=0.07640314102172852, time_cost=1.7612152099609375
+
Steps: 0%| | 2034/1000000 [5:10:39<2514:18:50, 9.07s/it, lr=1e-5, step_loss=0.0634]
Steps: 0%| | 2035/1000000 [5:10:44<2178:02:58, 7.86s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [2035], local_loss=0.03445715084671974, train_loss=0.20031258463859558, time_cost=2.252460241317749
+
Steps: 0%| | 2035/1000000 [5:10:44<2178:02:58, 7.86s/it, lr=1e-5, step_loss=0.0345]
Steps: 0%| | 2036/1000000 [5:10:51<2115:16:14, 7.63s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [2036], local_loss=0.023316096514463425, train_loss=0.043257199227809906, time_cost=3.1488497257232666
+
Steps: 0%| | 2036/1000000 [5:10:51<2115:16:14, 7.63s/it, lr=1e-5, step_loss=0.0233]
Steps: 0%| | 2037/1000000 [5:11:01<2379:37:13, 8.58s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [2037], local_loss=0.09553425759077072, train_loss=0.06006016209721565, time_cost=3.2055342197418213
+
Steps: 0%| | 2037/1000000 [5:11:01<2379:37:13, 8.58s/it, lr=1e-5, step_loss=0.0955]
Steps: 0%| | 2038/1000000 [5:11:11<2442:10:33, 8.81s/it, lr=1e-5, step_loss=0.0955][RANK-0]: Step: [2038], local_loss=0.027071956545114517, train_loss=0.07446305453777313, time_cost=1.3178150653839111
+
Steps: 0%| | 2038/1000000 [5:11:11<2442:10:33, 8.81s/it, lr=1e-5, step_loss=0.0271]
Steps: 0%| | 2039/1000000 [5:11:15<2050:28:58, 7.40s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [2039], local_loss=0.015826864168047905, train_loss=0.06763079762458801, time_cost=1.7475714683532715
+
Steps: 0%| | 2039/1000000 [5:11:15<2050:28:58, 7.40s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 2040/1000000 [5:11:22<2020:48:24, 7.29s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [2040], local_loss=0.12202709913253784, train_loss=0.04741983115673065, time_cost=2.572068214416504
+
Steps: 0%| | 2040/1000000 [5:11:22<2020:48:24, 7.29s/it, lr=1e-5, step_loss=0.122]
Steps: 0%| | 2041/1000000 [5:11:35<2498:29:46, 9.01s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [2041], local_loss=0.04291640222072601, train_loss=0.03027096390724182, time_cost=10.31067419052124
+
Steps: 0%| | 2041/1000000 [5:11:35<2498:29:46, 9.01s/it, lr=1e-5, step_loss=0.0429]
Steps: 0%| | 2042/1000000 [5:11:50<3020:43:02, 10.90s/it, lr=1e-5, step_loss=0.0429][RANK-0]: Step: [2042], local_loss=0.11985011398792267, train_loss=0.0485362708568573, time_cost=5.550432920455933
+
Steps: 0%| | 2042/1000000 [5:11:50<3020:43:02, 10.90s/it, lr=1e-5, step_loss=0.12]
Steps: 0%| | 2043/1000000 [5:11:57<2675:37:04, 9.65s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [2043], local_loss=0.020754627883434296, train_loss=0.030227428302168846, time_cost=2.3183319568634033
+
Steps: 0%| | 2043/1000000 [5:11:57<2675:37:04, 9.65s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 2044/1000000 [5:12:07<2690:11:00, 9.70s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [2044], local_loss=0.029701145365834236, train_loss=0.07870054244995117, time_cost=1.2257719039916992
+
Steps: 0%| | 2044/1000000 [5:12:07<2690:11:00, 9.70s/it, lr=1e-5, step_loss=0.0297]
Steps: 0%| | 2045/1000000 [5:12:12<2302:28:15, 8.31s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [2045], local_loss=0.10545893013477325, train_loss=0.14148321747779846, time_cost=2.4974238872528076
+
Steps: 0%| | 2045/1000000 [5:12:12<2302:28:15, 8.31s/it, lr=1e-5, step_loss=0.105]
Steps: 0%| | 2046/1000000 [5:12:19<2183:57:05, 7.88s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [2046], local_loss=0.07230951637029648, train_loss=0.10351470112800598, time_cost=2.6713171005249023
+
Steps: 0%| | 2046/1000000 [5:12:19<2183:57:05, 7.88s/it, lr=1e-5, step_loss=0.0723]
Steps: 0%| | 2047/1000000 [5:12:27<2207:36:12, 7.96s/it, lr=1e-5, step_loss=0.0723][RANK-0]: Step: [2047], local_loss=0.03540943190455437, train_loss=0.055602431297302246, time_cost=3.2490780353546143
+
Steps: 0%| | 2047/1000000 [5:12:27<2207:36:12, 7.96s/it, lr=1e-5, step_loss=0.0354]
Steps: 0%| | 2048/1000000 [5:12:32<1968:32:30, 7.10s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [2048], local_loss=0.18911415338516235, train_loss=27.440820693969727, time_cost=1.2036030292510986
+
Steps: 0%| | 2048/1000000 [5:12:32<1968:32:30, 7.10s/it, lr=1e-5, step_loss=0.189]
Steps: 0%| | 2049/1000000 [5:12:46<2540:10:31, 9.16s/it, lr=1e-5, step_loss=0.189][RANK-0]: Step: [2049], local_loss=0.053182389587163925, train_loss=0.06786572933197021, time_cost=3.85792875289917
+
Steps: 0%| | 2049/1000000 [5:12:46<2540:10:31, 9.16s/it, lr=1e-5, step_loss=0.0532]
Steps: 0%| | 2050/1000000 [5:12:55<2500:59:31, 9.02s/it, lr=1e-5, step_loss=0.0532][RANK-0]: Step: [2050], local_loss=0.08834231644868851, train_loss=0.04028836637735367, time_cost=2.6141462326049805
+
Steps: 0%| | 2050/1000000 [5:12:55<2500:59:31, 9.02s/it, lr=1e-5, step_loss=0.0883]
Steps: 0%| | 2051/1000000 [5:13:04<2533:05:48, 9.14s/it, lr=1e-5, step_loss=0.0883][RANK-0]: Step: [2051], local_loss=0.06653907150030136, train_loss=0.10363259166479111, time_cost=3.03804349899292
+
Steps: 0%| | 2051/1000000 [5:13:04<2533:05:48, 9.14s/it, lr=1e-5, step_loss=0.0665]
Steps: 0%| | 2052/1000000 [5:13:19<3039:30:42, 10.96s/it, lr=1e-5, step_loss=0.0665][RANK-0]: Step: [2052], local_loss=0.07662822306156158, train_loss=0.08682503551244736, time_cost=1.2441446781158447
+
Steps: 0%| | 2052/1000000 [5:13:19<3039:30:42, 10.96s/it, lr=1e-5, step_loss=0.0766]
Steps: 0%| | 2053/1000000 [5:13:27<2771:41:40, 10.00s/it, lr=1e-5, step_loss=0.0766][RANK-0]: Step: [2053], local_loss=0.025399100035429, train_loss=0.03617566078901291, time_cost=1.2354907989501953
+
Steps: 0%| | 2053/1000000 [5:13:27<2771:41:40, 10.00s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 2054/1000000 [5:13:34<2535:10:02, 9.15s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [2054], local_loss=0.0711875781416893, train_loss=0.06410616636276245, time_cost=2.605936288833618
+
Steps: 0%| | 2054/1000000 [5:13:34<2535:10:02, 9.15s/it, lr=1e-5, step_loss=0.0712]
Steps: 0%| | 2055/1000000 [5:13:39<2186:38:02, 7.89s/it, lr=1e-5, step_loss=0.0712][RANK-0]: Step: [2055], local_loss=0.058017976582050323, train_loss=0.0466318279504776, time_cost=2.401844024658203
+
Steps: 0%| | 2055/1000000 [5:13:39<2186:38:02, 7.89s/it, lr=1e-5, step_loss=0.058]
Steps: 0%| | 2056/1000000 [5:13:44<1914:22:24, 6.91s/it, lr=1e-5, step_loss=0.058][RANK-0]: Step: [2056], local_loss=0.020887859165668488, train_loss=0.0411524660885334, time_cost=3.4732627868652344
+
Steps: 0%| | 2056/1000000 [5:13:44<1914:22:24, 6.91s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 2057/1000000 [5:13:49<1760:06:30, 6.35s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [2057], local_loss=0.021206554025411606, train_loss=0.1355871707201004, time_cost=2.137251138687134
+
Steps: 0%| | 2057/1000000 [5:13:49<1760:06:30, 6.35s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 2058/1000000 [5:13:54<1660:22:04, 5.99s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [2058], local_loss=0.08498860150575638, train_loss=0.20887459814548492, time_cost=2.011768102645874
+
Steps: 0%| | 2058/1000000 [5:13:54<1660:22:04, 5.99s/it, lr=1e-5, step_loss=0.085]
Steps: 0%| | 2059/1000000 [5:14:06<2180:25:12, 7.87s/it, lr=1e-5, step_loss=0.085][RANK-0]: Step: [2059], local_loss=0.04009547829627991, train_loss=0.04573340341448784, time_cost=2.844843864440918
+
Steps: 0%| | 2059/1000000 [5:14:06<2180:25:12, 7.87s/it, lr=1e-5, step_loss=0.0401]
Steps: 0%| | 2060/1000000 [5:14:17<2413:45:44, 8.71s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [2060], local_loss=0.021457819268107414, train_loss=0.05071757733821869, time_cost=2.8108105659484863
+
Steps: 0%| | 2060/1000000 [5:14:17<2413:45:44, 8.71s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 2061/1000000 [5:14:22<2086:49:31, 7.53s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [2061], local_loss=0.024611065164208412, train_loss=0.03547082841396332, time_cost=2.0493431091308594
+
Steps: 0%| | 2061/1000000 [5:14:22<2086:49:31, 7.53s/it, lr=1e-5, step_loss=0.0246]
Steps: 0%| | 2062/1000000 [5:14:26<1824:14:55, 6.58s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [2062], local_loss=0.02337796799838543, train_loss=0.05869235470890999, time_cost=1.7552597522735596
+
Steps: 0%| | 2062/1000000 [5:14:26<1824:14:55, 6.58s/it, lr=1e-5, step_loss=0.0234]
Steps: 0%| | 2063/1000000 [5:14:35<2028:17:29, 7.32s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [2063], local_loss=0.043036431074142456, train_loss=0.046110980212688446, time_cost=1.2158472537994385
+
Steps: 0%| | 2063/1000000 [5:14:35<2028:17:29, 7.32s/it, lr=1e-5, step_loss=0.043]
Steps: 0%| | 2064/1000000 [5:14:42<2034:11:55, 7.34s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [2064], local_loss=0.07301726192235947, train_loss=0.06633701920509338, time_cost=1.2228484153747559
+
Steps: 0%| | 2064/1000000 [5:14:42<2034:11:55, 7.34s/it, lr=1e-5, step_loss=0.073]
Steps: 0%| | 2065/1000000 [5:14:47<1838:03:28, 6.63s/it, lr=1e-5, step_loss=0.073][RANK-0]: Step: [2065], local_loss=0.03906027972698212, train_loss=0.05935895815491676, time_cost=1.9898970127105713
+
Steps: 0%| | 2065/1000000 [5:14:47<1838:03:28, 6.63s/it, lr=1e-5, step_loss=0.0391]
Steps: 0%| | 2066/1000000 [5:14:56<2006:45:08, 7.24s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [2066], local_loss=0.02074856124818325, train_loss=0.05537158250808716, time_cost=3.488821506500244
+
Steps: 0%| | 2066/1000000 [5:14:56<2006:45:08, 7.24s/it, lr=1e-5, step_loss=0.0207]
Steps: 0%| | 2067/1000000 [5:15:00<1761:17:33, 6.35s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [2067], local_loss=0.0186933446675539, train_loss=0.028030984103679657, time_cost=1.2388098239898682
+
Steps: 0%| | 2067/1000000 [5:15:00<1761:17:33, 6.35s/it, lr=1e-5, step_loss=0.0187]
Steps: 0%| | 2068/1000000 [5:15:05<1649:12:32, 5.95s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [2068], local_loss=0.12258914113044739, train_loss=0.08122798800468445, time_cost=2.030925989151001
+
Steps: 0%| | 2068/1000000 [5:15:05<1649:12:32, 5.95s/it, lr=1e-5, step_loss=0.123]
Steps: 0%| | 2069/1000000 [5:15:12<1684:23:40, 6.08s/it, lr=1e-5, step_loss=0.123][RANK-0]: Step: [2069], local_loss=0.2900035083293915, train_loss=0.06300953030586243, time_cost=1.2660703659057617
+
Steps: 0%| | 2069/1000000 [5:15:12<1684:23:40, 6.08s/it, lr=1e-5, step_loss=0.29]
Steps: 0%| | 2070/1000000 [5:15:18<1697:07:18, 6.12s/it, lr=1e-5, step_loss=0.29][RANK-0]: Step: [2070], local_loss=0.0209070835262537, train_loss=0.058871373534202576, time_cost=5.083691120147705
+
Steps: 0%| | 2070/1000000 [5:15:18<1697:07:18, 6.12s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 2071/1000000 [5:15:24<1700:43:23, 6.14s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [2071], local_loss=0.08157698065042496, train_loss=0.11843512207269669, time_cost=2.3155856132507324
+
Steps: 0%| | 2071/1000000 [5:15:24<1700:43:23, 6.14s/it, lr=1e-5, step_loss=0.0816]
Steps: 0%| | 2072/1000000 [5:15:37<2293:50:09, 8.27s/it, lr=1e-5, step_loss=0.0816][RANK-0]: Step: [2072], local_loss=0.02729886956512928, train_loss=0.09638357907533646, time_cost=3.2034640312194824
+
Steps: 0%| | 2072/1000000 [5:15:37<2293:50:09, 8.27s/it, lr=1e-5, step_loss=0.0273]
Steps: 0%| | 2073/1000000 [5:15:50<2623:59:48, 9.47s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [2073], local_loss=0.018292007967829704, train_loss=0.02594023570418358, time_cost=2.425156354904175
+
Steps: 0%| | 2073/1000000 [5:15:50<2623:59:48, 9.47s/it, lr=1e-5, step_loss=0.0183]
Steps: 0%| | 2074/1000000 [5:15:56<2319:31:15, 8.37s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [2074], local_loss=0.02134394831955433, train_loss=0.07437534630298615, time_cost=1.2643287181854248
+
Steps: 0%| | 2074/1000000 [5:15:56<2319:31:15, 8.37s/it, lr=1e-5, step_loss=0.0213]
Steps: 0%| | 2075/1000000 [5:16:07<2558:27:39, 9.23s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [2075], local_loss=0.07663778215646744, train_loss=0.05841200426220894, time_cost=1.931114912033081
+
Steps: 0%| | 2075/1000000 [5:16:07<2558:27:39, 9.23s/it, lr=1e-5, step_loss=0.0766]
Steps: 0%| | 2076/1000000 [5:16:14<2370:52:26, 8.55s/it, lr=1e-5, step_loss=0.0766][RANK-0]: Step: [2076], local_loss=0.03076833486557007, train_loss=0.09121610224246979, time_cost=2.523502826690674
+
Steps: 0%| | 2076/1000000 [5:16:14<2370:52:26, 8.55s/it, lr=1e-5, step_loss=0.0308]
Steps: 0%| | 2077/1000000 [5:16:21<2280:53:05, 8.23s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [2077], local_loss=0.01937999576330185, train_loss=0.04185192286968231, time_cost=5.022191286087036
+
Steps: 0%| | 2077/1000000 [5:16:21<2280:53:05, 8.23s/it, lr=1e-5, step_loss=0.0194]
Steps: 0%| | 2078/1000000 [5:16:30<2362:20:28, 8.52s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [2078], local_loss=0.07921837270259857, train_loss=0.035608403384685516, time_cost=1.6808559894561768
+
Steps: 0%| | 2078/1000000 [5:16:30<2362:20:28, 8.52s/it, lr=1e-5, step_loss=0.0792]
Steps: 0%| | 2079/1000000 [5:16:43<2698:35:21, 9.74s/it, lr=1e-5, step_loss=0.0792][RANK-0]: Step: [2079], local_loss=0.03524415194988251, train_loss=0.040966346859931946, time_cost=10.271817445755005
+
Steps: 0%| | 2079/1000000 [5:16:43<2698:35:21, 9.74s/it, lr=1e-5, step_loss=0.0352]
Steps: 0%| | 2080/1000000 [5:16:48<2305:25:48, 8.32s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [2080], local_loss=0.025457706302404404, train_loss=0.0545668825507164, time_cost=1.2332141399383545
+
Steps: 0%| | 2080/1000000 [5:16:48<2305:25:48, 8.32s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 2081/1000000 [5:17:00<2603:35:57, 9.39s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [2081], local_loss=0.030504979193210602, train_loss=0.044824738055467606, time_cost=1.2205555438995361
+
Steps: 0%| | 2081/1000000 [5:17:00<2603:35:57, 9.39s/it, lr=1e-5, step_loss=0.0305]
Steps: 0%| | 2082/1000000 [5:17:06<2294:58:00, 8.28s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [2082], local_loss=0.01569518819451332, train_loss=0.05210919678211212, time_cost=1.5163452625274658
+
Steps: 0%| | 2082/1000000 [5:17:06<2294:58:00, 8.28s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 2083/1000000 [5:17:21<2921:56:53, 10.54s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [2083], local_loss=0.03273739665746689, train_loss=0.041404739022254944, time_cost=1.2516653537750244
+
Steps: 0%| | 2083/1000000 [5:17:21<2921:56:53, 10.54s/it, lr=1e-5, step_loss=0.0327]
Steps: 0%| | 2084/1000000 [5:17:30<2771:49:10, 10.00s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [2084], local_loss=0.026878446340560913, train_loss=0.04855310916900635, time_cost=6.365127325057983
+
Steps: 0%| | 2084/1000000 [5:17:30<2771:49:10, 10.00s/it, lr=1e-5, step_loss=0.0269]
Steps: 0%| | 2085/1000000 [5:17:35<2360:40:27, 8.52s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [2085], local_loss=0.21402660012245178, train_loss=0.052664030343294144, time_cost=3.703484535217285
+
Steps: 0%| | 2085/1000000 [5:17:35<2360:40:27, 8.52s/it, lr=1e-5, step_loss=0.214]
Steps: 0%| | 2086/1000000 [5:17:45<2433:16:51, 8.78s/it, lr=1e-5, step_loss=0.214][RANK-0]: Step: [2086], local_loss=0.029698358848690987, train_loss=0.03588242828845978, time_cost=1.4495468139648438
+
Steps: 0%| | 2086/1000000 [5:17:45<2433:16:51, 8.78s/it, lr=1e-5, step_loss=0.0297]
Steps: 0%| | 2087/1000000 [5:17:49<2058:05:34, 7.42s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [2087], local_loss=0.02941633202135563, train_loss=0.13681086897850037, time_cost=1.805757999420166
+
Steps: 0%| | 2087/1000000 [5:17:49<2058:05:34, 7.42s/it, lr=1e-5, step_loss=0.0294]
Steps: 0%| | 2088/1000000 [5:17:58<2188:33:15, 7.90s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [2088], local_loss=0.038651082664728165, train_loss=0.036336809396743774, time_cost=2.7333450317382812
+
Steps: 0%| | 2088/1000000 [5:17:58<2188:33:15, 7.90s/it, lr=1e-5, step_loss=0.0387]
Steps: 0%| | 2089/1000000 [5:18:04<2007:08:58, 7.24s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [2089], local_loss=0.01910080388188362, train_loss=0.07129757851362228, time_cost=1.3483972549438477
+
Steps: 0%| | 2089/1000000 [5:18:04<2007:08:58, 7.24s/it, lr=1e-5, step_loss=0.0191]
Steps: 0%| | 2090/1000000 [5:18:12<2073:06:20, 7.48s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [2090], local_loss=0.07531622052192688, train_loss=0.05253918841481209, time_cost=1.2370591163635254
+
Steps: 0%| | 2090/1000000 [5:18:12<2073:06:20, 7.48s/it, lr=1e-5, step_loss=0.0753]
Steps: 0%| | 2091/1000000 [5:18:20<2185:26:12, 7.88s/it, lr=1e-5, step_loss=0.0753][RANK-0]: Step: [2091], local_loss=0.050190962851047516, train_loss=0.07324200868606567, time_cost=5.77104115486145
+
Steps: 0%| | 2091/1000000 [5:18:20<2185:26:12, 7.88s/it, lr=1e-5, step_loss=0.0502]
Steps: 0%| | 2092/1000000 [5:18:27<2103:32:48, 7.59s/it, lr=1e-5, step_loss=0.0502][RANK-0]: Step: [2092], local_loss=0.053684644401073456, train_loss=0.058486536145210266, time_cost=3.33235502243042
+
Steps: 0%| | 2092/1000000 [5:18:27<2103:32:48, 7.59s/it, lr=1e-5, step_loss=0.0537]
Steps: 0%| | 2093/1000000 [5:18:38<2367:13:40, 8.54s/it, lr=1e-5, step_loss=0.0537][RANK-0]: Step: [2093], local_loss=0.04010849446058273, train_loss=0.1576637327671051, time_cost=1.2303814888000488
+
Steps: 0%| | 2093/1000000 [5:18:38<2367:13:40, 8.54s/it, lr=1e-5, step_loss=0.0401]
Steps: 0%| | 2094/1000000 [5:18:44<2188:39:27, 7.90s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [2094], local_loss=0.24792012572288513, train_loss=0.06475497037172318, time_cost=5.168217658996582
+
Steps: 0%| | 2094/1000000 [5:18:44<2188:39:27, 7.90s/it, lr=1e-5, step_loss=0.248]
Steps: 0%| | 2095/1000000 [5:18:52<2151:26:11, 7.76s/it, lr=1e-5, step_loss=0.248][RANK-0]: Step: [2095], local_loss=0.06575607508420944, train_loss=0.032401468604803085, time_cost=1.2320423126220703
+
Steps: 0%| | 2095/1000000 [5:18:52<2151:26:11, 7.76s/it, lr=1e-5, step_loss=0.0658]
Steps: 0%| | 2096/1000000 [5:19:03<2443:58:55, 8.82s/it, lr=1e-5, step_loss=0.0658][RANK-0]: Step: [2096], local_loss=0.05738166347146034, train_loss=0.07255673408508301, time_cost=1.2170398235321045
+
Steps: 0%| | 2096/1000000 [5:19:03<2443:58:55, 8.82s/it, lr=1e-5, step_loss=0.0574]
Steps: 0%| | 2097/1000000 [5:19:11<2362:19:19, 8.52s/it, lr=1e-5, step_loss=0.0574][RANK-0]: Step: [2097], local_loss=0.0298018679022789, train_loss=0.04126003012061119, time_cost=5.852292537689209
+
Steps: 0%| | 2097/1000000 [5:19:11<2362:19:19, 8.52s/it, lr=1e-5, step_loss=0.0298]
Steps: 0%| | 2098/1000000 [5:19:21<2518:20:53, 9.09s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [2098], local_loss=0.0212958212941885, train_loss=17.33989715576172, time_cost=1.3792433738708496
+
Steps: 0%| | 2098/1000000 [5:19:21<2518:20:53, 9.09s/it, lr=1e-5, step_loss=0.0213]
Steps: 0%| | 2099/1000000 [5:19:34<2840:25:26, 10.25s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [2099], local_loss=0.02123851142823696, train_loss=0.05660022422671318, time_cost=5.376550197601318
+
Steps: 0%| | 2099/1000000 [5:19:34<2840:25:26, 10.25s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 2100/1000000 [5:19:45<2859:15:55, 10.32s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [2100], local_loss=0.018706629052758217, train_loss=0.058721039444208145, time_cost=1.5299866199493408
+
Steps: 0%| | 2100/1000000 [5:19:45<2859:15:55, 10.32s/it, lr=1e-5, step_loss=0.0187]
Steps: 0%| | 2101/1000000 [5:19:50<2437:27:05, 8.79s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [2101], local_loss=0.03495993837714195, train_loss=0.06410686671733856, time_cost=1.454622745513916
+
Steps: 0%| | 2101/1000000 [5:19:50<2437:27:05, 8.79s/it, lr=1e-5, step_loss=0.035]
Steps: 0%| | 2102/1000000 [5:19:59<2439:43:18, 8.80s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [2102], local_loss=0.03812012821435928, train_loss=0.0625959262251854, time_cost=2.801532030105591
+
Steps: 0%| | 2102/1000000 [5:19:59<2439:43:18, 8.80s/it, lr=1e-5, step_loss=0.0381]
Steps: 0%| | 2103/1000000 [5:20:11<2725:40:54, 9.83s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [2103], local_loss=0.06598255783319473, train_loss=0.18454477190971375, time_cost=2.6265742778778076
+
Steps: 0%| | 2103/1000000 [5:20:11<2725:40:54, 9.83s/it, lr=1e-5, step_loss=0.066]
Steps: 0%| | 2104/1000000 [5:20:17<2419:28:46, 8.73s/it, lr=1e-5, step_loss=0.066][RANK-0]: Step: [2104], local_loss=0.01462536584585905, train_loss=0.03505076467990875, time_cost=1.8446409702301025
+
Steps: 0%| | 2104/1000000 [5:20:17<2419:28:46, 8.73s/it, lr=1e-5, step_loss=0.0146]
Steps: 0%| | 2105/1000000 [5:20:27<2496:28:28, 9.01s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [2105], local_loss=0.11728981137275696, train_loss=0.053073324263095856, time_cost=4.035457372665405
+
Steps: 0%| | 2105/1000000 [5:20:27<2496:28:28, 9.01s/it, lr=1e-5, step_loss=0.117]
Steps: 0%| | 2106/1000000 [5:20:40<2800:53:21, 10.10s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [2106], local_loss=0.027699731290340424, train_loss=0.05937337875366211, time_cost=4.628870248794556
+
Steps: 0%| | 2106/1000000 [5:20:40<2800:53:21, 10.10s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 2107/1000000 [5:20:44<2299:29:41, 8.30s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [2107], local_loss=0.27218014001846313, train_loss=0.09280169010162354, time_cost=1.5270106792449951
+
Steps: 0%| | 2107/1000000 [5:20:44<2299:29:41, 8.30s/it, lr=1e-5, step_loss=0.272]
Steps: 0%| | 2108/1000000 [5:20:54<2433:11:28, 8.78s/it, lr=1e-5, step_loss=0.272][RANK-0]: Step: [2108], local_loss=0.06203823536634445, train_loss=0.0619187131524086, time_cost=1.2951664924621582
+
Steps: 0%| | 2108/1000000 [5:20:54<2433:11:28, 8.78s/it, lr=1e-5, step_loss=0.062]
Steps: 0%| | 2109/1000000 [5:21:02<2398:46:34, 8.65s/it, lr=1e-5, step_loss=0.062][RANK-0]: Step: [2109], local_loss=0.42043405771255493, train_loss=0.08232241868972778, time_cost=3.3796746730804443
+
Steps: 0%| | 2109/1000000 [5:21:02<2398:46:34, 8.65s/it, lr=1e-5, step_loss=0.42]
Steps: 0%| | 2110/1000000 [5:21:08<2180:54:21, 7.87s/it, lr=1e-5, step_loss=0.42][RANK-0]: Step: [2110], local_loss=0.014881640672683716, train_loss=0.07776843011379242, time_cost=1.3133628368377686
+
Steps: 0%| | 2110/1000000 [5:21:08<2180:54:21, 7.87s/it, lr=1e-5, step_loss=0.0149]
Steps: 0%| | 2111/1000000 [5:21:16<2171:48:18, 7.84s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [2111], local_loss=0.028994204476475716, train_loss=0.025750452652573586, time_cost=3.8006081581115723
+
Steps: 0%| | 2111/1000000 [5:21:16<2171:48:18, 7.84s/it, lr=1e-5, step_loss=0.029]
Steps: 0%| | 2112/1000000 [5:21:23<2122:41:45, 7.66s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [2112], local_loss=0.030198892578482628, train_loss=0.05687549710273743, time_cost=3.13899564743042
+
Steps: 0%| | 2112/1000000 [5:21:23<2122:41:45, 7.66s/it, lr=1e-5, step_loss=0.0302]
Steps: 0%| | 2113/1000000 [5:21:32<2207:28:55, 7.96s/it, lr=1e-5, step_loss=0.0302][RANK-0]: Step: [2113], local_loss=0.024563202634453773, train_loss=0.1657593846321106, time_cost=2.4965009689331055
+
Steps: 0%| | 2113/1000000 [5:21:32<2207:28:55, 7.96s/it, lr=1e-5, step_loss=0.0246]
Steps: 0%| | 2114/1000000 [5:21:39<2189:41:51, 7.90s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [2114], local_loss=0.06723610311746597, train_loss=0.032342709600925446, time_cost=1.3147797584533691
+
Steps: 0%| | 2114/1000000 [5:21:39<2189:41:51, 7.90s/it, lr=1e-5, step_loss=0.0672]
Steps: 0%| | 2115/1000000 [5:21:45<2004:33:52, 7.23s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [2115], local_loss=0.033222608268260956, train_loss=0.04678059369325638, time_cost=1.317763090133667
+
Steps: 0%| | 2115/1000000 [5:21:45<2004:33:52, 7.23s/it, lr=1e-5, step_loss=0.0332]
Steps: 0%| | 2116/1000000 [5:21:59<2537:02:16, 9.15s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [2116], local_loss=0.0571679025888443, train_loss=0.05998755991458893, time_cost=4.047664642333984
+
Steps: 0%| | 2116/1000000 [5:21:59<2537:02:16, 9.15s/it, lr=1e-5, step_loss=0.0572]
Steps: 0%| | 2117/1000000 [5:22:09<2600:18:30, 9.38s/it, lr=1e-5, step_loss=0.0572][RANK-0]: Step: [2117], local_loss=0.1332329511642456, train_loss=0.0953335165977478, time_cost=1.3265407085418701
+
Steps: 0%| | 2117/1000000 [5:22:09<2600:18:30, 9.38s/it, lr=1e-5, step_loss=0.133]
Steps: 0%| | 2118/1000000 [5:22:20<2752:05:08, 9.93s/it, lr=1e-5, step_loss=0.133][RANK-0]: Step: [2118], local_loss=0.05075814202427864, train_loss=0.0642554834485054, time_cost=2.0393788814544678
+
Steps: 0%| | 2118/1000000 [5:22:20<2752:05:08, 9.93s/it, lr=1e-5, step_loss=0.0508]
Steps: 0%| | 2119/1000000 [5:22:28<2596:00:41, 9.37s/it, lr=1e-5, step_loss=0.0508][RANK-0]: Step: [2119], local_loss=0.02382541447877884, train_loss=0.0510995090007782, time_cost=3.1955888271331787
+
Steps: 0%| | 2119/1000000 [5:22:28<2596:00:41, 9.37s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 2120/1000000 [5:22:33<2254:28:50, 8.13s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [2120], local_loss=0.06074592471122742, train_loss=0.0767986848950386, time_cost=3.8450329303741455
+
Steps: 0%| | 2120/1000000 [5:22:33<2254:28:50, 8.13s/it, lr=1e-5, step_loss=0.0607]
Steps: 0%| | 2121/1000000 [5:22:44<2505:18:33, 9.04s/it, lr=1e-5, step_loss=0.0607][RANK-0]: Step: [2121], local_loss=0.05599464848637581, train_loss=0.044941894710063934, time_cost=3.3589305877685547
+
Steps: 0%| | 2121/1000000 [5:22:44<2505:18:33, 9.04s/it, lr=1e-5, step_loss=0.056]
Steps: 0%| | 2122/1000000 [5:22:56<2748:41:00, 9.92s/it, lr=1e-5, step_loss=0.056][RANK-0]: Step: [2122], local_loss=0.033702678978443146, train_loss=0.053579676896333694, time_cost=4.563369035720825
+
Steps: 0%| | 2122/1000000 [5:22:56<2748:41:00, 9.92s/it, lr=1e-5, step_loss=0.0337]
Steps: 0%| | 2123/1000000 [5:23:01<2296:08:40, 8.28s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [2123], local_loss=0.020627304911613464, train_loss=0.08037663996219635, time_cost=1.6892223358154297
+
Steps: 0%| | 2123/1000000 [5:23:01<2296:08:40, 8.28s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 2124/1000000 [5:23:06<2043:18:37, 7.37s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [2124], local_loss=0.04955914616584778, train_loss=0.047433897852897644, time_cost=2.599004030227661
+
Steps: 0%| | 2124/1000000 [5:23:06<2043:18:37, 7.37s/it, lr=1e-5, step_loss=0.0496]
Steps: 0%| | 2125/1000000 [5:23:21<2655:37:20, 9.58s/it, lr=1e-5, step_loss=0.0496][RANK-0]: Step: [2125], local_loss=0.34335529804229736, train_loss=0.08146888762712479, time_cost=10.16123104095459
+
Steps: 0%| | 2125/1000000 [5:23:21<2655:37:20, 9.58s/it, lr=1e-5, step_loss=0.343]
Steps: 0%| | 2126/1000000 [5:23:26<2294:07:28, 8.28s/it, lr=1e-5, step_loss=0.343][RANK-0]: Step: [2126], local_loss=0.06352677941322327, train_loss=0.09878277778625488, time_cost=4.228528738021851
+
Steps: 0%| | 2126/1000000 [5:23:26<2294:07:28, 8.28s/it, lr=1e-5, step_loss=0.0635]
Steps: 0%| | 2127/1000000 [5:23:37<2556:37:40, 9.22s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [2127], local_loss=0.07507534325122833, train_loss=0.12724952399730682, time_cost=1.9056484699249268
+
Steps: 0%| | 2127/1000000 [5:23:37<2556:37:40, 9.22s/it, lr=1e-5, step_loss=0.0751]
Steps: 0%| | 2128/1000000 [5:23:45<2388:29:32, 8.62s/it, lr=1e-5, step_loss=0.0751][RANK-0]: Step: [2128], local_loss=0.0231932383030653, train_loss=0.1805446743965149, time_cost=1.2869822978973389
+
Steps: 0%| | 2128/1000000 [5:23:45<2388:29:32, 8.62s/it, lr=1e-5, step_loss=0.0232]
Steps: 0%| | 2129/1000000 [5:23:57<2703:30:35, 9.75s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [2129], local_loss=0.028942180797457695, train_loss=0.050765544176101685, time_cost=4.759292125701904
+
Steps: 0%| | 2129/1000000 [5:23:57<2703:30:35, 9.75s/it, lr=1e-5, step_loss=0.0289]
Steps: 0%| | 2130/1000000 [5:24:02<2338:29:23, 8.44s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [2130], local_loss=0.05649447813630104, train_loss=0.03890962153673172, time_cost=1.2042865753173828
+
Steps: 0%| | 2130/1000000 [5:24:02<2338:29:23, 8.44s/it, lr=1e-5, step_loss=0.0565]
Steps: 0%| | 2131/1000000 [5:24:08<2131:51:01, 7.69s/it, lr=1e-5, step_loss=0.0565][RANK-0]: Step: [2131], local_loss=0.046997345983982086, train_loss=0.05933280289173126, time_cost=3.4719011783599854
+
Steps: 0%| | 2131/1000000 [5:24:08<2131:51:01, 7.69s/it, lr=1e-5, step_loss=0.047]
Steps: 0%| | 2132/1000000 [5:24:21<2530:07:56, 9.13s/it, lr=1e-5, step_loss=0.047][RANK-0]: Step: [2132], local_loss=0.025994621217250824, train_loss=0.058295074850320816, time_cost=4.63982081413269
+
Steps: 0%| | 2132/1000000 [5:24:21<2530:07:56, 9.13s/it, lr=1e-5, step_loss=0.026]
Steps: 0%| | 2133/1000000 [5:24:32<2739:19:54, 9.88s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [2133], local_loss=0.018195394426584244, train_loss=0.04409608244895935, time_cost=2.403930902481079
+
Steps: 0%| | 2133/1000000 [5:24:32<2739:19:54, 9.88s/it, lr=1e-5, step_loss=0.0182]
Steps: 0%| | 2134/1000000 [5:24:45<2992:15:20, 10.80s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [2134], local_loss=0.022355256602168083, train_loss=0.04442834481596947, time_cost=4.783327341079712
+
Steps: 0%| | 2134/1000000 [5:24:45<2992:15:20, 10.80s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 2135/1000000 [5:25:00<3308:28:02, 11.94s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [2135], local_loss=0.02764635905623436, train_loss=0.033821314573287964, time_cost=3.78727388381958
+
Steps: 0%| | 2135/1000000 [5:25:00<3308:28:02, 11.94s/it, lr=1e-5, step_loss=0.0276]
Steps: 0%| | 2136/1000000 [5:25:12<3315:12:00, 11.96s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [2136], local_loss=0.03719344362616539, train_loss=0.06594352424144745, time_cost=9.91049599647522
+
Steps: 0%| | 2136/1000000 [5:25:12<3315:12:00, 11.96s/it, lr=1e-5, step_loss=0.0372]
Steps: 0%| | 2137/1000000 [5:25:19<2915:24:57, 10.52s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [2137], local_loss=0.020725175738334656, train_loss=0.07982327044010162, time_cost=2.743135690689087
+
Steps: 0%| | 2137/1000000 [5:25:19<2915:24:57, 10.52s/it, lr=1e-5, step_loss=0.0207]
Steps: 0%| | 2138/1000000 [5:25:31<3056:39:51, 11.03s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [2138], local_loss=0.02617698721587658, train_loss=37.57206726074219, time_cost=3.9461605548858643
+
Steps: 0%| | 2138/1000000 [5:25:31<3056:39:51, 11.03s/it, lr=1e-5, step_loss=0.0262]
Steps: 0%| | 2139/1000000 [5:25:37<2638:08:29, 9.52s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [2139], local_loss=0.021909408271312714, train_loss=0.08171199262142181, time_cost=1.6705515384674072
+
Steps: 0%| | 2139/1000000 [5:25:37<2638:08:29, 9.52s/it, lr=1e-5, step_loss=0.0219]
Steps: 0%| | 2140/1000000 [5:25:52<3037:09:58, 10.96s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [2140], local_loss=0.0852806493639946, train_loss=0.048544421792030334, time_cost=1.2523815631866455
+
Steps: 0%| | 2140/1000000 [5:25:52<3037:09:58, 10.96s/it, lr=1e-5, step_loss=0.0853]
Steps: 0%| | 2141/1000000 [5:25:57<2571:12:11, 9.28s/it, lr=1e-5, step_loss=0.0853][RANK-0]: Step: [2141], local_loss=0.062221888452768326, train_loss=0.07812042534351349, time_cost=4.066425085067749
+
Steps: 0%| | 2141/1000000 [5:25:57<2571:12:11, 9.28s/it, lr=1e-5, step_loss=0.0622]
Steps: 0%| | 2142/1000000 [5:26:09<2829:44:12, 10.21s/it, lr=1e-5, step_loss=0.0622][RANK-0]: Step: [2142], local_loss=0.0172467902302742, train_loss=0.08166475594043732, time_cost=2.3061025142669678
+
Steps: 0%| | 2142/1000000 [5:26:09<2829:44:12, 10.21s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 2143/1000000 [5:26:15<2480:49:57, 8.95s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [2143], local_loss=0.03178028762340546, train_loss=0.03946114331483841, time_cost=4.2347564697265625
+
Steps: 0%| | 2143/1000000 [5:26:15<2480:49:57, 8.95s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 2144/1000000 [5:26:29<2866:57:54, 10.34s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [2144], local_loss=0.12565603852272034, train_loss=0.06420095264911652, time_cost=6.162545442581177
+
Steps: 0%| | 2144/1000000 [5:26:29<2866:57:54, 10.34s/it, lr=1e-5, step_loss=0.126]
Steps: 0%| | 2145/1000000 [5:26:33<2376:52:27, 8.58s/it, lr=1e-5, step_loss=0.126][RANK-0]: Step: [2145], local_loss=0.01832415908575058, train_loss=0.022114211693406105, time_cost=1.4113070964813232
+
Steps: 0%| | 2145/1000000 [5:26:33<2376:52:27, 8.58s/it, lr=1e-5, step_loss=0.0183]
Steps: 0%| | 2146/1000000 [5:26:40<2241:19:25, 8.09s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [2146], local_loss=0.019501516595482826, train_loss=0.046986136585474014, time_cost=2.491705894470215
+
Steps: 0%| | 2146/1000000 [5:26:40<2241:19:25, 8.09s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 2147/1000000 [5:26:45<1981:33:16, 7.15s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [2147], local_loss=0.022354839369654655, train_loss=0.02807723917067051, time_cost=2.08123779296875
+
Steps: 0%| | 2147/1000000 [5:26:45<1981:33:16, 7.15s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 2148/1000000 [5:26:55<2179:24:45, 7.86s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [2148], local_loss=0.016701482236385345, train_loss=0.04698142409324646, time_cost=1.2733988761901855
+
Steps: 0%| | 2148/1000000 [5:26:55<2179:24:45, 7.86s/it, lr=1e-5, step_loss=0.0167]
Steps: 0%| | 2149/1000000 [5:27:09<2697:45:38, 9.73s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [2149], local_loss=0.05748435854911804, train_loss=0.08652770519256592, time_cost=1.4622001647949219
+
Steps: 0%| | 2149/1000000 [5:27:09<2697:45:38, 9.73s/it, lr=1e-5, step_loss=0.0575]
Steps: 0%| | 2150/1000000 [5:27:16<2476:21:02, 8.93s/it, lr=1e-5, step_loss=0.0575][RANK-0]: Step: [2150], local_loss=0.023885764181613922, train_loss=0.02641145884990692, time_cost=3.146364450454712
+
Steps: 0%| | 2150/1000000 [5:27:16<2476:21:02, 8.93s/it, lr=1e-5, step_loss=0.0239]
Steps: 0%| | 2151/1000000 [5:27:20<2079:52:32, 7.50s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [2151], local_loss=0.04896910488605499, train_loss=0.09000059217214584, time_cost=1.4281284809112549
+
Steps: 0%| | 2151/1000000 [5:27:20<2079:52:32, 7.50s/it, lr=1e-5, step_loss=0.049]
Steps: 0%| | 2152/1000000 [5:27:34<2590:59:06, 9.35s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [2152], local_loss=0.020793849602341652, train_loss=0.04671265184879303, time_cost=9.012672662734985
+
Steps: 0%| | 2152/1000000 [5:27:34<2590:59:06, 9.35s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 2153/1000000 [5:27:42<2464:48:21, 8.89s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [2153], local_loss=0.24167081713676453, train_loss=0.05988595634698868, time_cost=3.4034883975982666
+
Steps: 0%| | 2153/1000000 [5:27:42<2464:48:21, 8.89s/it, lr=1e-5, step_loss=0.242]
Steps: 0%| | 2154/1000000 [5:27:54<2774:02:24, 10.01s/it, lr=1e-5, step_loss=0.242][RANK-0]: Step: [2154], local_loss=0.028745630756020546, train_loss=0.04076806455850601, time_cost=3.3172664642333984
+
Steps: 0%| | 2154/1000000 [5:27:54<2774:02:24, 10.01s/it, lr=1e-5, step_loss=0.0287]
Steps: 0%| | 2155/1000000 [5:28:00<2433:11:31, 8.78s/it, lr=1e-5, step_loss=0.0287][RANK-0]: Step: [2155], local_loss=0.04230837896466255, train_loss=0.040086910128593445, time_cost=1.2764406204223633
+
Steps: 0%| | 2155/1000000 [5:28:00<2433:11:31, 8.78s/it, lr=1e-5, step_loss=0.0423]
Steps: 0%| | 2156/1000000 [5:28:07<2290:16:00, 8.26s/it, lr=1e-5, step_loss=0.0423][RANK-0]: Step: [2156], local_loss=0.05548611655831337, train_loss=0.09642498195171356, time_cost=3.2575323581695557
+
Steps: 0%| | 2156/1000000 [5:28:07<2290:16:00, 8.26s/it, lr=1e-5, step_loss=0.0555]
Steps: 0%| | 2157/1000000 [5:28:16<2351:45:44, 8.48s/it, lr=1e-5, step_loss=0.0555][RANK-0]: Step: [2157], local_loss=0.018556203693151474, train_loss=0.1475534290075302, time_cost=2.658752679824829
+
Steps: 0%| | 2157/1000000 [5:28:16<2351:45:44, 8.48s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 2158/1000000 [5:28:23<2215:44:26, 7.99s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [2158], local_loss=0.045313186943531036, train_loss=0.11417540907859802, time_cost=2.670614242553711
+
Steps: 0%| | 2158/1000000 [5:28:23<2215:44:26, 7.99s/it, lr=1e-5, step_loss=0.0453]
Steps: 0%| | 2159/1000000 [5:28:29<2013:33:45, 7.26s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [2159], local_loss=0.03175368160009384, train_loss=0.06005239859223366, time_cost=2.7582955360412598
+
Steps: 0%| | 2159/1000000 [5:28:29<2013:33:45, 7.26s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 2160/1000000 [5:28:42<2477:40:38, 8.94s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [2160], local_loss=0.050648607313632965, train_loss=29.68124771118164, time_cost=1.1950135231018066
+
Steps: 0%| | 2160/1000000 [5:28:42<2477:40:38, 8.94s/it, lr=1e-5, step_loss=0.0506]
Steps: 0%| | 2161/1000000 [5:28:47<2151:36:08, 7.76s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [2161], local_loss=0.028843622654676437, train_loss=0.042172618210315704, time_cost=1.8206236362457275
+
Steps: 0%| | 2161/1000000 [5:28:47<2151:36:08, 7.76s/it, lr=1e-5, step_loss=0.0288]
Steps: 0%| | 2162/1000000 [5:28:59<2576:30:00, 9.30s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [2162], local_loss=285.1949462890625, train_loss=35.7068977355957, time_cost=8.459615230560303
+
Steps: 0%| | 2162/1000000 [5:28:59<2576:30:00, 9.30s/it, lr=1e-5, step_loss=285]
Steps: 0%| | 2163/1000000 [5:29:06<2383:42:33, 8.60s/it, lr=1e-5, step_loss=285][RANK-0]: Step: [2163], local_loss=0.025493185967206955, train_loss=0.03964591026306152, time_cost=2.516268730163574
+
Steps: 0%| | 2163/1000000 [5:29:06<2383:42:33, 8.60s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 2164/1000000 [5:29:18<2591:32:12, 9.35s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [2164], local_loss=0.09090932458639145, train_loss=0.09247899055480957, time_cost=1.2391493320465088
+
Steps: 0%| | 2164/1000000 [5:29:18<2591:32:12, 9.35s/it, lr=1e-5, step_loss=0.0909]
Steps: 0%| | 2165/1000000 [5:29:23<2304:17:13, 8.31s/it, lr=1e-5, step_loss=0.0909][RANK-0]: Step: [2165], local_loss=0.01633240096271038, train_loss=0.02754475176334381, time_cost=1.8373732566833496
+
Steps: 0%| | 2165/1000000 [5:29:23<2304:17:13, 8.31s/it, lr=1e-5, step_loss=0.0163]
Steps: 0%| | 2166/1000000 [5:29:35<2561:31:50, 9.24s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [2166], local_loss=0.07069042325019836, train_loss=27.42141342163086, time_cost=2.7125167846679688
+
Steps: 0%| | 2166/1000000 [5:29:35<2561:31:50, 9.24s/it, lr=1e-5, step_loss=0.0707]
Steps: 0%| | 2167/1000000 [5:29:42<2362:02:13, 8.52s/it, lr=1e-5, step_loss=0.0707][RANK-0]: Step: [2167], local_loss=0.023192880675196648, train_loss=0.05405567213892937, time_cost=1.7961347103118896
+
Steps: 0%| | 2167/1000000 [5:29:42<2362:02:13, 8.52s/it, lr=1e-5, step_loss=0.0232]
Steps: 0%| | 2168/1000000 [5:29:54<2642:25:03, 9.53s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [2168], local_loss=0.06836938858032227, train_loss=0.06485630571842194, time_cost=1.2464585304260254
+
Steps: 0%| | 2168/1000000 [5:29:54<2642:25:03, 9.53s/it, lr=1e-5, step_loss=0.0684]
Steps: 0%| | 2169/1000000 [5:30:03<2655:16:05, 9.58s/it, lr=1e-5, step_loss=0.0684][RANK-0]: Step: [2169], local_loss=0.041648019105196, train_loss=0.055686816573143005, time_cost=4.279229402542114
+
Steps: 0%| | 2169/1000000 [5:30:03<2655:16:05, 9.58s/it, lr=1e-5, step_loss=0.0416]
Steps: 0%| | 2170/1000000 [5:30:11<2507:48:57, 9.05s/it, lr=1e-5, step_loss=0.0416][RANK-0]: Step: [2170], local_loss=0.033679381012916565, train_loss=0.04223421588540077, time_cost=3.7129626274108887
+
Steps: 0%| | 2170/1000000 [5:30:11<2507:48:57, 9.05s/it, lr=1e-5, step_loss=0.0337]
Steps: 0%| | 2171/1000000 [5:30:22<2668:09:16, 9.63s/it, lr=1e-5, step_loss=0.0337][RANK-0]: Step: [2171], local_loss=0.03176121413707733, train_loss=0.06016136705875397, time_cost=1.229541301727295
+
Steps: 0%| | 2171/1000000 [5:30:22<2668:09:16, 9.63s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 2172/1000000 [5:30:33<2776:01:29, 10.02s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [2172], local_loss=0.01359021570533514, train_loss=0.0463939905166626, time_cost=5.304685115814209
+
Steps: 0%| | 2172/1000000 [5:30:33<2776:01:29, 10.02s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 2173/1000000 [5:30:42<2705:03:50, 9.76s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [2173], local_loss=0.025236593559384346, train_loss=0.18310295045375824, time_cost=6.644428014755249
+
Steps: 0%| | 2173/1000000 [5:30:42<2705:03:50, 9.76s/it, lr=1e-5, step_loss=0.0252]
Steps: 0%| | 2174/1000000 [5:30:51<2671:43:32, 9.64s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [2174], local_loss=0.0314163938164711, train_loss=0.15932732820510864, time_cost=1.220418930053711
+
Steps: 0%| | 2174/1000000 [5:30:51<2671:43:32, 9.64s/it, lr=1e-5, step_loss=0.0314]
Steps: 0%| | 2175/1000000 [5:30:59<2529:10:25, 9.12s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [2175], local_loss=0.03222762793302536, train_loss=0.05899446830153465, time_cost=3.8476004600524902
+
Steps: 0%| | 2175/1000000 [5:30:59<2529:10:25, 9.12s/it, lr=1e-5, step_loss=0.0322]
Steps: 0%| | 2176/1000000 [5:31:14<2993:06:35, 10.80s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [2176], local_loss=0.04432004690170288, train_loss=0.0722966119647026, time_cost=1.2041494846343994
+
Steps: 0%| | 2176/1000000 [5:31:14<2993:06:35, 10.80s/it, lr=1e-5, step_loss=0.0443]
Steps: 0%| | 2177/1000000 [5:31:29<3348:16:23, 12.08s/it, lr=1e-5, step_loss=0.0443][RANK-0]: Step: [2177], local_loss=0.016622532159090042, train_loss=0.049698710441589355, time_cost=6.230400323867798
+
Steps: 0%| | 2177/1000000 [5:31:29<3348:16:23, 12.08s/it, lr=1e-5, step_loss=0.0166]
Steps: 0%| | 2178/1000000 [5:31:36<2940:20:16, 10.61s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [2178], local_loss=0.03206893056631088, train_loss=14.22179126739502, time_cost=3.35847544670105
+
Steps: 0%| | 2178/1000000 [5:31:36<2940:20:16, 10.61s/it, lr=1e-5, step_loss=0.0321]
Steps: 0%| | 2179/1000000 [5:31:48<3001:56:06, 10.83s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [2179], local_loss=0.017943410202860832, train_loss=0.08192917704582214, time_cost=3.3337690830230713
+
Steps: 0%| | 2179/1000000 [5:31:48<3001:56:06, 10.83s/it, lr=1e-5, step_loss=0.0179]
Steps: 0%| | 2180/1000000 [5:32:01<3238:04:24, 11.68s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [2180], local_loss=0.054172687232494354, train_loss=0.07497511804103851, time_cost=1.3495442867279053
+
Steps: 0%| | 2180/1000000 [5:32:01<3238:04:24, 11.68s/it, lr=1e-5, step_loss=0.0542]
Steps: 0%| | 2181/1000000 [5:32:08<2855:20:19, 10.30s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [2181], local_loss=0.0712885856628418, train_loss=0.03452648967504501, time_cost=2.8475282192230225
+
Steps: 0%| | 2181/1000000 [5:32:08<2855:20:19, 10.30s/it, lr=1e-5, step_loss=0.0713]
Steps: 0%| | 2182/1000000 [5:32:24<3252:26:31, 11.73s/it, lr=1e-5, step_loss=0.0713][RANK-0]: Step: [2182], local_loss=0.05082398280501366, train_loss=0.057578422129154205, time_cost=4.150524139404297
+
Steps: 0%| | 2182/1000000 [5:32:24<3252:26:31, 11.73s/it, lr=1e-5, step_loss=0.0508]
Steps: 0%| | 2183/1000000 [5:32:39<3526:17:02, 12.72s/it, lr=1e-5, step_loss=0.0508][RANK-0]: Step: [2183], local_loss=0.013603749684989452, train_loss=0.04399506002664566, time_cost=5.281044244766235
+
Steps: 0%| | 2183/1000000 [5:32:39<3526:17:02, 12.72s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 2184/1000000 [5:32:48<3244:48:21, 11.71s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [2184], local_loss=0.0170186348259449, train_loss=0.06508958339691162, time_cost=1.9419653415679932
+
Steps: 0%| | 2184/1000000 [5:32:48<3244:48:21, 11.71s/it, lr=1e-5, step_loss=0.017]
Steps: 0%| | 2185/1000000 [5:32:57<2990:24:40, 10.79s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [2185], local_loss=0.34027987718582153, train_loss=0.1118231862783432, time_cost=2.3615198135375977
+
Steps: 0%| | 2185/1000000 [5:32:57<2990:24:40, 10.79s/it, lr=1e-5, step_loss=0.34]
Steps: 0%| | 2186/1000000 [5:33:07<2925:56:41, 10.56s/it, lr=1e-5, step_loss=0.34][RANK-0]: Step: [2186], local_loss=0.04182169586420059, train_loss=0.17070963978767395, time_cost=1.9752202033996582
+
Steps: 0%| | 2186/1000000 [5:33:07<2925:56:41, 10.56s/it, lr=1e-5, step_loss=0.0418]
Steps: 0%| | 2187/1000000 [5:33:17<2954:36:25, 10.66s/it, lr=1e-5, step_loss=0.0418][RANK-0]: Step: [2187], local_loss=0.02732272632420063, train_loss=0.031248439103364944, time_cost=1.5101451873779297
+
Steps: 0%| | 2187/1000000 [5:33:17<2954:36:25, 10.66s/it, lr=1e-5, step_loss=0.0273]
Steps: 0%| | 2188/1000000 [5:33:26<2783:11:37, 10.04s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [2188], local_loss=0.23137685656547546, train_loss=0.057565271854400635, time_cost=3.614882707595825
+
Steps: 0%| | 2188/1000000 [5:33:26<2783:11:37, 10.04s/it, lr=1e-5, step_loss=0.231]
Steps: 0%| | 2189/1000000 [5:33:34<2575:04:43, 9.29s/it, lr=1e-5, step_loss=0.231][RANK-0]: Step: [2189], local_loss=0.017154155299067497, train_loss=0.04327342286705971, time_cost=1.2775986194610596
+
Steps: 0%| | 2189/1000000 [5:33:34<2575:04:43, 9.29s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 2190/1000000 [5:33:46<2837:47:44, 10.24s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [2190], local_loss=0.05900829657912254, train_loss=0.03687746450304985, time_cost=3.448531150817871
+
Steps: 0%| | 2190/1000000 [5:33:46<2837:47:44, 10.24s/it, lr=1e-5, step_loss=0.059]
Steps: 0%| | 2191/1000000 [5:33:52<2442:48:15, 8.81s/it, lr=1e-5, step_loss=0.059][RANK-0]: Step: [2191], local_loss=0.03751805052161217, train_loss=0.04618636891245842, time_cost=1.3173599243164062
+
Steps: 0%| | 2191/1000000 [5:33:52<2442:48:15, 8.81s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 2192/1000000 [5:33:56<2061:24:42, 7.44s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [2192], local_loss=0.01572878472507, train_loss=0.04572755843400955, time_cost=1.2748637199401855
+
Steps: 0%| | 2192/1000000 [5:33:56<2061:24:42, 7.44s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 2193/1000000 [5:34:05<2220:37:28, 8.01s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [2193], local_loss=0.4803541302680969, train_loss=0.08951576054096222, time_cost=1.2228248119354248
+
Steps: 0%| | 2193/1000000 [5:34:05<2220:37:28, 8.01s/it, lr=1e-5, step_loss=0.48]
Steps: 0%| | 2194/1000000 [5:34:20<2809:42:10, 10.14s/it, lr=1e-5, step_loss=0.48][RANK-0]: Step: [2194], local_loss=0.06882733851671219, train_loss=0.051391877233982086, time_cost=6.491326332092285
+
Steps: 0%| | 2194/1000000 [5:34:20<2809:42:10, 10.14s/it, lr=1e-5, step_loss=0.0688]
Steps: 0%| | 2195/1000000 [5:34:32<2966:12:06, 10.70s/it, lr=1e-5, step_loss=0.0688][RANK-0]: Step: [2195], local_loss=0.026644330471754074, train_loss=0.05682457983493805, time_cost=9.863081693649292
+
Steps: 0%| | 2195/1000000 [5:34:32<2966:12:06, 10.70s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 2196/1000000 [5:34:41<2827:48:37, 10.20s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [2196], local_loss=0.1434916853904724, train_loss=0.08476810157299042, time_cost=4.278789281845093
+
Steps: 0%| | 2196/1000000 [5:34:41<2827:48:37, 10.20s/it, lr=1e-5, step_loss=0.143]
Steps: 0%| | 2197/1000000 [5:34:46<2412:46:18, 8.71s/it, lr=1e-5, step_loss=0.143][RANK-0]: Step: [2197], local_loss=0.05981667339801788, train_loss=0.03913678973913193, time_cost=2.193005323410034
[RANK-0] per-step log, steps 2197–2420 of 1000000 (lr=1e-5; elapsed 5:34:46 to 6:08:45; 5.46–13.41 s/it): local_loss is typically below ~0.3, with isolated spikes at step 2228 (0.974), step 2249 (49.96), step 2305 (0.395), step 2314 (138.00), step 2330 (0.368), step 2350 (1.009), step 2390 (0.511), and step 2403 (1.000); train_loss is typically below ~0.3, with spikes at step 2243 (32.87), step 2249 (6.30), step 2295 (7.25), step 2314 (17.30), and step 2356 (17.03); time_cost per step runs from about 1.19 s to 9.98 s.
Between steps 2244 and 2245, the caption pipeline emitted:
/home/image_data/hxy/Open-Sora-Plan/opensora/utils/utils.py:369: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
  caption = BeautifulSoup(caption, features='html.parser').text
Steps: 0%| | 2420/1000000 [6:08:45<2471:52:17, 8.92s/it, lr=1e-5, step_loss=0.00916][RANK-0]: Step: [2420], local_loss=0.06344526261091232, train_loss=0.0647210031747818, time_cost=6.986248970031738
+
Steps: 0%| | 2420/1000000 [6:08:45<2471:52:17, 8.92s/it, lr=1e-5, step_loss=0.0634]
Steps: 0%| | 2421/1000000 [6:08:50<2148:10:55, 7.75s/it, lr=1e-5, step_loss=0.0634][RANK-0]: Step: [2421], local_loss=0.14760789275169373, train_loss=13.58997631072998, time_cost=2.0728626251220703
+
Steps: 0%| | 2421/1000000 [6:08:50<2148:10:55, 7.75s/it, lr=1e-5, step_loss=0.148]
Steps: 0%| | 2422/1000000 [6:08:56<1994:59:10, 7.20s/it, lr=1e-5, step_loss=0.148][RANK-0]: Step: [2422], local_loss=0.019330894574522972, train_loss=0.04833295941352844, time_cost=1.2280895709991455
+
Steps: 0%| | 2422/1000000 [6:08:56<1994:59:10, 7.20s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 2423/1000000 [6:09:01<1813:18:47, 6.54s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [2423], local_loss=0.0335807166993618, train_loss=0.04811624437570572, time_cost=2.0204622745513916
+
Steps: 0%| | 2423/1000000 [6:09:01<1813:18:47, 6.54s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 2424/1000000 [6:09:13<2269:11:35, 8.19s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [2424], local_loss=0.02657019905745983, train_loss=0.2516564726829529, time_cost=1.9310555458068848
+
Steps: 0%| | 2424/1000000 [6:09:13<2269:11:35, 8.19s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 2425/1000000 [6:09:24<2518:17:34, 9.09s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [2425], local_loss=0.026683835312724113, train_loss=1.3262723684310913, time_cost=1.216191053390503
+
Steps: 0%| | 2425/1000000 [6:09:24<2518:17:34, 9.09s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 2426/1000000 [6:09:38<2952:09:11, 10.65s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [2426], local_loss=0.022934051230549812, train_loss=0.1001812294125557, time_cost=1.2469229698181152
+
Steps: 0%| | 2426/1000000 [6:09:38<2952:09:11, 10.65s/it, lr=1e-5, step_loss=0.0229]
Steps: 0%| | 2427/1000000 [6:09:43<2458:21:34, 8.87s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [2427], local_loss=0.06126436963677406, train_loss=0.034391842782497406, time_cost=2.341306209564209
+
Steps: 0%| | 2427/1000000 [6:09:43<2458:21:34, 8.87s/it, lr=1e-5, step_loss=0.0613]
Steps: 0%| | 2428/1000000 [6:09:49<2200:08:27, 7.94s/it, lr=1e-5, step_loss=0.0613][RANK-0]: Step: [2428], local_loss=0.0654837042093277, train_loss=0.04329846054315567, time_cost=1.9402191638946533
+
Steps: 0%| | 2428/1000000 [6:09:49<2200:08:27, 7.94s/it, lr=1e-5, step_loss=0.0655]
Steps: 0%| | 2429/1000000 [6:10:00<2493:21:35, 9.00s/it, lr=1e-5, step_loss=0.0655][RANK-0]: Step: [2429], local_loss=0.030151676386594772, train_loss=0.04268321394920349, time_cost=3.789033889770508
+
Steps: 0%| | 2429/1000000 [6:10:00<2493:21:35, 9.00s/it, lr=1e-5, step_loss=0.0302]
Steps: 0%| | 2430/1000000 [6:10:05<2169:56:49, 7.83s/it, lr=1e-5, step_loss=0.0302][RANK-0]: Step: [2430], local_loss=0.04767744243144989, train_loss=0.05448196828365326, time_cost=2.269134521484375
+
Steps: 0%| | 2430/1000000 [6:10:05<2169:56:49, 7.83s/it, lr=1e-5, step_loss=0.0477]
Steps: 0%| | 2431/1000000 [6:10:16<2381:45:33, 8.60s/it, lr=1e-5, step_loss=0.0477][RANK-0]: Step: [2431], local_loss=0.021246343851089478, train_loss=0.17240943014621735, time_cost=2.4667129516601562
+
Steps: 0%| | 2431/1000000 [6:10:16<2381:45:33, 8.60s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 2432/1000000 [6:10:23<2255:51:22, 8.14s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [2432], local_loss=0.01731293462216854, train_loss=0.0378590002655983, time_cost=3.266462802886963
+
Steps: 0%| | 2432/1000000 [6:10:23<2255:51:22, 8.14s/it, lr=1e-5, step_loss=0.0173]
Steps: 0%| | 2433/1000000 [6:10:27<1962:15:52, 7.08s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [2433], local_loss=0.015328031033277512, train_loss=0.0348634198307991, time_cost=1.7993345260620117
+
Steps: 0%| | 2433/1000000 [6:10:27<1962:15:52, 7.08s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 2434/1000000 [6:10:36<2113:58:19, 7.63s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [2434], local_loss=0.13696636259555817, train_loss=0.051997654139995575, time_cost=3.0635123252868652
+
Steps: 0%| | 2434/1000000 [6:10:36<2113:58:19, 7.63s/it, lr=1e-5, step_loss=0.137]
Steps: 0%| | 2435/1000000 [6:10:49<2518:09:22, 9.09s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [2435], local_loss=0.15712866187095642, train_loss=0.05353355035185814, time_cost=3.4986307621002197
+
Steps: 0%| | 2435/1000000 [6:10:49<2518:09:22, 9.09s/it, lr=1e-5, step_loss=0.157]
Steps: 0%| | 2436/1000000 [6:11:02<2858:46:50, 10.32s/it, lr=1e-5, step_loss=0.157][RANK-0]: Step: [2436], local_loss=0.055903609842061996, train_loss=0.032838571816682816, time_cost=3.6064021587371826
+
Steps: 0%| | 2436/1000000 [6:11:02<2858:46:50, 10.32s/it, lr=1e-5, step_loss=0.0559]
Steps: 0%| | 2437/1000000 [6:11:13<2901:22:48, 10.47s/it, lr=1e-5, step_loss=0.0559][RANK-0]: Step: [2437], local_loss=0.013729212805628777, train_loss=0.05187553912401199, time_cost=1.6664113998413086
+
Steps: 0%| | 2437/1000000 [6:11:13<2901:22:48, 10.47s/it, lr=1e-5, step_loss=0.0137]
Steps: 0%| | 2438/1000000 [6:11:28<3280:41:44, 11.84s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [2438], local_loss=0.027110271155834198, train_loss=0.06144466996192932, time_cost=6.536667108535767
+
Steps: 0%| | 2438/1000000 [6:11:28<3280:41:44, 11.84s/it, lr=1e-5, step_loss=0.0271]
Steps: 0%| | 2439/1000000 [6:11:41<3346:55:14, 12.08s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [2439], local_loss=0.024194911122322083, train_loss=0.06105133146047592, time_cost=4.46094536781311
+
Steps: 0%| | 2439/1000000 [6:11:41<3346:55:14, 12.08s/it, lr=1e-5, step_loss=0.0242]
Steps: 0%| | 2440/1000000 [6:11:52<3296:44:07, 11.90s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [2440], local_loss=0.0665813684463501, train_loss=0.04909240081906319, time_cost=1.2032263278961182
+
Steps: 0%| | 2440/1000000 [6:11:52<3296:44:07, 11.90s/it, lr=1e-5, step_loss=0.0666]
Steps: 0%| | 2441/1000000 [6:12:01<3066:19:19, 11.07s/it, lr=1e-5, step_loss=0.0666][RANK-0]: Step: [2441], local_loss=0.6585741639137268, train_loss=0.23119421303272247, time_cost=6.720273971557617
+
Steps: 0%| | 2441/1000000 [6:12:01<3066:19:19, 11.07s/it, lr=1e-5, step_loss=0.659]
Steps: 0%| | 2442/1000000 [6:12:16<3339:22:27, 12.05s/it, lr=1e-5, step_loss=0.659][RANK-0]: Step: [2442], local_loss=0.027226408943533897, train_loss=0.1672278195619583, time_cost=8.432773113250732
+
Steps: 0%| | 2442/1000000 [6:12:16<3339:22:27, 12.05s/it, lr=1e-5, step_loss=0.0272]
Steps: 0%| | 2443/1000000 [6:12:30<3542:58:27, 12.79s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [2443], local_loss=0.023695487529039383, train_loss=0.032925233244895935, time_cost=1.2171275615692139
+
Steps: 0%| | 2443/1000000 [6:12:30<3542:58:27, 12.79s/it, lr=1e-5, step_loss=0.0237]
Steps: 0%| | 2444/1000000 [6:12:34<2846:55:29, 10.27s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [2444], local_loss=0.02337270975112915, train_loss=0.197627454996109, time_cost=1.3907208442687988
+
Steps: 0%| | 2444/1000000 [6:12:34<2846:55:29, 10.27s/it, lr=1e-5, step_loss=0.0234]
Steps: 0%| | 2445/1000000 [6:12:39<2359:32:33, 8.52s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [2445], local_loss=0.01783515140414238, train_loss=0.10653738677501678, time_cost=1.5323173999786377
+
Steps: 0%| | 2445/1000000 [6:12:39<2359:32:33, 8.52s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 2446/1000000 [6:12:46<2261:58:29, 8.16s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [2446], local_loss=0.014833728782832623, train_loss=0.04414535313844681, time_cost=2.746929168701172
+
Steps: 0%| | 2446/1000000 [6:12:46<2261:58:29, 8.16s/it, lr=1e-5, step_loss=0.0148]
Steps: 0%| | 2447/1000000 [6:12:53<2136:32:27, 7.71s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [2447], local_loss=0.024137623608112335, train_loss=0.049779608845710754, time_cost=2.825951337814331
+
Steps: 0%| | 2447/1000000 [6:12:53<2136:32:27, 7.71s/it, lr=1e-5, step_loss=0.0241]
Steps: 0%| | 2448/1000000 [6:12:57<1861:59:57, 6.72s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [2448], local_loss=0.04722342640161514, train_loss=0.05093073844909668, time_cost=1.9941511154174805
+
Steps: 0%| | 2448/1000000 [6:12:57<1861:59:57, 6.72s/it, lr=1e-5, step_loss=0.0472]
Steps: 0%| | 2449/1000000 [6:13:09<2250:07:35, 8.12s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [2449], local_loss=0.12482751160860062, train_loss=0.048243530094623566, time_cost=5.263415336608887
+
Steps: 0%| | 2449/1000000 [6:13:09<2250:07:35, 8.12s/it, lr=1e-5, step_loss=0.125]
Steps: 0%| | 2450/1000000 [6:13:23<2773:33:57, 10.01s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [2450], local_loss=0.030170978978276253, train_loss=18.564552307128906, time_cost=6.528918027877808
+
Steps: 0%| | 2450/1000000 [6:13:23<2773:33:57, 10.01s/it, lr=1e-5, step_loss=0.0302]
Steps: 0%| | 2451/1000000 [6:13:29<2421:35:35, 8.74s/it, lr=1e-5, step_loss=0.0302][RANK-0]: Step: [2451], local_loss=0.1326240599155426, train_loss=0.04359903931617737, time_cost=3.9821054935455322
+
Steps: 0%| | 2451/1000000 [6:13:29<2421:35:35, 8.74s/it, lr=1e-5, step_loss=0.133]
Steps: 0%| | 2452/1000000 [6:13:38<2455:50:36, 8.86s/it, lr=1e-5, step_loss=0.133][RANK-0]: Step: [2452], local_loss=0.05067196115851402, train_loss=0.024396328255534172, time_cost=4.026288270950317
+
Steps: 0%| | 2452/1000000 [6:13:38<2455:50:36, 8.86s/it, lr=1e-5, step_loss=0.0507]
Steps: 0%| | 2453/1000000 [6:13:50<2729:55:06, 9.85s/it, lr=1e-5, step_loss=0.0507][RANK-0]: Step: [2453], local_loss=0.02579534612596035, train_loss=0.09828519821166992, time_cost=1.227494478225708
+
Steps: 0%| | 2453/1000000 [6:13:50<2729:55:06, 9.85s/it, lr=1e-5, step_loss=0.0258]
Steps: 0%| | 2454/1000000 [6:13:57<2489:28:16, 8.98s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [2454], local_loss=0.034962452948093414, train_loss=0.04423276335000992, time_cost=2.662424087524414
+
Steps: 0%| | 2454/1000000 [6:13:57<2489:28:16, 8.98s/it, lr=1e-5, step_loss=0.035]
Steps: 0%| | 2455/1000000 [6:14:03<2250:15:31, 8.12s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [2455], local_loss=0.16947011649608612, train_loss=0.12758000195026398, time_cost=1.6078202724456787
+
Steps: 0%| | 2455/1000000 [6:14:03<2250:15:31, 8.12s/it, lr=1e-5, step_loss=0.169]
Steps: 0%| | 2456/1000000 [6:14:12<2328:56:58, 8.40s/it, lr=1e-5, step_loss=0.169][RANK-0]: Step: [2456], local_loss=0.01736479438841343, train_loss=0.06663429737091064, time_cost=6.5534162521362305
+
Steps: 0%| | 2456/1000000 [6:14:12<2328:56:58, 8.40s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 2457/1000000 [6:14:17<1995:11:10, 7.20s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [2457], local_loss=0.04876624420285225, train_loss=0.03065337799489498, time_cost=1.348069429397583
+
Steps: 0%| | 2457/1000000 [6:14:17<1995:11:10, 7.20s/it, lr=1e-5, step_loss=0.0488]
Steps: 0%| | 2458/1000000 [6:14:24<2025:20:10, 7.31s/it, lr=1e-5, step_loss=0.0488][RANK-0]: Step: [2458], local_loss=0.11597079783678055, train_loss=0.04935736209154129, time_cost=3.950148344039917
+
Steps: 0%| | 2458/1000000 [6:14:24<2025:20:10, 7.31s/it, lr=1e-5, step_loss=0.116]
Steps: 0%| | 2459/1000000 [6:14:30<1897:32:07, 6.85s/it, lr=1e-5, step_loss=0.116][RANK-0]: Step: [2459], local_loss=0.026109660044312477, train_loss=0.05736454203724861, time_cost=1.2033374309539795
+
Steps: 0%| | 2459/1000000 [6:14:30<1897:32:07, 6.85s/it, lr=1e-5, step_loss=0.0261]
Steps: 0%| | 2460/1000000 [6:14:41<2215:39:35, 8.00s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [2460], local_loss=0.027580764144659042, train_loss=0.03731114789843559, time_cost=1.3864803314208984
+
Steps: 0%| | 2460/1000000 [6:14:41<2215:39:35, 8.00s/it, lr=1e-5, step_loss=0.0276]
Steps: 0%| | 2461/1000000 [6:14:52<2522:01:41, 9.10s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [2461], local_loss=0.06283047795295715, train_loss=0.03384135663509369, time_cost=2.646280288696289
+
Steps: 0%| | 2461/1000000 [6:14:52<2522:01:41, 9.10s/it, lr=1e-5, step_loss=0.0628]
Steps: 0%| | 2462/1000000 [6:15:05<2832:17:14, 10.22s/it, lr=1e-5, step_loss=0.0628][RANK-0]: Step: [2462], local_loss=0.015648912638425827, train_loss=0.027182841673493385, time_cost=3.047938108444214
+
Steps: 0%| | 2462/1000000 [6:15:05<2832:17:14, 10.22s/it, lr=1e-5, step_loss=0.0156]
Steps: 0%| | 2463/1000000 [6:15:15<2833:30:52, 10.23s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [2463], local_loss=0.02948787435889244, train_loss=0.04272923618555069, time_cost=2.1574623584747314
+
Steps: 0%| | 2463/1000000 [6:15:15<2833:30:52, 10.23s/it, lr=1e-5, step_loss=0.0295]
Steps: 0%| | 2464/1000000 [6:15:21<2431:16:13, 8.77s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [2464], local_loss=0.1339411735534668, train_loss=0.07035557180643082, time_cost=2.3443334102630615
+
Steps: 0%| | 2464/1000000 [6:15:21<2431:16:13, 8.77s/it, lr=1e-5, step_loss=0.134]
Steps: 0%| | 2465/1000000 [6:15:34<2788:26:20, 10.06s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [2465], local_loss=0.022687412798404694, train_loss=0.05129564553499222, time_cost=2.8360400199890137
+
Steps: 0%| | 2465/1000000 [6:15:34<2788:26:20, 10.06s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 2466/1000000 [6:15:41<2536:45:28, 9.15s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [2466], local_loss=0.022574059665203094, train_loss=0.09850656241178513, time_cost=3.1484336853027344
+
Steps: 0%| | 2466/1000000 [6:15:41<2536:45:28, 9.15s/it, lr=1e-5, step_loss=0.0226]
Steps: 0%| | 2467/1000000 [6:15:45<2133:45:02, 7.70s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [2467], local_loss=0.11166894435882568, train_loss=0.03957154229283333, time_cost=1.2711775302886963
+
Steps: 0%| | 2467/1000000 [6:15:45<2133:45:02, 7.70s/it, lr=1e-5, step_loss=0.112]
Steps: 0%| | 2468/1000000 [6:15:58<2544:23:32, 9.18s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [2468], local_loss=0.09098190814256668, train_loss=0.04791722446680069, time_cost=1.2116389274597168
+
Steps: 0%| | 2468/1000000 [6:15:58<2544:23:32, 9.18s/it, lr=1e-5, step_loss=0.091]
Steps: 0%| | 2469/1000000 [6:16:07<2566:57:26, 9.26s/it, lr=1e-5, step_loss=0.091][RANK-0]: Step: [2469], local_loss=0.11281747370958328, train_loss=0.07830285280942917, time_cost=1.5084476470947266
+
Steps: 0%| | 2469/1000000 [6:16:07<2566:57:26, 9.26s/it, lr=1e-5, step_loss=0.113]
Steps: 0%| | 2470/1000000 [6:16:23<3072:53:36, 11.09s/it, lr=1e-5, step_loss=0.113][RANK-0]: Step: [2470], local_loss=0.032307665795087814, train_loss=0.04299195110797882, time_cost=1.2256617546081543
+
Steps: 0%| | 2470/1000000 [6:16:23<3072:53:36, 11.09s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 2471/1000000 [6:16:35<3187:51:50, 11.50s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [2471], local_loss=0.039742954075336456, train_loss=0.18021929264068604, time_cost=3.105933427810669
+
Steps: 0%| | 2471/1000000 [6:16:35<3187:51:50, 11.50s/it, lr=1e-5, step_loss=0.0397]
Steps: 0%| | 2472/1000000 [6:16:42<2812:20:59, 10.15s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [2472], local_loss=0.19725900888442993, train_loss=0.07928046584129333, time_cost=1.9700708389282227
+
Steps: 0%| | 2472/1000000 [6:16:42<2812:20:59, 10.15s/it, lr=1e-5, step_loss=0.197]
Steps: 0%| | 2473/1000000 [6:16:50<2589:00:52, 9.34s/it, lr=1e-5, step_loss=0.197][RANK-0]: Step: [2473], local_loss=0.03643962740898132, train_loss=0.08389943838119507, time_cost=1.880164384841919
+
Steps: 0%| | 2473/1000000 [6:16:50<2589:00:52, 9.34s/it, lr=1e-5, step_loss=0.0364]
Steps: 0%| | 2474/1000000 [6:17:03<2917:28:52, 10.53s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [2474], local_loss=0.15845376253128052, train_loss=0.04635552689433098, time_cost=3.491502046585083
+
Steps: 0%| | 2474/1000000 [6:17:03<2917:28:52, 10.53s/it, lr=1e-5, step_loss=0.158]
Steps: 0%| | 2475/1000000 [6:17:15<3078:29:07, 11.11s/it, lr=1e-5, step_loss=0.158][RANK-0]: Step: [2475], local_loss=0.021503925323486328, train_loss=0.03241216018795967, time_cost=3.7538459300994873
+
Steps: 0%| | 2475/1000000 [6:17:15<3078:29:07, 11.11s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 2476/1000000 [6:17:21<2632:20:58, 9.50s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [2476], local_loss=0.022419484332203865, train_loss=0.034932807087898254, time_cost=1.3440518379211426
+
Steps: 0%| | 2476/1000000 [6:17:21<2632:20:58, 9.50s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 2477/1000000 [6:17:30<2608:01:33, 9.41s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [2477], local_loss=0.02771667018532753, train_loss=0.10601545870304108, time_cost=3.1719393730163574
+
Steps: 0%| | 2477/1000000 [6:17:30<2608:01:33, 9.41s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 2478/1000000 [6:17:43<2846:03:31, 10.27s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [2478], local_loss=0.062492839992046356, train_loss=0.049988679587841034, time_cost=1.3729126453399658
+
Steps: 0%| | 2478/1000000 [6:17:43<2846:03:31, 10.27s/it, lr=1e-5, step_loss=0.0625]
Steps: 0%| | 2479/1000000 [6:17:54<2975:02:51, 10.74s/it, lr=1e-5, step_loss=0.0625][RANK-0]: Step: [2479], local_loss=0.022058086469769478, train_loss=0.030773412436246872, time_cost=1.3306102752685547
+
Steps: 0%| | 2479/1000000 [6:17:54<2975:02:51, 10.74s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 2480/1000000 [6:18:07<3149:49:40, 11.37s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [2480], local_loss=0.048563048243522644, train_loss=0.07626726478338242, time_cost=5.50418496131897
+
Steps: 0%| | 2480/1000000 [6:18:07<3149:49:40, 11.37s/it, lr=1e-5, step_loss=0.0486]
Steps: 0%| | 2481/1000000 [6:18:12<2616:46:49, 9.44s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [2481], local_loss=0.018359022215008736, train_loss=0.04709120839834213, time_cost=2.269896984100342
+
Steps: 0%| | 2481/1000000 [6:18:12<2616:46:49, 9.44s/it, lr=1e-5, step_loss=0.0184]
Steps: 0%| | 2482/1000000 [6:18:18<2287:13:54, 8.25s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [2482], local_loss=0.03333241865038872, train_loss=0.044497452676296234, time_cost=1.4897205829620361
+
Steps: 0%| | 2482/1000000 [6:18:18<2287:13:54, 8.25s/it, lr=1e-5, step_loss=0.0333]
Steps: 0%| | 2483/1000000 [6:18:25<2205:43:26, 7.96s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [2483], local_loss=0.02647869475185871, train_loss=0.03894347324967384, time_cost=3.492126226425171
+
Steps: 0%| | 2483/1000000 [6:18:25<2205:43:26, 7.96s/it, lr=1e-5, step_loss=0.0265]
Steps: 0%| | 2484/1000000 [6:18:29<1899:55:16, 6.86s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [2484], local_loss=0.022680649533867836, train_loss=0.03420256823301315, time_cost=3.25736927986145
+
Steps: 0%| | 2484/1000000 [6:18:29<1899:55:16, 6.86s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 2485/1000000 [6:18:36<1917:33:24, 6.92s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [2485], local_loss=0.07024912536144257, train_loss=0.10645543038845062, time_cost=2.798628568649292
+
Steps: 0%| | 2485/1000000 [6:18:36<1917:33:24, 6.92s/it, lr=1e-5, step_loss=0.0702]
Steps: 0%| | 2486/1000000 [6:18:51<2564:48:48, 9.26s/it, lr=1e-5, step_loss=0.0702][RANK-0]: Step: [2486], local_loss=0.013045803643763065, train_loss=0.06092480942606926, time_cost=7.060001611709595
+
Steps: 0%| | 2486/1000000 [6:18:51<2564:48:48, 9.26s/it, lr=1e-5, step_loss=0.013]
Steps: 0%| | 2487/1000000 [6:19:01<2644:56:41, 9.55s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [2487], local_loss=0.016319639980793, train_loss=0.031397897750139236, time_cost=1.196101427078247
+
Steps: 0%| | 2487/1000000 [6:19:01<2644:56:41, 9.55s/it, lr=1e-5, step_loss=0.0163]
Steps: 0%| | 2488/1000000 [6:19:15<3011:25:05, 10.87s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [2488], local_loss=0.11162509024143219, train_loss=1.043797492980957, time_cost=1.2230970859527588
+
Steps: 0%| | 2488/1000000 [6:19:15<3011:25:05, 10.87s/it, lr=1e-5, step_loss=0.112]
Steps: 0%| | 2489/1000000 [6:19:24<2815:33:18, 10.16s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [2489], local_loss=0.013637903146445751, train_loss=0.07842075079679489, time_cost=1.2401554584503174
+
Steps: 0%| | 2489/1000000 [6:19:24<2815:33:18, 10.16s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 2490/1000000 [6:19:38<3141:46:07, 11.34s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [2490], local_loss=0.058294687420129776, train_loss=0.06501679867506027, time_cost=3.6150972843170166
+
Steps: 0%| | 2490/1000000 [6:19:38<3141:46:07, 11.34s/it, lr=1e-5, step_loss=0.0583]
Steps: 0%| | 2491/1000000 [6:19:44<2682:13:46, 9.68s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [2491], local_loss=0.02209334634244442, train_loss=0.029456820338964462, time_cost=1.5761089324951172
+
Steps: 0%| | 2491/1000000 [6:19:44<2682:13:46, 9.68s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 2492/1000000 [6:19:51<2482:02:22, 8.96s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [2492], local_loss=0.017816755920648575, train_loss=0.0408211350440979, time_cost=1.5335209369659424
+
Steps: 0%| | 2492/1000000 [6:19:51<2482:02:22, 8.96s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 2493/1000000 [6:19:58<2342:27:06, 8.45s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [2493], local_loss=0.08766555786132812, train_loss=0.06678949296474457, time_cost=1.210017204284668
+
Steps: 0%| | 2493/1000000 [6:19:58<2342:27:06, 8.45s/it, lr=1e-5, step_loss=0.0877]
Steps: 0%| | 2494/1000000 [6:20:09<2526:03:42, 9.12s/it, lr=1e-5, step_loss=0.0877][RANK-0]: Step: [2494], local_loss=0.05826980620622635, train_loss=0.043332282453775406, time_cost=1.5709939002990723
+
Steps: 0%| | 2494/1000000 [6:20:09<2526:03:42, 9.12s/it, lr=1e-5, step_loss=0.0583]
Steps: 0%| | 2495/1000000 [6:20:18<2572:33:33, 9.28s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [2495], local_loss=0.021840156987309456, train_loss=0.0427471324801445, time_cost=3.5339207649230957
+
Steps: 0%| | 2495/1000000 [6:20:18<2572:33:33, 9.28s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 2496/1000000 [6:20:24<2274:04:28, 8.21s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [2496], local_loss=0.028442196547985077, train_loss=0.058853428810834885, time_cost=4.999849081039429
+
Steps: 0%| | 2496/1000000 [6:20:24<2274:04:28, 8.21s/it, lr=1e-5, step_loss=0.0284]
Steps: 0%| | 2497/1000000 [6:20:29<1953:37:22, 7.05s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [2497], local_loss=0.2827923595905304, train_loss=0.08422434329986572, time_cost=1.200108289718628
+
Steps: 0%| | 2497/1000000 [6:20:29<1953:37:22, 7.05s/it, lr=1e-5, step_loss=0.283]
Steps: 0%| | 2498/1000000 [6:20:33<1776:27:17, 6.41s/it, lr=1e-5, step_loss=0.283][RANK-0]: Step: [2498], local_loss=0.07383554428815842, train_loss=0.09491598606109619, time_cost=1.2198779582977295
+
Steps: 0%| | 2498/1000000 [6:20:33<1776:27:17, 6.41s/it, lr=1e-5, step_loss=0.0738]
Steps: 0%| | 2499/1000000 [6:20:38<1635:30:13, 5.90s/it, lr=1e-5, step_loss=0.0738][RANK-0]: Step: [2499], local_loss=0.08613991737365723, train_loss=0.06219656392931938, time_cost=1.7177824974060059
+
Steps: 0%| | 2499/1000000 [6:20:38<1635:30:13, 5.90s/it, lr=1e-5, step_loss=0.0861]
Steps: 0%| | 2500/1000000 [6:20:53<2419:34:56, 8.73s/it, lr=1e-5, step_loss=0.0861][RANK-0]: Step: [2500], local_loss=0.09395037591457367, train_loss=0.050501804798841476, time_cost=6.408957004547119
+
Steps: 0%| | 2500/1000000 [6:20:53<2419:34:56, 8.73s/it, lr=1e-5, step_loss=0.094]
Steps: 0%| | 2501/1000000 [6:21:06<2731:01:51, 9.86s/it, lr=1e-5, step_loss=0.094][RANK-0]: Step: [2501], local_loss=0.03121742606163025, train_loss=0.05842714011669159, time_cost=2.0851328372955322
+
Steps: 0%| | 2501/1000000 [6:21:06<2731:01:51, 9.86s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 2502/1000000 [6:21:12<2399:18:43, 8.66s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [2502], local_loss=0.03294013440608978, train_loss=0.04812997579574585, time_cost=1.376600980758667
+
Steps: 0%| | 2502/1000000 [6:21:12<2399:18:43, 8.66s/it, lr=1e-5, step_loss=0.0329]
Steps: 0%| | 2503/1000000 [6:21:27<2926:02:16, 10.56s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [2503], local_loss=0.03627471998333931, train_loss=0.03129155933856964, time_cost=1.219362497329712
+
Steps: 0%| | 2503/1000000 [6:21:27<2926:02:16, 10.56s/it, lr=1e-5, step_loss=0.0363]
Steps: 0%| | 2504/1000000 [6:21:39<3088:31:03, 11.15s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [2504], local_loss=0.021477995440363884, train_loss=0.05331460386514664, time_cost=7.819460868835449
+
Steps: 0%| | 2504/1000000 [6:21:39<3088:31:03, 11.15s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 2505/1000000 [6:21:48<2885:59:46, 10.42s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [2505], local_loss=0.15915264189243317, train_loss=0.10427013039588928, time_cost=2.3442893028259277
+
Steps: 0%| | 2505/1000000 [6:21:48<2885:59:46, 10.42s/it, lr=1e-5, step_loss=0.159]
Steps: 0%| | 2506/1000000 [6:21:57<2783:22:54, 10.05s/it, lr=1e-5, step_loss=0.159][RANK-0]: Step: [2506], local_loss=0.05828677490353584, train_loss=0.050352439284324646, time_cost=3.350900173187256
+
Steps: 0%| | 2506/1000000 [6:21:57<2783:22:54, 10.05s/it, lr=1e-5, step_loss=0.0583]
Steps: 0%| | 2507/1000000 [6:22:02<2369:52:22, 8.55s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [2507], local_loss=0.1339026242494583, train_loss=0.051798850297927856, time_cost=2.0381603240966797
+
Steps: 0%| | 2507/1000000 [6:22:02<2369:52:22, 8.55s/it, lr=1e-5, step_loss=0.134]
Steps: 0%| | 2508/1000000 [6:22:07<2023:37:23, 7.30s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [2508], local_loss=0.047263480722904205, train_loss=0.04643431305885315, time_cost=1.4105839729309082
+
Steps: 0%| | 2508/1000000 [6:22:07<2023:37:23, 7.30s/it, lr=1e-5, step_loss=0.0473]
Steps: 0%| | 2509/1000000 [6:22:12<1832:08:29, 6.61s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [2509], local_loss=0.01856105960905552, train_loss=0.17541179060935974, time_cost=2.378178834915161
+
Steps: 0%| | 2509/1000000 [6:22:12<1832:08:29, 6.61s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 2510/1000000 [6:22:16<1639:00:48, 5.92s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [2510], local_loss=0.05542680248618126, train_loss=0.05790087580680847, time_cost=1.5787065029144287
+
Steps: 0%| | 2510/1000000 [6:22:16<1639:00:48, 5.92s/it, lr=1e-5, step_loss=0.0554]
Steps: 0%| | 2511/1000000 [6:22:23<1735:03:50, 6.26s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [2511], local_loss=0.02152911387383938, train_loss=0.03027445636689663, time_cost=2.499127149581909
+
Steps: 0%| | 2511/1000000 [6:22:23<1735:03:50, 6.26s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 2512/1000000 [6:22:29<1733:06:40, 6.25s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [2512], local_loss=0.2552748918533325, train_loss=0.06998717784881592, time_cost=1.2481367588043213
+
Steps: 0%| | 2512/1000000 [6:22:29<1733:06:40, 6.25s/it, lr=1e-5, step_loss=0.255]
Steps: 0%| | 2513/1000000 [6:22:36<1802:36:56, 6.51s/it, lr=1e-5, step_loss=0.255][RANK-0]: Step: [2513], local_loss=0.05762786045670509, train_loss=0.04182145372033119, time_cost=2.3358638286590576
+
Steps: 0%| | 2513/1000000 [6:22:36<1802:36:56, 6.51s/it, lr=1e-5, step_loss=0.0576]
Steps: 0%| | 2514/1000000 [6:22:45<1959:09:06, 7.07s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [2514], local_loss=0.04060785472393036, train_loss=0.031336672604084015, time_cost=1.2316577434539795
+
Steps: 0%| | 2514/1000000 [6:22:45<1959:09:06, 7.07s/it, lr=1e-5, step_loss=0.0406]
Steps: 0%| | 2515/1000000 [6:22:55<2253:19:50, 8.13s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [2515], local_loss=0.03756266459822655, train_loss=0.21131177246570587, time_cost=1.3339967727661133
+
Steps: 0%| | 2515/1000000 [6:22:55<2253:19:50, 8.13s/it, lr=1e-5, step_loss=0.0376]
Steps: 0%| | 2516/1000000 [6:23:01<2064:57:53, 7.45s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [2516], local_loss=0.01853843964636326, train_loss=0.04365234076976776, time_cost=1.3098127841949463
+
Steps: 0%| | 2516/1000000 [6:23:01<2064:57:53, 7.45s/it, lr=1e-5, step_loss=0.0185]
Steps: 0%| | 2517/1000000 [6:23:06<1858:43:42, 6.71s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [2517], local_loss=0.019757134839892387, train_loss=0.06835144013166428, time_cost=2.050044059753418
+
Steps: 0%| | 2517/1000000 [6:23:06<1858:43:42, 6.71s/it, lr=1e-5, step_loss=0.0198]
Steps: 0%| | 2518/1000000 [6:23:14<1913:34:42, 6.91s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [2518], local_loss=0.02813788130879402, train_loss=0.02705570124089718, time_cost=3.5328121185302734
+
Steps: 0%| | 2518/1000000 [6:23:14<1913:34:42, 6.91s/it, lr=1e-5, step_loss=0.0281]
Steps: 0%| | 2519/1000000 [6:23:21<1972:01:00, 7.12s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [2519], local_loss=0.02889053151011467, train_loss=0.05822349339723587, time_cost=1.223860263824463
+
Steps: 0%| | 2519/1000000 [6:23:21<1972:01:00, 7.12s/it, lr=1e-5, step_loss=0.0289]
Steps: 0%| | 2520/1000000 [6:23:29<2019:48:45, 7.29s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [2520], local_loss=0.018900539726018906, train_loss=0.09563387930393219, time_cost=2.680091142654419
+
Steps: 0%| | 2520/1000000 [6:23:29<2019:48:45, 7.29s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 2521/1000000 [6:23:35<1886:52:52, 6.81s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [2521], local_loss=0.021105190739035606, train_loss=0.04191415011882782, time_cost=1.2190515995025635
+
Steps: 0%| | 2521/1000000 [6:23:35<1886:52:52, 6.81s/it, lr=1e-5, step_loss=0.0211]
Steps: 0%| | 2522/1000000 [6:23:46<2242:45:07, 8.09s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [2522], local_loss=0.05267105624079704, train_loss=0.04611314833164215, time_cost=2.0355145931243896
+
Steps: 0%| | 2522/1000000 [6:23:46<2242:45:07, 8.09s/it, lr=1e-5, step_loss=0.0527]
Steps: 0%| | 2523/1000000 [6:23:51<1997:24:28, 7.21s/it, lr=1e-5, step_loss=0.0527][RANK-0]: Step: [2523], local_loss=0.11325152963399887, train_loss=0.0422927588224411, time_cost=1.2148826122283936
+
Steps: 0%| | 2523/1000000 [6:23:51<1997:24:28, 7.21s/it, lr=1e-5, step_loss=0.113]
Steps: 0%| | 2524/1000000 [6:24:02<2305:41:46, 8.32s/it, lr=1e-5, step_loss=0.113][RANK-0]: Step: [2524], local_loss=0.29029715061187744, train_loss=13.074604034423828, time_cost=2.487734317779541
+
Steps: 0%| | 2524/1000000 [6:24:02<2305:41:46, 8.32s/it, lr=1e-5, step_loss=0.29]
Steps: 0%| | 2525/1000000 [6:24:08<2107:53:51, 7.61s/it, lr=1e-5, step_loss=0.29][RANK-0]: Step: [2525], local_loss=0.013644075021147728, train_loss=0.1619182825088501, time_cost=2.817021369934082
+
Steps: 0%| | 2525/1000000 [6:24:08<2107:53:51, 7.61s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 2526/1000000 [6:24:22<2704:08:41, 9.76s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [2526], local_loss=0.06371092796325684, train_loss=9.507855415344238, time_cost=1.2503793239593506
+
Steps: 0%| | 2526/1000000 [6:24:22<2704:08:41, 9.76s/it, lr=1e-5, step_loss=0.0637]
Steps: 0%| | 2527/1000000 [6:24:27<2299:40:43, 8.30s/it, lr=1e-5, step_loss=0.0637][RANK-0]: Step: [2527], local_loss=0.039067622274160385, train_loss=0.03922668844461441, time_cost=1.9339959621429443
+
Steps: 0%| | 2527/1000000 [6:24:27<2299:40:43, 8.30s/it, lr=1e-5, step_loss=0.0391]
Steps: 0%| | 2528/1000000 [6:24:33<2048:13:37, 7.39s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [2528], local_loss=0.07642043381929398, train_loss=0.10471382737159729, time_cost=4.0691001415252686
+
Steps: 0%| | 2528/1000000 [6:24:33<2048:13:37, 7.39s/it, lr=1e-5, step_loss=0.0764]
Steps: 0%| | 2529/1000000 [6:24:44<2388:29:17, 8.62s/it, lr=1e-5, step_loss=0.0764][RANK-0]: Step: [2529], local_loss=0.022548481822013855, train_loss=0.07011426985263824, time_cost=3.8972675800323486
+
Steps: 0%| | 2529/1000000 [6:24:44<2388:29:17, 8.62s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 2530/1000000 [6:24:50<2137:16:27, 7.71s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [2530], local_loss=0.9950522780418396, train_loss=0.2060263752937317, time_cost=1.544877290725708
+
Steps: 0%| | 2530/1000000 [6:24:50<2137:16:27, 7.71s/it, lr=1e-5, step_loss=0.995]
Steps: 0%| | 2531/1000000 [6:24:55<1925:22:48, 6.95s/it, lr=1e-5, step_loss=0.995][RANK-0]: Step: [2531], local_loss=0.03432255610823631, train_loss=0.04587383568286896, time_cost=2.395632028579712
+
Steps: 0%| | 2531/1000000 [6:24:55<1925:22:48, 6.95s/it, lr=1e-5, step_loss=0.0343]
Steps: 0%| | 2532/1000000 [6:25:05<2185:01:30, 7.89s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [2532], local_loss=0.03157604858279228, train_loss=25.745040893554688, time_cost=2.231034994125366
+
Steps: 0%| | 2532/1000000 [6:25:05<2185:01:30, 7.89s/it, lr=1e-5, step_loss=0.0316]
Steps: 0%| | 2533/1000000 [6:25:10<1961:09:59, 7.08s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [2533], local_loss=0.01515599712729454, train_loss=0.0565878264605999, time_cost=2.4796767234802246
+
Steps: 0%| | 2533/1000000 [6:25:10<1961:09:59, 7.08s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 2534/1000000 [6:25:15<1803:32:36, 6.51s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [2534], local_loss=0.044162217527627945, train_loss=0.12851423025131226, time_cost=1.2371976375579834
+
Steps: 0%| | 2534/1000000 [6:25:15<1803:32:36, 6.51s/it, lr=1e-5, step_loss=0.0442]
Steps: 0%| | 2535/1000000 [6:25:26<2181:03:30, 7.87s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [2535], local_loss=0.027548380196094513, train_loss=0.06706546247005463, time_cost=5.28912091255188
+
Steps: 0%| | 2535/1000000 [6:25:26<2181:03:30, 7.87s/it, lr=1e-5, step_loss=0.0275]
Steps: 0%| | 2536/1000000 [6:25:30<1846:48:57, 6.67s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [2536], local_loss=0.07250192761421204, train_loss=0.08905584365129471, time_cost=1.4260494709014893
+
Steps: 0%| | 2536/1000000 [6:25:30<1846:48:57, 6.67s/it, lr=1e-5, step_loss=0.0725]
Steps: 0%| | 2537/1000000 [6:25:35<1724:03:41, 6.22s/it, lr=1e-5, step_loss=0.0725][RANK-0]: Step: [2537], local_loss=0.017785998061299324, train_loss=0.08448261022567749, time_cost=2.192462682723999
+
Steps: 0%| | 2537/1000000 [6:25:35<1724:03:41, 6.22s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 2538/1000000 [6:25:40<1580:06:07, 5.70s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [2538], local_loss=0.0419323667883873, train_loss=34.240177154541016, time_cost=1.591860055923462
+
Steps: 0%| | 2538/1000000 [6:25:40<1580:06:07, 5.70s/it, lr=1e-5, step_loss=0.0419]
Steps: 0%| | 2539/1000000 [6:25:47<1687:56:17, 6.09s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [2539], local_loss=0.017834190279245377, train_loss=0.05146417021751404, time_cost=3.139108657836914
+
Steps: 0%| | 2539/1000000 [6:25:47<1687:56:17, 6.09s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 2540/1000000 [6:25:55<1837:17:36, 6.63s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [2540], local_loss=0.03767263516783714, train_loss=0.053296566009521484, time_cost=1.2206482887268066
+
Steps: 0%| | 2540/1000000 [6:25:55<1837:17:36, 6.63s/it, lr=1e-5, step_loss=0.0377]
Steps: 0%| | 2541/1000000 [6:26:02<1924:22:36, 6.95s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [2541], local_loss=0.03448839858174324, train_loss=0.10640932619571686, time_cost=1.5658586025238037
+
Steps: 0%| | 2541/1000000 [6:26:02<1924:22:36, 6.95s/it, lr=1e-5, step_loss=0.0345]
Steps: 0%| | 2542/1000000 [6:26:16<2487:15:04, 8.98s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [2542], local_loss=0.04821209982037544, train_loss=0.055412717163562775, time_cost=1.791959524154663
+
Steps: 0%| | 2542/1000000 [6:26:16<2487:15:04, 8.98s/it, lr=1e-5, step_loss=0.0482]
Steps: 0%| | 2543/1000000 [6:26:27<2630:58:22, 9.50s/it, lr=1e-5, step_loss=0.0482][RANK-0]: Step: [2543], local_loss=0.031107814982533455, train_loss=0.04120691865682602, time_cost=6.543880224227905
+
Steps: 0%| | 2543/1000000 [6:26:27<2630:58:22, 9.50s/it, lr=1e-5, step_loss=0.0311]
Steps: 0%| | 2544/1000000 [6:26:38<2733:02:23, 9.86s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [2544], local_loss=0.036403413861989975, train_loss=0.15735141932964325, time_cost=2.6505234241485596
+
Steps: 0%| | 2544/1000000 [6:26:38<2733:02:23, 9.86s/it, lr=1e-5, step_loss=0.0364]
Steps: 0%| | 2545/1000000 [6:26:43<2360:47:07, 8.52s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [2545], local_loss=0.016139023005962372, train_loss=0.06967568397521973, time_cost=1.3512136936187744
+
Steps: 0%| | 2545/1000000 [6:26:43<2360:47:07, 8.52s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 2546/1000000 [6:26:50<2207:01:04, 7.97s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [2546], local_loss=0.01948717050254345, train_loss=0.0643453598022461, time_cost=2.256152868270874
+
Steps: 0%| | 2546/1000000 [6:26:50<2207:01:04, 7.97s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 2547/1000000 [6:26:57<2193:13:24, 7.92s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [2547], local_loss=0.015676597133278847, train_loss=0.058469075709581375, time_cost=4.111382484436035
+
Steps: 0%| | 2547/1000000 [6:26:57<2193:13:24, 7.92s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 2548/1000000 [6:27:03<1974:59:19, 7.13s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [2548], local_loss=0.035666730254888535, train_loss=0.05170764401555061, time_cost=1.5209145545959473
+
Steps: 0%| | 2548/1000000 [6:27:03<1974:59:19, 7.13s/it, lr=1e-5, step_loss=0.0357]
Steps: 0%| | 2549/1000000 [6:27:12<2147:29:18, 7.75s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [2549], local_loss=0.046691637486219406, train_loss=4.560327529907227, time_cost=1.788705587387085
+
Steps: 0%| | 2549/1000000 [6:27:12<2147:29:18, 7.75s/it, lr=1e-5, step_loss=0.0467]
Steps: 0%| | 2550/1000000 [6:27:24<2517:11:37, 9.09s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [2550], local_loss=0.04935278743505478, train_loss=0.04511839896440506, time_cost=1.2139825820922852
+
Steps: 0%| | 2550/1000000 [6:27:24<2517:11:37, 9.09s/it, lr=1e-5, step_loss=0.0494]
Steps: 0%| | 2551/1000000 [6:27:31<2337:57:44, 8.44s/it, lr=1e-5, step_loss=0.0494][RANK-0]: Step: [2551], local_loss=0.04602554067969322, train_loss=0.1789589524269104, time_cost=2.2557692527770996
+
Steps: 0%| | 2551/1000000 [6:27:31<2337:57:44, 8.44s/it, lr=1e-5, step_loss=0.046]
Steps: 0%| | 2552/1000000 [6:27:38<2233:57:36, 8.06s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [2552], local_loss=0.06232728809118271, train_loss=0.12379750609397888, time_cost=2.7655699253082275
+
Steps: 0%| | 2552/1000000 [6:27:38<2233:57:36, 8.06s/it, lr=1e-5, step_loss=0.0623]
Steps: 0%| | 2553/1000000 [6:27:44<2057:47:39, 7.43s/it, lr=1e-5, step_loss=0.0623][RANK-0]: Step: [2553], local_loss=0.051961224526166916, train_loss=0.05974440276622772, time_cost=3.9004452228546143
+
Steps: 0%| | 2553/1000000 [6:27:44<2057:47:39, 7.43s/it, lr=1e-5, step_loss=0.052]
Steps: 0%| | 2554/1000000 [6:27:53<2208:22:00, 7.97s/it, lr=1e-5, step_loss=0.052][RANK-0]: Step: [2554], local_loss=0.02249271795153618, train_loss=0.10686281323432922, time_cost=1.692155361175537
+
Steps: 0%| | 2554/1000000 [6:27:53<2208:22:00, 7.97s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 2555/1000000 [6:27:58<1928:41:44, 6.96s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [2555], local_loss=0.02656705677509308, train_loss=0.04507967084646225, time_cost=1.2840735912322998
+
Steps: 0%| | 2555/1000000 [6:27:58<1928:41:44, 6.96s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 2556/1000000 [6:28:07<2106:03:49, 7.60s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [2556], local_loss=0.05098472535610199, train_loss=0.06983301788568497, time_cost=1.2089605331420898
+
Steps: 0%| | 2556/1000000 [6:28:07<2106:03:49, 7.60s/it, lr=1e-5, step_loss=0.051]
Steps: 0%| | 2557/1000000 [6:28:14<2055:27:19, 7.42s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [2557], local_loss=0.021826472133398056, train_loss=0.04424775019288063, time_cost=1.2108700275421143
+
Steps: 0%| | 2557/1000000 [6:28:14<2055:27:19, 7.42s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 2558/1000000 [6:28:24<2248:07:54, 8.11s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [2558], local_loss=0.01735975779592991, train_loss=0.05768483877182007, time_cost=4.380658388137817
+
Steps: 0%| | 2558/1000000 [6:28:24<2248:07:54, 8.11s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 2559/1000000 [6:28:31<2176:41:18, 7.86s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [2559], local_loss=0.026711879298090935, train_loss=0.058590780943632126, time_cost=3.4536921977996826
+
Steps: 0%| | 2559/1000000 [6:28:31<2176:41:18, 7.86s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 2560/1000000 [6:28:42<2409:28:56, 8.70s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [2560], local_loss=0.0745401382446289, train_loss=0.09438706934452057, time_cost=2.3849093914031982
+
Steps: 0%| | 2560/1000000 [6:28:42<2409:28:56, 8.70s/it, lr=1e-5, step_loss=0.0745]
Steps: 0%| | 2561/1000000 [6:28:48<2165:13:53, 7.81s/it, lr=1e-5, step_loss=0.0745][RANK-0]: Step: [2561], local_loss=0.04622892290353775, train_loss=0.0738338977098465, time_cost=1.2288403511047363
+
Steps: 0%| | 2561/1000000 [6:28:48<2165:13:53, 7.81s/it, lr=1e-5, step_loss=0.0462]
Steps: 0%| | 2562/1000000 [6:28:59<2436:00:45, 8.79s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [2562], local_loss=0.0193032156676054, train_loss=0.050358571112155914, time_cost=3.1280429363250732
+
Steps: 0%| | 2562/1000000 [6:28:59<2436:00:45, 8.79s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 2563/1000000 [6:29:09<2560:33:04, 9.24s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [2563], local_loss=0.023058991879224777, train_loss=0.09366333484649658, time_cost=3.828665256500244
+
Steps: 0%| | 2563/1000000 [6:29:09<2560:33:04, 9.24s/it, lr=1e-5, step_loss=0.0231]
Steps: 0%| | 2564/1000000 [6:29:24<3011:15:12, 10.87s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [2564], local_loss=0.04239692911505699, train_loss=0.15008893609046936, time_cost=1.2033238410949707
+
Steps: 0%| | 2564/1000000 [6:29:24<3011:15:12, 10.87s/it, lr=1e-5, step_loss=0.0424]
Steps: 0%| | 2565/1000000 [6:29:29<2593:38:04, 9.36s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [2565], local_loss=0.017702456563711166, train_loss=23.486797332763672, time_cost=2.1755871772766113
+
Steps: 0%| | 2565/1000000 [6:29:29<2593:38:04, 9.36s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 2566/1000000 [6:29:37<2436:21:43, 8.79s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [2566], local_loss=0.01980455592274666, train_loss=0.06449815630912781, time_cost=3.0363032817840576
+
Steps: 0%| | 2566/1000000 [6:29:37<2436:21:43, 8.79s/it, lr=1e-5, step_loss=0.0198]
Steps: 0%| | 2567/1000000 [6:29:51<2863:48:01, 10.34s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [2567], local_loss=0.03884132206439972, train_loss=0.05230948328971863, time_cost=4.026359558105469
+
Steps: 0%| | 2567/1000000 [6:29:51<2863:48:01, 10.34s/it, lr=1e-5, step_loss=0.0388]
Steps: 0%| | 2568/1000000 [6:29:58<2631:40:58, 9.50s/it, lr=1e-5, step_loss=0.0388][RANK-0]: Step: [2568], local_loss=0.024924924597144127, train_loss=0.030342910438776016, time_cost=1.5044076442718506
+
Steps: 0%| | 2568/1000000 [6:29:58<2631:40:58, 9.50s/it, lr=1e-5, step_loss=0.0249]
Steps: 0%| | 2569/1000000 [6:30:07<2580:12:50, 9.31s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [2569], local_loss=0.014230003580451012, train_loss=0.039681583642959595, time_cost=3.200507402420044
+
Steps: 0%| | 2569/1000000 [6:30:07<2580:12:50, 9.31s/it, lr=1e-5, step_loss=0.0142]
Steps: 0%| | 2570/1000000 [6:30:13<2288:26:30, 8.26s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [2570], local_loss=0.028353866189718246, train_loss=0.04662792757153511, time_cost=3.2088212966918945
+
Steps: 0%| | 2570/1000000 [6:30:13<2288:26:30, 8.26s/it, lr=1e-5, step_loss=0.0284]
Steps: 0%| | 2571/1000000 [6:30:20<2184:45:58, 7.89s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [2571], local_loss=0.028299005702137947, train_loss=0.03370009362697601, time_cost=2.992950439453125
+
Steps: 0%| | 2571/1000000 [6:30:20<2184:45:58, 7.89s/it, lr=1e-5, step_loss=0.0283]
Steps: 0%| | 2572/1000000 [6:30:34<2687:01:30, 9.70s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [2572], local_loss=0.03727159649133682, train_loss=26.001415252685547, time_cost=3.9839611053466797
+
Steps: 0%| | 2572/1000000 [6:30:34<2687:01:30, 9.70s/it, lr=1e-5, step_loss=0.0373]
Steps: 0%| | 2573/1000000 [6:30:42<2575:17:22, 9.29s/it, lr=1e-5, step_loss=0.0373][RANK-0]: Step: [2573], local_loss=0.02211044542491436, train_loss=0.046278852969408035, time_cost=4.3225417137146
+
Steps: 0%| | 2573/1000000 [6:30:42<2575:17:22, 9.29s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 2574/1000000 [6:30:48<2262:42:27, 8.17s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [2574], local_loss=0.05364851653575897, train_loss=0.03524038940668106, time_cost=2.8313679695129395
+
Steps: 0%| | 2574/1000000 [6:30:48<2262:42:27, 8.17s/it, lr=1e-5, step_loss=0.0536]
Steps: 0%| | 2575/1000000 [6:30:57<2334:22:04, 8.43s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [2575], local_loss=0.037744276225566864, train_loss=0.05443333834409714, time_cost=3.0135810375213623
+
Steps: 0%| | 2575/1000000 [6:30:57<2334:22:04, 8.43s/it, lr=1e-5, step_loss=0.0377]
Steps: 0%| | 2576/1000000 [6:31:13<2965:20:03, 10.70s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [2576], local_loss=0.025873780250549316, train_loss=0.10738515108823776, time_cost=1.222740650177002
+
Steps: 0%| | 2576/1000000 [6:31:13<2965:20:03, 10.70s/it, lr=1e-5, step_loss=0.0259]
Steps: 0%| | 2577/1000000 [6:31:20<2662:12:50, 9.61s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [2577], local_loss=0.017435235902667046, train_loss=0.08488605916500092, time_cost=1.2442171573638916
+
Steps: 0%| | 2577/1000000 [6:31:20<2662:12:50, 9.61s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 2578/1000000 [6:31:29<2628:48:46, 9.49s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [2578], local_loss=0.03689853474497795, train_loss=31.82628631591797, time_cost=1.2253203392028809
+
Steps: 0%| | 2578/1000000 [6:31:29<2628:48:46, 9.49s/it, lr=1e-5, step_loss=0.0369]
Steps: 0%| | 2579/1000000 [6:31:41<2789:47:37, 10.07s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [2579], local_loss=0.1796356439590454, train_loss=0.11139027774333954, time_cost=3.3175182342529297
+
Steps: 0%| | 2579/1000000 [6:31:41<2789:47:37, 10.07s/it, lr=1e-5, step_loss=0.18]
Steps: 0%| | 2580/1000000 [6:31:45<2333:08:38, 8.42s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [2580], local_loss=0.01637493632733822, train_loss=0.04779268056154251, time_cost=1.9094176292419434
+
Steps: 0%| | 2580/1000000 [6:31:45<2333:08:38, 8.42s/it, lr=1e-5, step_loss=0.0164]
Steps: 0%| | 2581/1000000 [6:31:58<2719:14:40, 9.81s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [2581], local_loss=0.019893404096364975, train_loss=0.025079872459173203, time_cost=5.845110177993774
+
Steps: 0%| | 2581/1000000 [6:31:58<2719:14:40, 9.81s/it, lr=1e-5, step_loss=0.0199]
Steps: 0%| | 2582/1000000 [6:32:03<2319:17:57, 8.37s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [2582], local_loss=0.029633115977048874, train_loss=0.0544203445315361, time_cost=2.4450299739837646
+
Steps: 0%| | 2582/1000000 [6:32:03<2319:17:57, 8.37s/it, lr=1e-5, step_loss=0.0296]
Steps: 0%| | 2583/1000000 [6:32:14<2546:50:28, 9.19s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [2583], local_loss=0.02176651544868946, train_loss=0.10007382929325104, time_cost=3.8703103065490723
+
Steps: 0%| | 2583/1000000 [6:32:14<2546:50:28, 9.19s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 2584/1000000 [6:32:29<3020:19:32, 10.90s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [2584], local_loss=0.0506577342748642, train_loss=0.033981770277023315, time_cost=1.2461864948272705
+
Steps: 0%| | 2584/1000000 [6:32:29<3020:19:32, 10.90s/it, lr=1e-5, step_loss=0.0507]
Steps: 0%| | 2585/1000000 [6:32:34<2511:12:17, 9.06s/it, lr=1e-5, step_loss=0.0507][RANK-0]: Step: [2585], local_loss=0.6319701671600342, train_loss=0.1050838828086853, time_cost=1.2185378074645996
+
Steps: 0%| | 2585/1000000 [6:32:34<2511:12:17, 9.06s/it, lr=1e-5, step_loss=0.632]
Steps: 0%| | 2586/1000000 [6:32:39<2172:51:30, 7.84s/it, lr=1e-5, step_loss=0.632][RANK-0]: Step: [2586], local_loss=0.04051007702946663, train_loss=0.03600753843784332, time_cost=2.2287709712982178
+
Steps: 0%| | 2586/1000000 [6:32:39<2172:51:30, 7.84s/it, lr=1e-5, step_loss=0.0405]
Steps: 0%| | 2587/1000000 [6:32:50<2403:02:27, 8.67s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [2587], local_loss=0.048747967928647995, train_loss=0.033455826342105865, time_cost=4.022562742233276
+
Steps: 0%| | 2587/1000000 [6:32:50<2403:02:27, 8.67s/it, lr=1e-5, step_loss=0.0487]
Steps: 0%| | 2588/1000000 [6:32:57<2305:21:32, 8.32s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [2588], local_loss=0.04393104463815689, train_loss=0.09289705008268356, time_cost=1.245887041091919
+
Steps: 0%| | 2588/1000000 [6:32:57<2305:21:32, 8.32s/it, lr=1e-5, step_loss=0.0439]
Steps: 0%| | 2589/1000000 [6:33:04<2187:23:44, 7.90s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [2589], local_loss=0.020114943385124207, train_loss=0.02493268996477127, time_cost=2.4459023475646973
+
Steps: 0%| | 2589/1000000 [6:33:04<2187:23:44, 7.90s/it, lr=1e-5, step_loss=0.0201]
Steps: 0%| | 2590/1000000 [6:33:11<2124:58:49, 7.67s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [2590], local_loss=0.0643242597579956, train_loss=0.06016154587268829, time_cost=2.488544464111328
+
Steps: 0%| | 2590/1000000 [6:33:11<2124:58:49, 7.67s/it, lr=1e-5, step_loss=0.0643]
Steps: 0%| | 2591/1000000 [6:33:19<2168:26:13, 7.83s/it, lr=1e-5, step_loss=0.0643][RANK-0]: Step: [2591], local_loss=0.1401631087064743, train_loss=0.07571277767419815, time_cost=2.8657262325286865
+
Steps: 0%| | 2591/1000000 [6:33:19<2168:26:13, 7.83s/it, lr=1e-5, step_loss=0.14]
Steps: 0%| | 2592/1000000 [6:33:30<2411:51:54, 8.71s/it, lr=1e-5, step_loss=0.14][RANK-0]: Step: [2592], local_loss=0.017973601818084717, train_loss=0.04584764689207077, time_cost=4.594528675079346
+
Steps: 0%| | 2592/1000000 [6:33:30<2411:51:54, 8.71s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 2593/1000000 [6:33:38<2353:36:23, 8.50s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [2593], local_loss=0.01524037029594183, train_loss=0.05516224354505539, time_cost=2.6152029037475586
+
Steps: 0%| | 2593/1000000 [6:33:38<2353:36:23, 8.50s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 2594/1000000 [6:33:52<2762:13:21, 9.97s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [2594], local_loss=0.019608400762081146, train_loss=0.04702368006110191, time_cost=5.202860593795776
+
Steps: 0%| | 2594/1000000 [6:33:52<2762:13:21, 9.97s/it, lr=1e-5, step_loss=0.0196]
[training-log excerpt, condensed: steps 2595–2819 of 1,000,000 at lr=1e-5, wall clock 6:34:05 → 7:07:14, roughly 6–14 s/it per step. local_loss stays mostly in the 0.01–0.10 range, with occasional spikes toward 1.0 (steps 2678, 2733, 2746, 2757); train_loss tracks the same range apart from isolated outliers of ≈6–31 (steps 2600, 2602, 2615, 2622, 2663, 2669, 2709, 2712, 2734, peaking at 30.73 on step 2622). Per-step time_cost ranged from ≈1.2 s to ≈16 s.]
+
Steps: 0%| | 2819/1000000 [7:07:14<2032:12:29, 7.34s/it, lr=1e-5, step_loss=0.0201]
Steps: 0%| | 2820/1000000 [7:07:21<1935:58:32, 6.99s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [2820], local_loss=0.05499144271016121, train_loss=0.04340768977999687, time_cost=1.6130189895629883
+
Steps: 0%| | 2820/1000000 [7:07:21<1935:58:32, 6.99s/it, lr=1e-5, step_loss=0.055]
Steps: 0%| | 2821/1000000 [7:07:34<2438:06:11, 8.80s/it, lr=1e-5, step_loss=0.055][RANK-0]: Step: [2821], local_loss=0.014769559726119041, train_loss=0.03493410721421242, time_cost=9.795348167419434
+
Steps: 0%| | 2821/1000000 [7:07:34<2438:06:11, 8.80s/it, lr=1e-5, step_loss=0.0148]
Steps: 0%| | 2822/1000000 [7:07:43<2450:14:44, 8.85s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [2822], local_loss=0.050886817276477814, train_loss=0.030949829146265984, time_cost=3.8624651432037354
+
Steps: 0%| | 2822/1000000 [7:07:43<2450:14:44, 8.85s/it, lr=1e-5, step_loss=0.0509]
Steps: 0%| | 2823/1000000 [7:07:48<2204:50:54, 7.96s/it, lr=1e-5, step_loss=0.0509][RANK-0]: Step: [2823], local_loss=0.014275329187512398, train_loss=0.05781003087759018, time_cost=1.8292837142944336
+
Steps: 0%| | 2823/1000000 [7:07:48<2204:50:54, 7.96s/it, lr=1e-5, step_loss=0.0143]
Steps: 0%| | 2824/1000000 [7:07:56<2132:47:17, 7.70s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [2824], local_loss=0.039432454854249954, train_loss=0.04069456458091736, time_cost=2.574094295501709
+
Steps: 0%| | 2824/1000000 [7:07:56<2132:47:17, 7.70s/it, lr=1e-5, step_loss=0.0394]
Steps: 0%| | 2825/1000000 [7:08:09<2614:33:26, 9.44s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [2825], local_loss=0.022304141893982887, train_loss=0.06118209660053253, time_cost=3.5872702598571777
+
Steps: 0%| | 2825/1000000 [7:08:09<2614:33:26, 9.44s/it, lr=1e-5, step_loss=0.0223]
Steps: 0%| | 2826/1000000 [7:08:22<2891:16:40, 10.44s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [2826], local_loss=0.022021502256393433, train_loss=0.06690922379493713, time_cost=4.254019021987915
+
Steps: 0%| | 2826/1000000 [7:08:22<2891:16:40, 10.44s/it, lr=1e-5, step_loss=0.022]
Steps: 0%| | 2827/1000000 [7:08:29<2652:26:04, 9.58s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [2827], local_loss=0.022709965705871582, train_loss=0.10594690591096878, time_cost=1.673084020614624
+
Steps: 0%| | 2827/1000000 [7:08:29<2652:26:04, 9.58s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 2828/1000000 [7:08:38<2614:32:16, 9.44s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [2828], local_loss=0.011537754908204079, train_loss=0.15698111057281494, time_cost=2.778130054473877
+
Steps: 0%| | 2828/1000000 [7:08:38<2614:32:16, 9.44s/it, lr=1e-5, step_loss=0.0115]
Steps: 0%| | 2829/1000000 [7:08:43<2232:03:28, 8.06s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [2829], local_loss=0.17600849270820618, train_loss=0.06087084114551544, time_cost=1.484710931777954
+
Steps: 0%| | 2829/1000000 [7:08:43<2232:03:28, 8.06s/it, lr=1e-5, step_loss=0.176]
Steps: 0%| | 2830/1000000 [7:08:50<2151:33:54, 7.77s/it, lr=1e-5, step_loss=0.176][RANK-0]: Step: [2830], local_loss=0.023797200992703438, train_loss=0.07984510809183121, time_cost=3.1301872730255127
+
Steps: 0%| | 2830/1000000 [7:08:50<2151:33:54, 7.77s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 2831/1000000 [7:09:03<2532:30:15, 9.14s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [2831], local_loss=0.03842364624142647, train_loss=0.046987924724817276, time_cost=1.7256050109863281
+
Steps: 0%| | 2831/1000000 [7:09:03<2532:30:15, 9.14s/it, lr=1e-5, step_loss=0.0384]
Steps: 0%| | 2832/1000000 [7:09:15<2754:46:45, 9.95s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [2832], local_loss=0.045128364115953445, train_loss=0.03315450996160507, time_cost=1.2992329597473145
+
Steps: 0%| | 2832/1000000 [7:09:15<2754:46:45, 9.95s/it, lr=1e-5, step_loss=0.0451]
Steps: 0%| | 2833/1000000 [7:09:24<2677:30:31, 9.67s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [2833], local_loss=0.03309568762779236, train_loss=0.033016808331012726, time_cost=3.857325792312622
+
Steps: 0%| | 2833/1000000 [7:09:24<2677:30:31, 9.67s/it, lr=1e-5, step_loss=0.0331]
Steps: 0%| | 2834/1000000 [7:09:29<2303:13:26, 8.32s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [2834], local_loss=0.0186411514878273, train_loss=0.0467398539185524, time_cost=2.2111005783081055
+
Steps: 0%| | 2834/1000000 [7:09:29<2303:13:26, 8.32s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 2835/1000000 [7:09:44<2847:57:49, 10.28s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [2835], local_loss=0.05329715460538864, train_loss=0.07462753355503082, time_cost=11.889102935791016
+
Steps: 0%| | 2835/1000000 [7:09:44<2847:57:49, 10.28s/it, lr=1e-5, step_loss=0.0533]
Steps: 0%| | 2836/1000000 [7:09:52<2730:33:12, 9.86s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [2836], local_loss=0.03677291423082352, train_loss=0.06387712806463242, time_cost=3.8921124935150146
+
Steps: 0%| | 2836/1000000 [7:09:52<2730:33:12, 9.86s/it, lr=1e-5, step_loss=0.0368]
Steps: 0%| | 2837/1000000 [7:10:05<2965:00:58, 10.70s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [2837], local_loss=0.04232098162174225, train_loss=0.058903105556964874, time_cost=2.9194436073303223
+
Steps: 0%| | 2837/1000000 [7:10:05<2965:00:58, 10.70s/it, lr=1e-5, step_loss=0.0423]
Steps: 0%| | 2838/1000000 [7:10:24<3674:21:27, 13.27s/it, lr=1e-5, step_loss=0.0423][RANK-0]: Step: [2838], local_loss=0.049201808869838715, train_loss=15.88003158569336, time_cost=11.371980667114258
+
Steps: 0%| | 2838/1000000 [7:10:24<3674:21:27, 13.27s/it, lr=1e-5, step_loss=0.0492]
Steps: 0%| | 2839/1000000 [7:10:37<3646:06:16, 13.16s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [2839], local_loss=0.0196688175201416, train_loss=0.037962377071380615, time_cost=4.892412424087524
+
Steps: 0%| | 2839/1000000 [7:10:37<3646:06:16, 13.16s/it, lr=1e-5, step_loss=0.0197]
Steps: 0%| | 2840/1000000 [7:10:43<3039:02:32, 10.97s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [2840], local_loss=0.02802799642086029, train_loss=0.03362182527780533, time_cost=4.100163698196411
+
Steps: 0%| | 2840/1000000 [7:10:43<3039:02:32, 10.97s/it, lr=1e-5, step_loss=0.028]
Steps: 0%| | 2841/1000000 [7:10:55<3124:51:48, 11.28s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [2841], local_loss=0.04364544525742531, train_loss=0.05288213863968849, time_cost=4.156816720962524
+
Steps: 0%| | 2841/1000000 [7:10:55<3124:51:48, 11.28s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 2842/1000000 [7:10:59<2516:43:34, 9.09s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [2842], local_loss=0.055246852338314056, train_loss=22.28076934814453, time_cost=1.2342290878295898
+
Steps: 0%| | 2842/1000000 [7:10:59<2516:43:34, 9.09s/it, lr=1e-5, step_loss=0.0552]
Steps: 0%| | 2843/1000000 [7:11:06<2327:09:52, 8.40s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [2843], local_loss=0.12153199315071106, train_loss=0.047052815556526184, time_cost=2.732015609741211
+
Steps: 0%| | 2843/1000000 [7:11:06<2327:09:52, 8.40s/it, lr=1e-5, step_loss=0.122]
Steps: 0%| | 2844/1000000 [7:11:19<2689:21:44, 9.71s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [2844], local_loss=0.0250738263130188, train_loss=0.06972409039735794, time_cost=3.243687152862549
+
Steps: 0%| | 2844/1000000 [7:11:19<2689:21:44, 9.71s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 2845/1000000 [7:11:25<2366:28:13, 8.54s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [2845], local_loss=0.019594451412558556, train_loss=0.02817581221461296, time_cost=1.744269609451294
+
Steps: 0%| | 2845/1000000 [7:11:25<2366:28:13, 8.54s/it, lr=1e-5, step_loss=0.0196]
Steps: 0%| | 2846/1000000 [7:11:30<2078:00:47, 7.50s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [2846], local_loss=0.044863708317279816, train_loss=0.04749177396297455, time_cost=2.2786636352539062
+
Steps: 0%| | 2846/1000000 [7:11:30<2078:00:47, 7.50s/it, lr=1e-5, step_loss=0.0449]
Steps: 0%| | 2847/1000000 [7:11:35<1875:41:59, 6.77s/it, lr=1e-5, step_loss=0.0449][RANK-0]: Step: [2847], local_loss=0.13745155930519104, train_loss=0.050470009446144104, time_cost=1.228963851928711
+
Steps: 0%| | 2847/1000000 [7:11:35<1875:41:59, 6.77s/it, lr=1e-5, step_loss=0.137]
Steps: 0%| | 2848/1000000 [7:11:40<1722:55:42, 6.22s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [2848], local_loss=0.047992266714572906, train_loss=0.036752667278051376, time_cost=1.8848929405212402
+
Steps: 0%| | 2848/1000000 [7:11:40<1722:55:42, 6.22s/it, lr=1e-5, step_loss=0.048]
Steps: 0%| | 2849/1000000 [7:11:45<1633:40:41, 5.90s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [2849], local_loss=0.015870647504925728, train_loss=0.05114715173840523, time_cost=2.5801239013671875
+
Steps: 0%| | 2849/1000000 [7:11:45<1633:40:41, 5.90s/it, lr=1e-5, step_loss=0.0159]
Steps: 0%| | 2850/1000000 [7:11:49<1524:31:21, 5.50s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [2850], local_loss=0.027839098125696182, train_loss=0.046741142868995667, time_cost=1.6547002792358398
+
Steps: 0%| | 2850/1000000 [7:11:49<1524:31:21, 5.50s/it, lr=1e-5, step_loss=0.0278]
Steps: 0%| | 2851/1000000 [7:12:01<2070:46:51, 7.48s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [2851], local_loss=0.10071298480033875, train_loss=0.07363933324813843, time_cost=7.621075630187988
+
Steps: 0%| | 2851/1000000 [7:12:01<2070:46:51, 7.48s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 2852/1000000 [7:12:13<2410:39:34, 8.70s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [2852], local_loss=0.02715958096086979, train_loss=0.06394834071397781, time_cost=5.155639886856079
+
Steps: 0%| | 2852/1000000 [7:12:13<2410:39:34, 8.70s/it, lr=1e-5, step_loss=0.0272]
Steps: 0%| | 2853/1000000 [7:12:26<2800:11:13, 10.11s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [2853], local_loss=0.02208763360977173, train_loss=0.07176250219345093, time_cost=4.418299436569214
+
Steps: 0%| | 2853/1000000 [7:12:26<2800:11:13, 10.11s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 2854/1000000 [7:12:43<3358:27:06, 12.13s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [2854], local_loss=0.010198943316936493, train_loss=0.04976914823055267, time_cost=9.482044458389282
+
Steps: 0%| | 2854/1000000 [7:12:43<3358:27:06, 12.13s/it, lr=1e-5, step_loss=0.0102]
Steps: 0%| | 2855/1000000 [7:12:51<2964:59:16, 10.70s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [2855], local_loss=0.04112403467297554, train_loss=0.0466151237487793, time_cost=1.2381160259246826
+
Steps: 0%| | 2855/1000000 [7:12:51<2964:59:16, 10.70s/it, lr=1e-5, step_loss=0.0411]
Steps: 0%| | 2856/1000000 [7:13:00<2815:42:10, 10.17s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [2856], local_loss=0.05536482110619545, train_loss=0.04803958907723427, time_cost=3.8135297298431396
+
Steps: 0%| | 2856/1000000 [7:13:00<2815:42:10, 10.17s/it, lr=1e-5, step_loss=0.0554]
Steps: 0%| | 2857/1000000 [7:13:04<2334:57:50, 8.43s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [2857], local_loss=0.017499519512057304, train_loss=0.040440600365400314, time_cost=1.293126106262207
+
Steps: 0%| | 2857/1000000 [7:13:04<2334:57:50, 8.43s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 2858/1000000 [7:13:15<2540:48:14, 9.17s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [2858], local_loss=0.1907159686088562, train_loss=0.05000779777765274, time_cost=2.1268503665924072
+
Steps: 0%| | 2858/1000000 [7:13:15<2540:48:14, 9.17s/it, lr=1e-5, step_loss=0.191]
Steps: 0%| | 2859/1000000 [7:13:20<2190:31:14, 7.91s/it, lr=1e-5, step_loss=0.191][RANK-0]: Step: [2859], local_loss=0.0541207455098629, train_loss=0.027038484811782837, time_cost=1.2608051300048828
+
Steps: 0%| | 2859/1000000 [7:13:20<2190:31:14, 7.91s/it, lr=1e-5, step_loss=0.0541]
Steps: 0%| | 2860/1000000 [7:13:28<2215:58:04, 8.00s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [2860], local_loss=0.01884787157177925, train_loss=0.0325598269701004, time_cost=6.404985427856445
+
Steps: 0%| | 2860/1000000 [7:13:28<2215:58:04, 8.00s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 2861/1000000 [7:13:36<2245:46:34, 8.11s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [2861], local_loss=0.03812278062105179, train_loss=0.03573545068502426, time_cost=1.405494213104248
+
Steps: 0%| | 2861/1000000 [7:13:36<2245:46:34, 8.11s/it, lr=1e-5, step_loss=0.0381]
Steps: 0%| | 2862/1000000 [7:13:42<2041:39:54, 7.37s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [2862], local_loss=0.1063375174999237, train_loss=0.047429442405700684, time_cost=2.864191770553589
+
Steps: 0%| | 2862/1000000 [7:13:42<2041:39:54, 7.37s/it, lr=1e-5, step_loss=0.106]
Steps: 0%| | 2863/1000000 [7:13:51<2162:58:18, 7.81s/it, lr=1e-5, step_loss=0.106][RANK-0]: Step: [2863], local_loss=0.03947573900222778, train_loss=0.06139606982469559, time_cost=3.050051689147949
+
Steps: 0%| | 2863/1000000 [7:13:51<2162:58:18, 7.81s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 2864/1000000 [7:13:56<1928:20:14, 6.96s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [2864], local_loss=0.44337010383605957, train_loss=0.1215924620628357, time_cost=1.9608333110809326
+
Steps: 0%| | 2864/1000000 [7:13:56<1928:20:14, 6.96s/it, lr=1e-5, step_loss=0.443]
Steps: 0%| | 2865/1000000 [7:14:05<2094:21:56, 7.56s/it, lr=1e-5, step_loss=0.443][RANK-0]: Step: [2865], local_loss=0.15312494337558746, train_loss=0.10258843004703522, time_cost=3.191680908203125
+
Steps: 0%| | 2865/1000000 [7:14:05<2094:21:56, 7.56s/it, lr=1e-5, step_loss=0.153]
Steps: 0%| | 2866/1000000 [7:14:10<1929:32:00, 6.97s/it, lr=1e-5, step_loss=0.153][RANK-0]: Step: [2866], local_loss=0.011909221298992634, train_loss=36.09782028198242, time_cost=1.2099952697753906
+
Steps: 0%| | 2866/1000000 [7:14:10<1929:32:00, 6.97s/it, lr=1e-5, step_loss=0.0119]
Steps: 0%| | 2867/1000000 [7:14:16<1789:21:30, 6.46s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [2867], local_loss=0.13151414692401886, train_loss=0.05649926885962486, time_cost=2.285740375518799
+
Steps: 0%| | 2867/1000000 [7:14:16<1789:21:30, 6.46s/it, lr=1e-5, step_loss=0.132]
Steps: 0%| | 2868/1000000 [7:14:29<2350:40:40, 8.49s/it, lr=1e-5, step_loss=0.132][RANK-0]: Step: [2868], local_loss=0.029831353574991226, train_loss=2.5162582397460938, time_cost=3.115709066390991
+
Steps: 0%| | 2868/1000000 [7:14:29<2350:40:40, 8.49s/it, lr=1e-5, step_loss=0.0298]
Steps: 0%| | 2869/1000000 [7:14:38<2430:52:45, 8.78s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [2869], local_loss=0.02029413729906082, train_loss=0.041916415095329285, time_cost=2.6898353099823
+
Steps: 0%| | 2869/1000000 [7:14:38<2430:52:45, 8.78s/it, lr=1e-5, step_loss=0.0203]
Steps: 0%| | 2870/1000000 [7:14:44<2149:45:13, 7.76s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [2870], local_loss=0.05863718315958977, train_loss=0.05937645584344864, time_cost=2.925259590148926
+
Steps: 0%| | 2870/1000000 [7:14:44<2149:45:13, 7.76s/it, lr=1e-5, step_loss=0.0586]
Steps: 0%| | 2871/1000000 [7:14:51<2122:43:06, 7.66s/it, lr=1e-5, step_loss=0.0586][RANK-0]: Step: [2871], local_loss=0.020577101036906242, train_loss=0.15528467297554016, time_cost=1.2095835208892822
+
Steps: 0%| | 2871/1000000 [7:14:51<2122:43:06, 7.66s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 2872/1000000 [7:14:59<2137:04:15, 7.72s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [2872], local_loss=0.031867705285549164, train_loss=0.02621607854962349, time_cost=3.878053665161133
+
Steps: 0%| | 2872/1000000 [7:14:59<2137:04:15, 7.72s/it, lr=1e-5, step_loss=0.0319]
Steps: 0%| | 2873/1000000 [7:15:11<2502:43:06, 9.04s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [2873], local_loss=0.017670564353466034, train_loss=0.03631056845188141, time_cost=5.033742427825928
+
Steps: 0%| | 2873/1000000 [7:15:11<2502:43:06, 9.04s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 2874/1000000 [7:15:26<2987:15:51, 10.79s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [2874], local_loss=0.022627152502536774, train_loss=9.369024276733398, time_cost=1.2183725833892822
+
Steps: 0%| | 2874/1000000 [7:15:26<2987:15:51, 10.79s/it, lr=1e-5, step_loss=0.0226]
Steps: 0%| | 2875/1000000 [7:15:38<3125:06:35, 11.28s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [2875], local_loss=0.038919903337955475, train_loss=0.0274943970143795, time_cost=1.2321100234985352
+
Steps: 0%| | 2875/1000000 [7:15:38<3125:06:35, 11.28s/it, lr=1e-5, step_loss=0.0389]
Steps: 0%| | 2876/1000000 [7:15:48<2982:40:54, 10.77s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [2876], local_loss=0.14864656329154968, train_loss=0.04846450686454773, time_cost=3.6822192668914795
+
Steps: 0%| | 2876/1000000 [7:15:48<2982:40:54, 10.77s/it, lr=1e-5, step_loss=0.149]
Steps: 0%| | 2877/1000000 [7:15:52<2450:36:02, 8.85s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [2877], local_loss=0.14669817686080933, train_loss=0.04937790334224701, time_cost=1.6401376724243164
+
Steps: 0%| | 2877/1000000 [7:15:52<2450:36:02, 8.85s/it, lr=1e-5, step_loss=0.147]
Steps: 0%| | 2878/1000000 [7:15:59<2286:50:26, 8.26s/it, lr=1e-5, step_loss=0.147][RANK-0]: Step: [2878], local_loss=0.024463839828968048, train_loss=0.05146603286266327, time_cost=2.395275115966797
+
Steps: 0%| | 2878/1000000 [7:15:59<2286:50:26, 8.26s/it, lr=1e-5, step_loss=0.0245]
Steps: 0%| | 2879/1000000 [7:16:04<2007:45:16, 7.25s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [2879], local_loss=0.16108383238315582, train_loss=0.05989881977438927, time_cost=2.058840751647949
+
Steps: 0%| | 2879/1000000 [7:16:04<2007:45:16, 7.25s/it, lr=1e-5, step_loss=0.161]
Steps: 0%| | 2880/1000000 [7:16:14<2267:19:42, 8.19s/it, lr=1e-5, step_loss=0.161][RANK-0]: Step: [2880], local_loss=0.03932914510369301, train_loss=0.050191350281238556, time_cost=3.291355848312378
+
Steps: 0%| | 2880/1000000 [7:16:14<2267:19:42, 8.19s/it, lr=1e-5, step_loss=0.0393]
Steps: 0%| | 2881/1000000 [7:16:26<2560:15:02, 9.24s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [2881], local_loss=0.018252484500408173, train_loss=0.07099471241235733, time_cost=2.4876251220703125
+
Steps: 0%| | 2881/1000000 [7:16:26<2560:15:02, 9.24s/it, lr=1e-5, step_loss=0.0183]
Steps: 0%| | 2882/1000000 [7:16:34<2413:20:59, 8.71s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [2882], local_loss=0.1601238250732422, train_loss=0.09642143547534943, time_cost=2.91414737701416
+
Steps: 0%| | 2882/1000000 [7:16:34<2413:20:59, 8.71s/it, lr=1e-5, step_loss=0.16]
Steps: 0%| | 2883/1000000 [7:16:40<2179:54:29, 7.87s/it, lr=1e-5, step_loss=0.16][RANK-0]: Step: [2883], local_loss=0.04665279760956764, train_loss=0.04637280851602554, time_cost=3.9185431003570557
+
Steps: 0%| | 2883/1000000 [7:16:40<2179:54:29, 7.87s/it, lr=1e-5, step_loss=0.0467]
Steps: 0%| | 2884/1000000 [7:16:53<2660:12:53, 9.60s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [2884], local_loss=0.10321047902107239, train_loss=0.06432948261499405, time_cost=1.3096749782562256
+
Steps: 0%| | 2884/1000000 [7:16:53<2660:12:53, 9.60s/it, lr=1e-5, step_loss=0.103]
Steps: 0%| | 2885/1000000 [7:17:06<2916:41:43, 10.53s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [2885], local_loss=0.01837136223912239, train_loss=0.1024884432554245, time_cost=1.2840678691864014
+
Steps: 0%| | 2885/1000000 [7:17:06<2916:41:43, 10.53s/it, lr=1e-5, step_loss=0.0184]
Steps: 0%| | 2886/1000000 [7:17:21<3273:29:18, 11.82s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [2886], local_loss=0.02029002085328102, train_loss=0.05263320729136467, time_cost=6.098123550415039
+
Steps: 0%| | 2886/1000000 [7:17:21<3273:29:18, 11.82s/it, lr=1e-5, step_loss=0.0203]
Steps: 0%| | 2887/1000000 [7:17:28<2875:21:39, 10.38s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [2887], local_loss=0.06137699633836746, train_loss=0.03322553634643555, time_cost=2.2662346363067627
+
Steps: 0%| | 2887/1000000 [7:17:28<2875:21:39, 10.38s/it, lr=1e-5, step_loss=0.0614]
Steps: 0%| | 2888/1000000 [7:17:46<3509:01:45, 12.67s/it, lr=1e-5, step_loss=0.0614][RANK-0]: Step: [2888], local_loss=0.046148333698511124, train_loss=0.1553393453359604, time_cost=1.2929189205169678
+
Steps: 0%| | 2888/1000000 [7:17:46<3509:01:45, 12.67s/it, lr=1e-5, step_loss=0.0461]
Steps: 0%| | 2889/1000000 [7:17:57<3395:33:14, 12.26s/it, lr=1e-5, step_loss=0.0461][RANK-0]: Step: [2889], local_loss=0.07045572251081467, train_loss=0.05087373033165932, time_cost=3.4352831840515137
+
Steps: 0%| | 2889/1000000 [7:17:57<3395:33:14, 12.26s/it, lr=1e-5, step_loss=0.0705]
Steps: 0%| | 2890/1000000 [7:18:10<3431:46:00, 12.39s/it, lr=1e-5, step_loss=0.0705][RANK-0]: Step: [2890], local_loss=0.020440073683857918, train_loss=0.040039896965026855, time_cost=3.8241631984710693
+
Steps: 0%| | 2890/1000000 [7:18:10<3431:46:00, 12.39s/it, lr=1e-5, step_loss=0.0204]
Steps: 0%| | 2891/1000000 [7:18:20<3241:04:54, 11.70s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [2891], local_loss=0.01676841825246811, train_loss=0.18377289175987244, time_cost=1.2336153984069824
+
Steps: 0%| | 2891/1000000 [7:18:20<3241:04:54, 11.70s/it, lr=1e-5, step_loss=0.0168]
Steps: 0%| | 2892/1000000 [7:18:34<3434:00:07, 12.40s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [2892], local_loss=0.008206394501030445, train_loss=0.06873247027397156, time_cost=5.644583463668823
+
Steps: 0%| | 2892/1000000 [7:18:34<3434:00:07, 12.40s/it, lr=1e-5, step_loss=0.00821]
Steps: 0%| | 2893/1000000 [7:18:48<3552:14:05, 12.83s/it, lr=1e-5, step_loss=0.00821][RANK-0]: Step: [2893], local_loss=0.06668782979249954, train_loss=0.12775826454162598, time_cost=1.268373727798462
+
Steps: 0%| | 2893/1000000 [7:18:48<3552:14:05, 12.83s/it, lr=1e-5, step_loss=0.0667]
Steps: 0%| | 2894/1000000 [7:19:03<3723:52:46, 13.44s/it, lr=1e-5, step_loss=0.0667][RANK-0]: Step: [2894], local_loss=0.04338928312063217, train_loss=0.02600095048546791, time_cost=4.776020526885986
+
Steps: 0%| | 2894/1000000 [7:19:03<3723:52:46, 13.44s/it, lr=1e-5, step_loss=0.0434]
Steps: 0%| | 2895/1000000 [7:19:10<3212:21:06, 11.60s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [2895], local_loss=0.01504932064563036, train_loss=0.026457859203219414, time_cost=5.275773048400879
+
Steps: 0%| | 2895/1000000 [7:19:10<3212:21:06, 11.60s/it, lr=1e-5, step_loss=0.015]
Steps: 0%| | 2896/1000000 [7:19:19<2973:22:12, 10.74s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [2896], local_loss=0.05474451556801796, train_loss=0.07827252894639969, time_cost=1.5880873203277588
+
Steps: 0%| | 2896/1000000 [7:19:19<2973:22:12, 10.74s/it, lr=1e-5, step_loss=0.0547]
Steps: 0%| | 2897/1000000 [7:19:32<3155:27:56, 11.39s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [2897], local_loss=0.016124548390507698, train_loss=0.0551648810505867, time_cost=1.265066146850586
+
Steps: 0%| | 2897/1000000 [7:19:32<3155:27:56, 11.39s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 2898/1000000 [7:19:43<3165:46:31, 11.43s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [2898], local_loss=0.041235607117414474, train_loss=0.036699842661619186, time_cost=2.1179614067077637
+
Steps: 0%| | 2898/1000000 [7:19:43<3165:46:31, 11.43s/it, lr=1e-5, step_loss=0.0412]
Steps: 0%| | 2899/1000000 [7:19:52<2959:25:09, 10.68s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [2899], local_loss=0.04173355922102928, train_loss=0.08904936164617538, time_cost=3.5456604957580566
+
Steps: 0%| | 2899/1000000 [7:19:52<2959:25:09, 10.68s/it, lr=1e-5, step_loss=0.0417]
Steps: 0%| | 2900/1000000 [7:20:10<3568:26:57, 12.88s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [2900], local_loss=0.016156909987330437, train_loss=0.04896301031112671, time_cost=3.05893874168396
+
Steps: 0%| | 2900/1000000 [7:20:10<3568:26:57, 12.88s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 2901/1000000 [7:20:20<3298:07:59, 11.91s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [2901], local_loss=0.02442701905965805, train_loss=0.10361963510513306, time_cost=1.1924951076507568
+
Steps: 0%| | 2901/1000000 [7:20:20<3298:07:59, 11.91s/it, lr=1e-5, step_loss=0.0244]
Steps: 0%| | 2902/1000000 [7:20:31<3239:30:20, 11.70s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [2902], local_loss=0.020917408168315887, train_loss=0.16136959195137024, time_cost=1.3885979652404785
+
Steps: 0%| | 2902/1000000 [7:20:31<3239:30:20, 11.70s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 2903/1000000 [7:20:45<3436:34:02, 12.41s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [2903], local_loss=0.04468044266104698, train_loss=0.18285022675991058, time_cost=4.258156776428223
+
Steps: 0%| | 2903/1000000 [7:20:45<3436:34:02, 12.41s/it, lr=1e-5, step_loss=0.0447]
Steps: 0%| | 2904/1000000 [7:20:59<3554:56:24, 12.84s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [2904], local_loss=0.02248547226190567, train_loss=0.05890218913555145, time_cost=2.926790475845337
+
Steps: 0%| | 2904/1000000 [7:20:59<3554:56:24, 12.84s/it, lr=1e-5, step_loss=0.0225]
Steps: 0%| | 2905/1000000 [7:21:05<2975:57:19, 10.74s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [2905], local_loss=0.05863222852349281, train_loss=0.07549576461315155, time_cost=1.5838284492492676
+
Steps: 0%| | 2905/1000000 [7:21:05<2975:57:19, 10.74s/it, lr=1e-5, step_loss=0.0586]
Steps: 0%| | 2906/1000000 [7:21:16<3018:31:42, 10.90s/it, lr=1e-5, step_loss=0.0586][RANK-0]: Step: [2906], local_loss=0.06856793165206909, train_loss=0.13600407540798187, time_cost=1.2775211334228516
+
Steps: 0%| | 2906/1000000 [7:21:16<3018:31:42, 10.90s/it, lr=1e-5, step_loss=0.0686]
Steps: 0%| | 2907/1000000 [7:21:31<3377:26:24, 12.19s/it, lr=1e-5, step_loss=0.0686][RANK-0]: Step: [2907], local_loss=0.30759528279304504, train_loss=0.09898732602596283, time_cost=7.670982599258423
+
Steps: 0%| | 2907/1000000 [7:21:31<3377:26:24, 12.19s/it, lr=1e-5, step_loss=0.308]
Steps: 0%| | 2908/1000000 [7:21:43<3369:59:14, 12.17s/it, lr=1e-5, step_loss=0.308][RANK-0]: Step: [2908], local_loss=0.015363487415015697, train_loss=0.09643140435218811, time_cost=1.2100090980529785
+
Steps: 0%| | 2908/1000000 [7:21:43<3369:59:14, 12.17s/it, lr=1e-5, step_loss=0.0154]
Steps: 0%| | 2909/1000000 [7:21:55<3324:33:59, 12.00s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [2909], local_loss=0.02054022066295147, train_loss=0.03500718995928764, time_cost=4.470508575439453
+
Steps: 0%| | 2909/1000000 [7:21:55<3324:33:59, 12.00s/it, lr=1e-5, step_loss=0.0205]
Steps: 0%| | 2910/1000000 [7:22:09<3503:21:01, 12.65s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [2910], local_loss=0.031992580741643906, train_loss=0.028043033555150032, time_cost=4.820876359939575
+
Steps: 0%| | 2910/1000000 [7:22:09<3503:21:01, 12.65s/it, lr=1e-5, step_loss=0.032]
Steps: 0%| | 2911/1000000 [7:22:22<3553:54:02, 12.83s/it, lr=1e-5, step_loss=0.032][RANK-0]: Step: [2911], local_loss=0.02475748397409916, train_loss=0.06817855685949326, time_cost=3.608293294906616
+
Steps: 0%| | 2911/1000000 [7:22:22<3553:54:02, 12.83s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 2912/1000000 [7:22:36<3633:05:01, 13.12s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [2912], local_loss=0.022783834487199783, train_loss=0.029587559401988983, time_cost=3.5993499755859375
+
Steps: 0%| | 2912/1000000 [7:22:36<3633:05:01, 13.12s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 2913/1000000 [7:22:41<2923:07:44, 10.55s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [2913], local_loss=0.018082380294799805, train_loss=9.343562126159668, time_cost=2.243222713470459
+
Steps: 0%| | 2913/1000000 [7:22:41<2923:07:44, 10.55s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 2914/1000000 [7:22:50<2856:17:17, 10.31s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [2914], local_loss=0.09045758843421936, train_loss=0.16471342742443085, time_cost=4.127847671508789
+
Steps: 0%| | 2914/1000000 [7:22:50<2856:17:17, 10.31s/it, lr=1e-5, step_loss=0.0905]
Steps: 0%| | 2915/1000000 [7:22:55<2426:35:35, 8.76s/it, lr=1e-5, step_loss=0.0905][RANK-0]: Step: [2915], local_loss=0.017849009484052658, train_loss=0.03370748087763786, time_cost=1.2125661373138428
+
Steps: 0%| | 2915/1000000 [7:22:55<2426:35:35, 8.76s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 2916/1000000 [7:23:01<2119:12:40, 7.65s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [2916], local_loss=0.04733329266309738, train_loss=0.051713988184928894, time_cost=1.2166717052459717
+
Steps: 0%| | 2916/1000000 [7:23:01<2119:12:40, 7.65s/it, lr=1e-5, step_loss=0.0473]
Steps: 0%| | 2917/1000000 [7:23:06<1911:18:06, 6.90s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [2917], local_loss=0.06458049267530441, train_loss=0.0885237380862236, time_cost=2.4643945693969727
+
Steps: 0%| | 2917/1000000 [7:23:06<1911:18:06, 6.90s/it, lr=1e-5, step_loss=0.0646]
Steps: 0%| | 2918/1000000 [7:23:15<2115:56:04, 7.64s/it, lr=1e-5, step_loss=0.0646][RANK-0]: Step: [2918], local_loss=0.04619154334068298, train_loss=0.13160043954849243, time_cost=3.072896718978882
+
Steps: 0%| | 2918/1000000 [7:23:15<2115:56:04, 7.64s/it, lr=1e-5, step_loss=0.0462]
Steps: 0%| | 2919/1000000 [7:23:24<2220:06:13, 8.02s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [2919], local_loss=0.07797711342573166, train_loss=0.10641475021839142, time_cost=2.1967151165008545
+
Steps: 0%| | 2919/1000000 [7:23:24<2220:06:13, 8.02s/it, lr=1e-5, step_loss=0.078]
Steps: 0%| | 2920/1000000 [7:23:33<2290:15:47, 8.27s/it, lr=1e-5, step_loss=0.078][RANK-0]: Step: [2920], local_loss=0.05357363075017929, train_loss=0.07490266114473343, time_cost=3.2763965129852295
+
Steps: 0%| | 2920/1000000 [7:23:33<2290:15:47, 8.27s/it, lr=1e-5, step_loss=0.0536]
Steps: 0%| | 2921/1000000 [7:23:38<2071:39:36, 7.48s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [2921], local_loss=0.02897682785987854, train_loss=3.7957730293273926, time_cost=1.3193325996398926
+
Steps: 0%| | 2921/1000000 [7:23:38<2071:39:36, 7.48s/it, lr=1e-5, step_loss=0.029]
Steps: 0%| | 2922/1000000 [7:23:46<2039:52:01, 7.37s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [2922], local_loss=0.03945207968354225, train_loss=0.06778772920370102, time_cost=2.6237640380859375
+
Steps: 0%| | 2922/1000000 [7:23:46<2039:52:01, 7.37s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 2923/1000000 [7:23:50<1819:43:11, 6.57s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [2923], local_loss=0.018211524933576584, train_loss=0.04276707023382187, time_cost=1.6452972888946533
+
Steps: 0%| | 2923/1000000 [7:23:50<1819:43:11, 6.57s/it, lr=1e-5, step_loss=0.0182]
Steps: 0%| | 2924/1000000 [7:24:00<2057:08:25, 7.43s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [2924], local_loss=0.026564033702015877, train_loss=0.0445852056145668, time_cost=1.31256103515625
+
Steps: 0%| | 2924/1000000 [7:24:00<2057:08:25, 7.43s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 2925/1000000 [7:24:13<2558:38:49, 9.24s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [2925], local_loss=0.049336664378643036, train_loss=0.04832310974597931, time_cost=4.820659399032593
+
Steps: 0%| | 2925/1000000 [7:24:13<2558:38:49, 9.24s/it, lr=1e-5, step_loss=0.0493]
Steps: 0%| | 2926/1000000 [7:24:24<2682:55:33, 9.69s/it, lr=1e-5, step_loss=0.0493][RANK-0]: Step: [2926], local_loss=0.014092226512730122, train_loss=0.16052421927452087, time_cost=3.60040545463562
+
Steps: 0%| | 2926/1000000 [7:24:24<2682:55:33, 9.69s/it, lr=1e-5, step_loss=0.0141]
Steps: 0%| | 2927/1000000 [7:24:38<3040:15:02, 10.98s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [2927], local_loss=0.05019262060523033, train_loss=0.15708430111408234, time_cost=4.102905035018921
+
Steps: 0%| | 2927/1000000 [7:24:38<3040:15:02, 10.98s/it, lr=1e-5, step_loss=0.0502]
Steps: 0%| | 2928/1000000 [7:24:49<3046:02:14, 11.00s/it, lr=1e-5, step_loss=0.0502][RANK-0]: Step: [2928], local_loss=0.13726906478405, train_loss=0.05232499912381172, time_cost=2.3746023178100586
+
Steps: 0%| | 2928/1000000 [7:24:49<3046:02:14, 11.00s/it, lr=1e-5, step_loss=0.137]
Steps: 0%| | 2929/1000000 [7:25:02<3179:42:48, 11.48s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [2929], local_loss=0.08176450431346893, train_loss=0.05928371101617813, time_cost=5.413536310195923
+
Steps: 0%| | 2929/1000000 [7:25:02<3179:42:48, 11.48s/it, lr=1e-5, step_loss=0.0818]
Steps: 0%| | 2930/1000000 [7:25:14<3278:59:38, 11.84s/it, lr=1e-5, step_loss=0.0818][RANK-0]: Step: [2930], local_loss=0.03627849742770195, train_loss=0.07202845811843872, time_cost=3.2033228874206543
+
Steps: 0%| | 2930/1000000 [7:25:14<3278:59:38, 11.84s/it, lr=1e-5, step_loss=0.0363]
Steps: 0%| | 2931/1000000 [7:25:23<3035:57:58, 10.96s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [2931], local_loss=0.03682847321033478, train_loss=0.02539471909403801, time_cost=1.2365014553070068
+
Steps: 0%| | 2931/1000000 [7:25:23<3035:57:58, 10.96s/it, lr=1e-5, step_loss=0.0368]
Steps: 0%| | 2932/1000000 [7:25:29<2618:16:47, 9.45s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [2932], local_loss=0.023298529908061028, train_loss=0.03601735457777977, time_cost=1.7931287288665771
+
Steps: 0%| | 2932/1000000 [7:25:29<2618:16:47, 9.45s/it, lr=1e-5, step_loss=0.0233]
Steps: 0%| | 2933/1000000 [7:25:35<2307:25:47, 8.33s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [2933], local_loss=0.03269842267036438, train_loss=0.055591240525245667, time_cost=1.238452434539795
+
Steps: 0%| | 2933/1000000 [7:25:35<2307:25:47, 8.33s/it, lr=1e-5, step_loss=0.0327]
Steps: 0%| | 2934/1000000 [7:25:44<2380:00:48, 8.59s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [2934], local_loss=0.13033939898014069, train_loss=0.06822174042463303, time_cost=1.2466721534729004
+
Steps: 0%| | 2934/1000000 [7:25:44<2380:00:48, 8.59s/it, lr=1e-5, step_loss=0.13]
Steps: 0%| | 2935/1000000 [7:25:57<2741:08:03, 9.90s/it, lr=1e-5, step_loss=0.13][RANK-0]: Step: [2935], local_loss=0.06204413250088692, train_loss=0.04593590646982193, time_cost=3.430039167404175
+
Steps: 0%| | 2935/1000000 [7:25:57<2741:08:03, 9.90s/it, lr=1e-5, step_loss=0.062]
Steps: 0%| | 2936/1000000 [7:26:04<2487:41:21, 8.98s/it, lr=1e-5, step_loss=0.062][RANK-0]: Step: [2936], local_loss=0.013443628326058388, train_loss=0.027800383046269417, time_cost=2.7933433055877686
+
Steps: 0%| | 2936/1000000 [7:26:04<2487:41:21, 8.98s/it, lr=1e-5, step_loss=0.0134]
Steps: 0%| | 2937/1000000 [7:26:13<2500:16:56, 9.03s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [2937], local_loss=0.02426191233098507, train_loss=0.06813742965459824, time_cost=3.0627171993255615
+
Steps: 0%| | 2937/1000000 [7:26:13<2500:16:56, 9.03s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 2938/1000000 [7:26:24<2672:56:30, 9.65s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [2938], local_loss=0.023009121417999268, train_loss=0.05640536546707153, time_cost=1.939225673675537
+
Steps: 0%| | 2938/1000000 [7:26:24<2672:56:30, 9.65s/it, lr=1e-5, step_loss=0.023]
Steps: 0%| | 2939/1000000 [7:26:33<2643:44:50, 9.55s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [2939], local_loss=0.07507448643445969, train_loss=0.032871030271053314, time_cost=2.7894675731658936
+
Steps: 0%| | 2939/1000000 [7:26:33<2643:44:50, 9.55s/it, lr=1e-5, step_loss=0.0751]
Steps: 0%| | 2940/1000000 [7:26:41<2499:03:10, 9.02s/it, lr=1e-5, step_loss=0.0751][RANK-0]: Step: [2940], local_loss=0.027959594503045082, train_loss=0.045983850955963135, time_cost=2.856940269470215
+
Steps: 0%| | 2940/1000000 [7:26:41<2499:03:10, 9.02s/it, lr=1e-5, step_loss=0.028]
Steps: 0%| | 2941/1000000 [7:26:48<2288:06:41, 8.26s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [2941], local_loss=0.015755444765090942, train_loss=0.041794050484895706, time_cost=2.3123607635498047
+
Steps: 0%| | 2941/1000000 [7:26:48<2288:06:41, 8.26s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 2942/1000000 [7:26:55<2251:06:45, 8.13s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [2942], local_loss=0.019584672525525093, train_loss=18.99985694885254, time_cost=1.2097067832946777
+
Steps: 0%| | 2942/1000000 [7:26:55<2251:06:45, 8.13s/it, lr=1e-5, step_loss=0.0196]
Steps: 0%| | 2943/1000000 [7:27:01<2010:44:20, 7.26s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [2943], local_loss=0.02603868395090103, train_loss=0.03242453187704086, time_cost=1.23233962059021
+
Steps: 0%| | 2943/1000000 [7:27:01<2010:44:20, 7.26s/it, lr=1e-5, step_loss=0.026]
Steps: 0%| | 2944/1000000 [7:27:07<1978:20:45, 7.14s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [2944], local_loss=0.06389322876930237, train_loss=0.08542953431606293, time_cost=2.4366190433502197
+
Steps: 0%| | 2944/1000000 [7:27:07<1978:20:45, 7.14s/it, lr=1e-5, step_loss=0.0639]
Steps: 0%| | 2945/1000000 [7:27:17<2162:46:35, 7.81s/it, lr=1e-5, step_loss=0.0639][RANK-0]: Step: [2945], local_loss=0.014532960020005703, train_loss=0.047248050570487976, time_cost=4.282250165939331
+
Steps: 0%| | 2945/1000000 [7:27:17<2162:46:35, 7.81s/it, lr=1e-5, step_loss=0.0145]
Steps: 0%| | 2946/1000000 [7:27:27<2377:39:02, 8.58s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [2946], local_loss=0.019263306632637978, train_loss=0.039025820791721344, time_cost=6.100034236907959
+
Steps: 0%| | 2946/1000000 [7:27:27<2377:39:02, 8.58s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 2947/1000000 [7:27:32<2025:56:01, 7.31s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [2947], local_loss=0.038603704422712326, train_loss=0.04017895832657814, time_cost=1.5445361137390137
+
Steps: 0%| | 2947/1000000 [7:27:32<2025:56:01, 7.31s/it, lr=1e-5, step_loss=0.0386]
Steps: 0%| | 2948/1000000 [7:27:43<2335:06:08, 8.43s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [2948], local_loss=0.04650970548391342, train_loss=0.10073433816432953, time_cost=1.2311906814575195
+
Steps: 0%| | 2948/1000000 [7:27:43<2335:06:08, 8.43s/it, lr=1e-5, step_loss=0.0465]
Steps: 0%| | 2949/1000000 [7:27:48<2064:31:31, 7.45s/it, lr=1e-5, step_loss=0.0465][RANK-0]: Step: [2949], local_loss=0.04200395941734314, train_loss=0.027572181075811386, time_cost=2.2808613777160645
+
Steps: 0%| | 2949/1000000 [7:27:48<2064:31:31, 7.45s/it, lr=1e-5, step_loss=0.042]
Steps: 0%| | 2950/1000000 [7:27:55<2053:51:29, 7.42s/it, lr=1e-5, step_loss=0.042][RANK-0]: Step: [2950], local_loss=0.036173246800899506, train_loss=0.0800565704703331, time_cost=3.5586845874786377
+
Steps: 0%| | 2950/1000000 [7:27:55<2053:51:29, 7.42s/it, lr=1e-5, step_loss=0.0362]
Steps: 0%| | 2951/1000000 [7:28:10<2701:27:47, 9.75s/it, lr=1e-5, step_loss=0.0362][RANK-0]: Step: [2951], local_loss=0.047086600214242935, train_loss=0.05315833166241646, time_cost=1.2746312618255615
+
Steps: 0%| | 2951/1000000 [7:28:10<2701:27:47, 9.75s/it, lr=1e-5, step_loss=0.0471]
Steps: 0%| | 2952/1000000 [7:28:19<2631:23:56, 9.50s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [2952], local_loss=0.0806158110499382, train_loss=0.047866806387901306, time_cost=6.924638509750366
+
Steps: 0%| | 2952/1000000 [7:28:19<2631:23:56, 9.50s/it, lr=1e-5, step_loss=0.0806]
Steps: 0%| | 2953/1000000 [7:28:29<2668:55:07, 9.64s/it, lr=1e-5, step_loss=0.0806][RANK-0]: Step: [2953], local_loss=0.08459720760583878, train_loss=0.19305568933486938, time_cost=5.455454587936401
+
Steps: 0%| | 2953/1000000 [7:28:29<2668:55:07, 9.64s/it, lr=1e-5, step_loss=0.0846]
Steps: 0%| | 2954/1000000 [7:28:36<2432:46:37, 8.78s/it, lr=1e-5, step_loss=0.0846][RANK-0]: Step: [2954], local_loss=0.032993514090776443, train_loss=0.044569022953510284, time_cost=3.0978848934173584
+
Steps: 0%| | 2954/1000000 [7:28:36<2432:46:37, 8.78s/it, lr=1e-5, step_loss=0.033]
Steps: 0%| | 2955/1000000 [7:28:52<3007:08:04, 10.86s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [2955], local_loss=0.153956800699234, train_loss=0.05877460166811943, time_cost=8.098284482955933
+
Steps: 0%| | 2955/1000000 [7:28:52<3007:08:04, 10.86s/it, lr=1e-5, step_loss=0.154]
Steps: 0%| | 2956/1000000 [7:28:57<2509:18:06, 9.06s/it, lr=1e-5, step_loss=0.154][RANK-0]: Step: [2956], local_loss=0.029386088252067566, train_loss=0.04389844462275505, time_cost=1.8767735958099365
+
Steps: 0%| | 2956/1000000 [7:28:57<2509:18:06, 9.06s/it, lr=1e-5, step_loss=0.0294]
Steps: 0%| | 2957/1000000 [7:29:08<2713:01:38, 9.80s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [2957], local_loss=0.024521609768271446, train_loss=0.062241826206445694, time_cost=3.9182982444763184
+
Steps: 0%| | 2957/1000000 [7:29:08<2713:01:38, 9.80s/it, lr=1e-5, step_loss=0.0245]
Steps: 0%| | 2958/1000000 [7:29:13<2306:28:05, 8.33s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [2958], local_loss=0.04982583597302437, train_loss=0.11846524477005005, time_cost=1.7971758842468262
+
Steps: 0%| | 2958/1000000 [7:29:13<2306:28:05, 8.33s/it, lr=1e-5, step_loss=0.0498]
Steps: 0%| | 2959/1000000 [7:29:18<2051:21:53, 7.41s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [2959], local_loss=337.352294921875, train_loss=42.19588088989258, time_cost=2.5202388763427734
+
Steps: 0%| | 2959/1000000 [7:29:18<2051:21:53, 7.41s/it, lr=1e-5, step_loss=337]
Steps: 0%| | 2960/1000000 [7:29:32<2563:06:08, 9.25s/it, lr=1e-5, step_loss=337][RANK-0]: Step: [2960], local_loss=0.02334091253578663, train_loss=0.03634941205382347, time_cost=5.095054388046265
+
Steps: 0%| | 2960/1000000 [7:29:32<2563:06:08, 9.25s/it, lr=1e-5, step_loss=0.0233]
Steps: 0%| | 2961/1000000 [7:29:44<2798:21:59, 10.10s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [2961], local_loss=0.05859924480319023, train_loss=0.03265521675348282, time_cost=1.9371731281280518
+
Steps: 0%| | 2961/1000000 [7:29:44<2798:21:59, 10.10s/it, lr=1e-5, step_loss=0.0586]
Steps: 0%| | 2962/1000000 [7:29:53<2707:41:12, 9.78s/it, lr=1e-5, step_loss=0.0586][RANK-0]: Step: [2962], local_loss=0.026716802269220352, train_loss=18.57862663269043, time_cost=4.466440200805664
+
Steps: 0%| | 2962/1000000 [7:29:53<2707:41:12, 9.78s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 2963/1000000 [7:29:58<2315:11:08, 8.36s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [2963], local_loss=0.019483009353280067, train_loss=0.06237247586250305, time_cost=1.2371790409088135
+
Steps: 0%| | 2963/1000000 [7:29:58<2315:11:08, 8.36s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 2964/1000000 [7:30:07<2369:01:20, 8.55s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [2964], local_loss=0.027426961809396744, train_loss=0.050089865922927856, time_cost=1.3067729473114014
+
Steps: 0%| | 2964/1000000 [7:30:07<2369:01:20, 8.55s/it, lr=1e-5, step_loss=0.0274]
Steps: 0%| | 2965/1000000 [7:30:12<2112:48:50, 7.63s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [2965], local_loss=0.009536858648061752, train_loss=0.03189034014940262, time_cost=2.5137031078338623
+
Steps: 0%| | 2965/1000000 [7:30:12<2112:48:50, 7.63s/it, lr=1e-5, step_loss=0.00954]
Steps: 0%| | 2966/1000000 [7:30:21<2208:54:56, 7.98s/it, lr=1e-5, step_loss=0.00954][RANK-0]: Step: [2966], local_loss=0.45426249504089355, train_loss=0.0829414576292038, time_cost=6.840414524078369
+
Steps: 0%| | 2966/1000000 [7:30:21<2208:54:56, 7.98s/it, lr=1e-5, step_loss=0.454]
Steps: 0%| | 2967/1000000 [7:30:30<2307:29:28, 8.33s/it, lr=1e-5, step_loss=0.454][RANK-0]: Step: [2967], local_loss=0.024766474962234497, train_loss=0.034666143357753754, time_cost=3.049539804458618
+
Steps: 0%| | 2967/1000000 [7:30:30<2307:29:28, 8.33s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 2968/1000000 [7:30:43<2675:48:12, 9.66s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [2968], local_loss=0.04158184304833412, train_loss=0.11799278855323792, time_cost=1.2468361854553223
+
Steps: 0%| | 2968/1000000 [7:30:43<2675:48:12, 9.66s/it, lr=1e-5, step_loss=0.0416]
Steps: 0%| | 2969/1000000 [7:30:48<2294:29:33, 8.28s/it, lr=1e-5, step_loss=0.0416][RANK-0]: Step: [2969], local_loss=0.01696622744202614, train_loss=0.06250714510679245, time_cost=1.9697864055633545
+
Steps: 0%| | 2969/1000000 [7:30:48<2294:29:33, 8.28s/it, lr=1e-5, step_loss=0.017]
Steps: 0%| | 2970/1000000 [7:30:55<2198:14:51, 7.94s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [2970], local_loss=0.05493771284818649, train_loss=0.08345259726047516, time_cost=1.2313427925109863
+
Steps: 0%| | 2970/1000000 [7:30:55<2198:14:51, 7.94s/it, lr=1e-5, step_loss=0.0549]
Steps: 0%| | 2971/1000000 [7:31:07<2501:35:31, 9.03s/it, lr=1e-5, step_loss=0.0549][RANK-0]: Step: [2971], local_loss=0.017438843846321106, train_loss=0.05949046090245247, time_cost=1.236171007156372
+
Steps: 0%| | 2971/1000000 [7:31:07<2501:35:31, 9.03s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 2972/1000000 [7:31:12<2170:40:59, 7.84s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [2972], local_loss=0.03794068098068237, train_loss=0.04914044588804245, time_cost=2.088230848312378
+
Steps: 0%| | 2972/1000000 [7:31:12<2170:40:59, 7.84s/it, lr=1e-5, step_loss=0.0379]
Steps: 0%| | 2973/1000000 [7:31:22<2370:21:21, 8.56s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [2973], local_loss=0.09151526540517807, train_loss=0.05555356293916702, time_cost=1.5304734706878662
+
Steps: 0%| | 2973/1000000 [7:31:22<2370:21:21, 8.56s/it, lr=1e-5, step_loss=0.0915]
Steps: 0%| | 2974/1000000 [7:31:28<2165:22:31, 7.82s/it, lr=1e-5, step_loss=0.0915][RANK-0]: Step: [2974], local_loss=0.06721600145101547, train_loss=0.10201410949230194, time_cost=1.292384147644043
+
Steps: 0%| | 2974/1000000 [7:31:28<2165:22:31, 7.82s/it, lr=1e-5, step_loss=0.0672]
Steps: 0%| | 2975/1000000 [7:31:33<1936:04:58, 6.99s/it, lr=1e-5, step_loss=0.0672][RANK-0]: Step: [2975], local_loss=0.08969622105360031, train_loss=0.07232268154621124, time_cost=1.2218806743621826
+
Steps: 0%| | 2975/1000000 [7:31:33<1936:04:58, 6.99s/it, lr=1e-5, step_loss=0.0897]
Steps: 0%| | 2976/1000000 [7:31:40<1935:49:20, 6.99s/it, lr=1e-5, step_loss=0.0897][RANK-0]: Step: [2976], local_loss=0.0846748799085617, train_loss=0.10527519881725311, time_cost=2.504105806350708
+
Steps: 0%| | 2976/1000000 [7:31:40<1935:49:20, 6.99s/it, lr=1e-5, step_loss=0.0847]
Steps: 0%| | 2977/1000000 [7:31:54<2509:55:41, 9.06s/it, lr=1e-5, step_loss=0.0847][RANK-0]: Step: [2977], local_loss=0.075301893055439, train_loss=0.05739093944430351, time_cost=5.096057176589966
+
Steps: 0%| | 2977/1000000 [7:31:54<2509:55:41, 9.06s/it, lr=1e-5, step_loss=0.0753]
Steps: 0%| | 2978/1000000 [7:32:05<2641:41:07, 9.54s/it, lr=1e-5, step_loss=0.0753][RANK-0]: Step: [2978], local_loss=0.009838107973337173, train_loss=0.03163217380642891, time_cost=1.2127554416656494
+
Steps: 0%| | 2978/1000000 [7:32:05<2641:41:07, 9.54s/it, lr=1e-5, step_loss=0.00984]
Steps: 0%| | 2979/1000000 [7:32:11<2359:03:39, 8.52s/it, lr=1e-5, step_loss=0.00984][RANK-0]: Step: [2979], local_loss=0.027349330484867096, train_loss=0.05219332128763199, time_cost=1.2190124988555908
+
Steps: 0%| | 2979/1000000 [7:32:11<2359:03:39, 8.52s/it, lr=1e-5, step_loss=0.0273]
Steps: 0%| | 2980/1000000 [7:32:25<2834:31:07, 10.23s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [2980], local_loss=0.2076779454946518, train_loss=0.05674459785223007, time_cost=1.2346148490905762
+
Steps: 0%| | 2980/1000000 [7:32:25<2834:31:07, 10.23s/it, lr=1e-5, step_loss=0.208]
Steps: 0%| | 2981/1000000 [7:32:37<2960:48:07, 10.69s/it, lr=1e-5, step_loss=0.208][RANK-0]: Step: [2981], local_loss=0.024706585332751274, train_loss=0.06280475854873657, time_cost=3.425623893737793
+
Steps: 0%| | 2981/1000000 [7:32:37<2960:48:07, 10.69s/it, lr=1e-5, step_loss=0.0247]
Steps: 0%| | 2982/1000000 [7:32:41<2436:32:17, 8.80s/it, lr=1e-5, step_loss=0.0247][RANK-0]: Step: [2982], local_loss=0.1512824445962906, train_loss=0.07047456502914429, time_cost=3.3500869274139404
+
Steps: 0%| | 2982/1000000 [7:32:41<2436:32:17, 8.80s/it, lr=1e-5, step_loss=0.151]
Steps: 0%| | 2983/1000000 [7:32:53<2665:35:13, 9.62s/it, lr=1e-5, step_loss=0.151][RANK-0]: Step: [2983], local_loss=0.04050940275192261, train_loss=0.032581910490989685, time_cost=4.248643398284912
+
Steps: 0%| | 2983/1000000 [7:32:53<2665:35:13, 9.62s/it, lr=1e-5, step_loss=0.0405]
Steps: 0%| | 2984/1000000 [7:33:08<3077:54:19, 11.11s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [2984], local_loss=0.031532756984233856, train_loss=0.14712849259376526, time_cost=5.710795879364014
+
Steps: 0%| | 2984/1000000 [7:33:08<3077:54:19, 11.11s/it, lr=1e-5, step_loss=0.0315]
Steps: 0%| | 2985/1000000 [7:33:23<3428:27:02, 12.38s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [2985], local_loss=0.019573839381337166, train_loss=0.030288182199001312, time_cost=4.551344156265259
+
Steps: 0%| | 2985/1000000 [7:33:23<3428:27:02, 12.38s/it, lr=1e-5, step_loss=0.0196]
Steps: 0%| | 2986/1000000 [7:33:28<2832:30:36, 10.23s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [2986], local_loss=0.1701197326183319, train_loss=0.05332696810364723, time_cost=1.2078313827514648
+
Steps: 0%| | 2986/1000000 [7:33:28<2832:30:36, 10.23s/it, lr=1e-5, step_loss=0.17]
Steps: 0%| | 2987/1000000 [7:33:38<2837:12:27, 10.24s/it, lr=1e-5, step_loss=0.17][RANK-0]: Step: [2987], local_loss=0.02085932530462742, train_loss=0.0670846700668335, time_cost=7.832729816436768
+
Steps: 0%| | 2987/1000000 [7:33:38<2837:12:27, 10.24s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 2988/1000000 [7:33:51<2995:37:12, 10.82s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [2988], local_loss=0.04250089079141617, train_loss=0.04210032522678375, time_cost=4.540601015090942
+
Steps: 0%| | 2988/1000000 [7:33:51<2995:37:12, 10.82s/it, lr=1e-5, step_loss=0.0425]
Steps: 0%| | 2989/1000000 [7:33:56<2523:59:38, 9.11s/it, lr=1e-5, step_loss=0.0425][RANK-0]: Step: [2989], local_loss=0.01865996979176998, train_loss=0.029277032241225243, time_cost=2.114546298980713
+
Steps: 0%| | 2989/1000000 [7:33:56<2523:59:38, 9.11s/it, lr=1e-5, step_loss=0.0187]
Steps: 0%| | 2990/1000000 [7:34:10<2941:55:07, 10.62s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [2990], local_loss=0.06829040497541428, train_loss=0.044538937509059906, time_cost=1.3248064517974854
+
Steps: 0%| | 2990/1000000 [7:34:10<2941:55:07, 10.62s/it, lr=1e-5, step_loss=0.0683]
Steps: 0%| | 2991/1000000 [7:34:17<2686:15:59, 9.70s/it, lr=1e-5, step_loss=0.0683][RANK-0]: Step: [2991], local_loss=0.018833773210644722, train_loss=0.024124082177877426, time_cost=2.6648080348968506
+
Steps: 0%| | 2991/1000000 [7:34:17<2686:15:59, 9.70s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 2992/1000000 [7:34:29<2881:59:19, 10.41s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [2992], local_loss=0.020782405510544777, train_loss=0.03460697457194328, time_cost=3.5625107288360596
+
Steps: 0%| | 2992/1000000 [7:34:29<2881:59:19, 10.41s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 2993/1000000 [7:34:45<3295:42:56, 11.90s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [2993], local_loss=0.018575023859739304, train_loss=0.035060957074165344, time_cost=7.27923846244812
+
Steps: 0%| | 2993/1000000 [7:34:45<3295:42:56, 11.90s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 2994/1000000 [7:34:52<2910:12:03, 10.51s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [2994], local_loss=0.018600421026349068, train_loss=0.06500924378633499, time_cost=1.3978972434997559
+
Steps: 0%| | 2994/1000000 [7:34:52<2910:12:03, 10.51s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 2995/1000000 [7:35:03<2971:12:00, 10.73s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [2995], local_loss=0.015019470825791359, train_loss=0.027841391041874886, time_cost=1.3708760738372803
+
Steps: 0%| | 2995/1000000 [7:35:03<2971:12:00, 10.73s/it, lr=1e-5, step_loss=0.015]
Steps: 0%| | 2996/1000000 [7:35:08<2498:56:28, 9.02s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [2996], local_loss=0.18003986775875092, train_loss=0.06255938112735748, time_cost=1.4669053554534912
+
Steps: 0%| | 2996/1000000 [7:35:08<2498:56:28, 9.02s/it, lr=1e-5, step_loss=0.18]
Steps: 0%| | 2997/1000000 [7:35:16<2361:48:24, 8.53s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [2997], local_loss=0.037456270307302475, train_loss=0.08461495488882065, time_cost=1.2818312644958496
+
Steps: 0%| | 2997/1000000 [7:35:16<2361:48:24, 8.53s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 2998/1000000 [7:35:23<2235:10:55, 8.07s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [2998], local_loss=0.02282494492828846, train_loss=0.02300068363547325, time_cost=3.2390663623809814
+
Steps: 0%| | 2998/1000000 [7:35:23<2235:10:55, 8.07s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 2999/1000000 [7:35:28<2002:06:04, 7.23s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [2999], local_loss=0.03818702697753906, train_loss=0.07455213367938995, time_cost=1.5219833850860596
+
Steps: 0%| | 2999/1000000 [7:35:28<2002:06:04, 7.23s/it, lr=1e-5, step_loss=0.0382]
Steps: 0%| | 3000/1000000 [7:35:34<1908:05:34, 6.89s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [3000], local_loss=0.014120970852673054, train_loss=0.03147069364786148, time_cost=2.395592212677002
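Each `[RANK-0]` record above carries the same four fields (step, local_loss, train_loss, time_cost), so outliers like step 2959 are easier to find by parsing the log than by scrolling it. Below is a minimal, hypothetical Python sketch for doing that; the file name, function name, and spike threshold are illustrative assumptions, not part of the training code.

```python
import re

# Matches the "[RANK-0]: Step: [N], local_loss=..., train_loss=..., time_cost=..."
# records emitted once per optimizer step in the log above.
RECORD = re.compile(
    r"\[RANK-0\]: Step: \[(\d+)\], "
    r"local_loss=([0-9.eE+-]+), train_loss=([0-9.eE+-]+), time_cost=([0-9.eE+-]+)"
)

def parse_rank0_log(path, spike_threshold=1.0):
    """Return (step, local_loss, train_loss, time_cost) tuples; report loss spikes."""
    records = []
    with open(path) as fh:
        for line in fh:
            m = RECORD.search(line)
            if m is None:
                continue  # skip progress-bar refreshes and other non-record lines
            step = int(m.group(1))
            local_loss, train_loss, time_cost = map(float, m.group(2, 3, 4))
            if max(local_loss, train_loss) > spike_threshold:
                print(f"spike at step {step}: local_loss={local_loss}, train_loss={train_loss}")
            records.append((step, local_loss, train_loss, time_cost))
    return records

# Hypothetical usage; against the excerpt above this flags step 2959 among others:
# records = parse_rank0_log("training_log.txt")
```

Run over the full log, this flags the scattered train_loss excursions (roughly 9-42 across steps 2838-2962) while local_loss stays in its usual band, a pattern that usually points at a few outlier batches rather than sustained divergence.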
+09/19/2024 06:45:27 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000
+09/19/2024 06:45:27 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 06:45:27,195] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-19 06:45:27,225] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 06:45:27,226] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 06:45:55,366] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 06:45:55,377] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 06:46:26,281] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:26,281] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:26,282] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:28,407] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:28,407] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:28,408] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:29,232] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:29,233] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:29,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:30,226] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:30,226] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:30,226] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:31,052] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:31,052] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:31,052] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:31,149] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:31,149] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:31,149] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:31,424] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:31,424] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:31,425] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 06:46:32,143] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 06:46:32,148] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 06:46:32,149] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 06:46:32 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/model/diffusion_pytorch_model.safetensors
+09/19/2024 06:47:53 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/scheduler.bin
+09/19/2024 06:47:53 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/sampler.bin
+09/19/2024 06:47:53 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000/random_states_0.pkl
+09/19/2024 06:47:53 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-3000
Steps: 0%| | 3001/1000000 [7:38:05<13863:46:13, 50.06s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [3001], local_loss=0.23968404531478882, train_loss=0.06413528323173523, time_cost=1.2300841808319092
Steps: 0%| | 3002/1000000 [7:38:13<10383:27:48, 37.49s/it, lr=1e-5, step_loss=0.24][RANK-0]: Step: [3002], local_loss=0.014499658718705177, train_loss=0.05707530677318573, time_cost=5.964017868041992
Steps: 0%| | 3003/1000000 [7:38:17<7626:20:36, 27.54s/it, lr=1e-5, step_loss=0.0145] [RANK-0]: Step: [3003], local_loss=0.06666810065507889, train_loss=0.08704312890768051, time_cost=1.307377576828003
Steps: 0%| | 3004/1000000 [7:38:22<5696:22:11, 20.57s/it, lr=1e-5, step_loss=0.0667][RANK-0]: Step: [3004], local_loss=0.053727589547634125, train_loss=0.03364736959338188, time_cost=1.8030712604522705
Steps: 0%| | 3005/1000000 [7:38:29<4603:44:42, 16.62s/it, lr=1e-5, step_loss=0.0537][RANK-0]: Step: [3005], local_loss=0.26519468426704407, train_loss=0.06875071674585342, time_cost=1.5786561965942383
Steps: 0%| | 3006/1000000 [7:38:39<4074:38:29, 14.71s/it, lr=1e-5, step_loss=0.265][RANK-0]: Step: [3006], local_loss=0.0215145293623209, train_loss=0.020842457190155983, time_cost=4.283459186553955
Steps: 0%| | 3007/1000000 [7:38:50<3758:53:36, 13.57s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [3007], local_loss=0.026762885972857475, train_loss=0.026809632778167725, time_cost=3.1660044193267822
Steps: 0%| | 3008/1000000 [7:38:57<3221:49:09, 11.63s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [3008], local_loss=0.02563454955816269, train_loss=0.054328158497810364, time_cost=1.5058915615081787
Steps: 0%| | 3009/1000000 [7:39:02<2647:37:45, 9.56s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [3009], local_loss=0.018115989863872528, train_loss=0.14787150919437408, time_cost=1.2145583629608154
Steps: 0%| | 3010/1000000 [7:39:13<2744:16:05, 9.91s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [3010], local_loss=0.019783105701208115, train_loss=0.0883931964635849, time_cost=1.2027363777160645
Steps: 0%| | 3011/1000000 [7:39:26<3046:57:03, 11.00s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [3011], local_loss=0.030141396448016167, train_loss=0.049903832376003265, time_cost=1.2563040256500244
Steps: 0%| | 3012/1000000 [7:39:34<2805:59:00, 10.13s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [3012], local_loss=0.015024205669760704, train_loss=0.1508609801530838, time_cost=3.1671998500823975
Steps: 0%| | 3013/1000000 [7:39:50<3255:13:18, 11.75s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [3013], local_loss=0.030581282451748848, train_loss=0.03341829404234886, time_cost=7.897190093994141
Steps: 0%| | 3014/1000000 [7:39:55<2696:00:30, 9.73s/it, lr=1e-5, step_loss=0.0306][RANK-0]: Step: [3014], local_loss=0.022372601553797722, train_loss=0.045226287096738815, time_cost=1.2318916320800781
Steps: 0%| | 3015/1000000 [7:40:04<2622:55:15, 9.47s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [3015], local_loss=0.014350712299346924, train_loss=0.03675833344459534, time_cost=2.9757230281829834
Steps: 0%| | 3016/1000000 [7:40:12<2471:39:31, 8.92s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [3016], local_loss=0.017931319773197174, train_loss=0.041494570672512054, time_cost=2.286067247390747
Steps: 0%| | 3017/1000000 [7:40:18<2248:00:33, 8.12s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [3017], local_loss=0.06989498436450958, train_loss=0.02746809832751751, time_cost=2.1939427852630615
Steps: 0%| | 3018/1000000 [7:40:25<2146:55:00, 7.75s/it, lr=1e-5, step_loss=0.0699][RANK-0]: Step: [3018], local_loss=0.03038296476006508, train_loss=0.16126768290996552, time_cost=2.3021838665008545
Steps: 0%| | 3019/1000000 [7:40:30<1923:33:31, 6.95s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [3019], local_loss=0.04898935183882713, train_loss=0.03163345903158188, time_cost=1.3074052333831787
Steps: 0%| | 3020/1000000 [7:40:34<1710:31:38, 6.18s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [3020], local_loss=0.017399927601218224, train_loss=0.03315328434109688, time_cost=1.738891839981079
Steps: 0%| | 3021/1000000 [7:40:39<1588:50:52, 5.74s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [3021], local_loss=0.011527915485203266, train_loss=0.08378549665212631, time_cost=2.339799404144287
Steps: 0%| | 3022/1000000 [7:40:47<1798:58:55, 6.50s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [3022], local_loss=0.015323398634791374, train_loss=0.04382583126425743, time_cost=2.100951910018921
Steps: 0%| | 3023/1000000 [7:40:59<2214:50:14, 8.00s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [3023], local_loss=0.04764936491847038, train_loss=0.05917735397815704, time_cost=3.145559310913086
Steps: 0%| | 3024/1000000 [7:41:03<1948:11:07, 7.03s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [3024], local_loss=0.07800766825675964, train_loss=0.05677901208400726, time_cost=2.2159175872802734
Steps: 0%| | 3025/1000000 [7:41:13<2161:03:41, 7.80s/it, lr=1e-5, step_loss=0.078][RANK-0]: Step: [3025], local_loss=0.01774289458990097, train_loss=0.031537868082523346, time_cost=1.1876301765441895
Steps: 0%| | 3026/1000000 [7:41:25<2512:17:19, 9.07s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [3026], local_loss=0.023217318579554558, train_loss=0.06839510798454285, time_cost=5.0594611167907715
Steps: 0%| | 3027/1000000 [7:41:36<2687:30:00, 9.70s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [3027], local_loss=0.2082156091928482, train_loss=0.05933550372719765, time_cost=4.181241750717163
Steps: 0%| | 3028/1000000 [7:41:51<3133:04:44, 11.31s/it, lr=1e-5, step_loss=0.208][RANK-0]: Step: [3028], local_loss=0.02279449999332428, train_loss=0.027000300586223602, time_cost=5.73904824256897
Steps: 0%| | 3029/1000000 [7:41:56<2612:35:59, 9.43s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [3029], local_loss=0.011127570644021034, train_loss=0.024939918890595436, time_cost=2.1568899154663086
Steps: 0%| | 3030/1000000 [7:42:04<2487:45:12, 8.98s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [3030], local_loss=0.011644808575510979, train_loss=0.04192975163459778, time_cost=3.373141050338745
Steps: 0%| | 3031/1000000 [7:42:18<2890:11:54, 10.44s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [3031], local_loss=0.025511477142572403, train_loss=0.032375819981098175, time_cost=3.3899519443511963
Steps: 0%| | 3032/1000000 [7:42:36<3520:04:59, 12.71s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [3032], local_loss=0.02760316990315914, train_loss=0.04014062136411667, time_cost=9.641381740570068
Steps: 0%| | 3033/1000000 [7:42:46<3299:06:47, 11.91s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [3033], local_loss=0.03490660712122917, train_loss=10.211114883422852, time_cost=4.713247060775757
Steps: 0%| | 3034/1000000 [7:43:01<3530:06:37, 12.75s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [3034], local_loss=0.06386592239141464, train_loss=0.03516729176044464, time_cost=8.402340173721313
Steps: 0%| | 3035/1000000 [7:43:14<3580:35:12, 12.93s/it, lr=1e-5, step_loss=0.0639][RANK-0]: Step: [3035], local_loss=0.04525984823703766, train_loss=0.04895469918847084, time_cost=3.9791676998138428
Steps: 0%| | 3036/1000000 [7:43:29<3696:48:19, 13.35s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [3036], local_loss=0.053391169756650925, train_loss=0.030773460865020752, time_cost=4.125130653381348
Steps: 0%| | 3037/1000000 [7:43:33<2953:02:29, 10.66s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [3037], local_loss=0.025831123813986778, train_loss=0.027688007801771164, time_cost=1.830287218093872
Steps: 0%| | 3038/1000000 [7:43:44<2965:44:30, 10.71s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [3038], local_loss=0.10220782458782196, train_loss=0.04140249267220497, time_cost=2.00797700881958
Steps: 0%| | 3039/1000000 [7:43:53<2820:29:40, 10.18s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [3039], local_loss=0.03665527328848839, train_loss=0.03389090299606323, time_cost=3.3690543174743652
Steps: 0%| | 3040/1000000 [7:43:58<2455:11:34, 8.87s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [3040], local_loss=0.01378067396581173, train_loss=0.03229059278964996, time_cost=1.9845168590545654
Steps: 0%| | 3041/1000000 [7:44:12<2855:22:02, 10.31s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [3041], local_loss=0.026557397097349167, train_loss=0.09050779044628143, time_cost=9.982834100723267
Steps: 0%| | 3042/1000000 [7:44:27<3203:47:28, 11.57s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [3042], local_loss=0.018458131700754166, train_loss=0.07891656458377838, time_cost=4.936638593673706
Steps: 0%| | 3043/1000000 [7:44:36<3004:47:54, 10.85s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [3043], local_loss=0.03381999582052231, train_loss=0.029751937836408615, time_cost=3.404966115951538
Steps: 0%| | 3044/1000000 [7:44:48<3093:38:35, 11.17s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [3044], local_loss=0.015652671456336975, train_loss=0.05000491440296173, time_cost=5.249236583709717
Steps: 0%| | 3045/1000000 [7:44:54<2679:38:26, 9.68s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [3045], local_loss=0.018782731145620346, train_loss=0.031112153083086014, time_cost=1.6995136737823486
Steps: 0%| | 3046/1000000 [7:44:59<2327:52:35, 8.41s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [3046], local_loss=0.12596885859966278, train_loss=0.16789750754833221, time_cost=2.875411033630371
Steps: 0%| | 3047/1000000 [7:45:05<2064:53:49, 7.46s/it, lr=1e-5, step_loss=0.126][RANK-0]: Step: [3047], local_loss=0.010902310721576214, train_loss=0.16007933020591736, time_cost=2.3453314304351807
Steps: 0%| | 3048/1000000 [7:45:17<2509:56:52, 9.06s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [3048], local_loss=0.12505526840686798, train_loss=0.10188373923301697, time_cost=9.845461130142212
Steps: 0%| | 3049/1000000 [7:45:29<2748:33:14, 9.93s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [3049], local_loss=0.01925818808376789, train_loss=0.028493400663137436, time_cost=1.2246088981628418
Steps: 0%| | 3050/1000000 [7:45:41<2859:08:35, 10.32s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [3050], local_loss=0.10827065259218216, train_loss=0.12073186784982681, time_cost=1.2345736026763916
Steps: 0%| | 3051/1000000 [7:45:52<2931:02:43, 10.58s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [3051], local_loss=0.08929216861724854, train_loss=0.06073952466249466, time_cost=2.2582061290740967
Steps: 0%| | 3052/1000000 [7:46:05<3126:05:05, 11.29s/it, lr=1e-5, step_loss=0.0893][RANK-0]: Step: [3052], local_loss=0.058987926691770554, train_loss=0.046087078750133514, time_cost=3.4834859371185303
Steps: 0%| | 3053/1000000 [7:46:14<2929:58:41, 10.58s/it, lr=1e-5, step_loss=0.059][RANK-0]: Step: [3053], local_loss=0.017618441954255104, train_loss=0.04995418339967728, time_cost=3.337984800338745
Steps: 0%| | 3054/1000000 [7:46:25<2996:20:50, 10.82s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [3054], local_loss=0.04700423777103424, train_loss=0.04982983320951462, time_cost=3.764734983444214
Steps: 0%| | 3055/1000000 [7:46:33<2724:59:50, 9.84s/it, lr=1e-5, step_loss=0.047][RANK-0]: Step: [3055], local_loss=0.06403614580631256, train_loss=0.036725983023643494, time_cost=1.597937822341919
Steps: 0%| | 3056/1000000 [7:46:43<2811:27:11, 10.15s/it, lr=1e-5, step_loss=0.064][RANK-0]: Step: [3056], local_loss=0.04273269698023796, train_loss=0.04131679981946945, time_cost=1.310685634613037
Steps: 0%| | 3057/1000000 [7:46:48<2383:44:19, 8.61s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [3057], local_loss=0.02934686467051506, train_loss=0.02651711367070675, time_cost=1.2347631454467773
Steps: 0%| | 3058/1000000 [7:47:00<2594:30:24, 9.37s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [3058], local_loss=0.07968433201313019, train_loss=0.035041291266679764, time_cost=3.2645349502563477
Steps: 0%| | 3059/1000000 [7:47:07<2423:11:50, 8.75s/it, lr=1e-5, step_loss=0.0797][RANK-0]: Step: [3059], local_loss=0.0401344858109951, train_loss=0.0375114306807518, time_cost=1.2430694103240967
Steps: 0%| | 3060/1000000 [7:47:18<2626:08:00, 9.48s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [3060], local_loss=0.026284629479050636, train_loss=0.028167258948087692, time_cost=7.126548767089844
Steps: 0%| | 3061/1000000 [7:47:23<2279:06:54, 8.23s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [3061], local_loss=0.029819970950484276, train_loss=0.02676846832036972, time_cost=2.793701410293579
Steps: 0%| | 3062/1000000 [7:47:29<2093:32:30, 7.56s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [3062], local_loss=0.0173453688621521, train_loss=0.057509828358888626, time_cost=1.9543836116790771
Steps: 0%| | 3063/1000000 [7:47:41<2404:37:35, 8.68s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [3063], local_loss=0.3253127336502075, train_loss=0.06448489427566528, time_cost=4.080507516860962
Steps: 0%| | 3064/1000000 [7:47:52<2619:00:29, 9.46s/it, lr=1e-5, step_loss=0.325][RANK-0]: Step: [3064], local_loss=0.026943925768136978, train_loss=0.0986524298787117, time_cost=1.243891954421997
Steps: 0%| | 3065/1000000 [7:47:57<2241:14:57, 8.09s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [3065], local_loss=0.04622533544898033, train_loss=0.10121431946754456, time_cost=1.2417302131652832
Steps: 0%| | 3066/1000000 [7:48:04<2139:47:40, 7.73s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [3066], local_loss=0.03331591188907623, train_loss=0.05472603067755699, time_cost=2.287630796432495
Steps: 0%| | 3067/1000000 [7:48:11<2084:37:10, 7.53s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [3067], local_loss=0.23352231085300446, train_loss=22.914188385009766, time_cost=2.1505773067474365
Steps: 0%| | 3068/1000000 [7:48:21<2273:58:05, 8.21s/it, lr=1e-5, step_loss=0.234][RANK-0]: Step: [3068], local_loss=0.04615413397550583, train_loss=0.15645524859428406, time_cost=4.264253616333008
Steps: 0%| | 3069/1000000 [7:48:31<2490:22:29, 8.99s/it, lr=1e-5, step_loss=0.0462][RANK-0]: Step: [3069], local_loss=0.030000949278473854, train_loss=0.043492499738931656, time_cost=1.2464423179626465
Steps: 0%| | 3070/1000000 [7:48:36<2131:40:51, 7.70s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [3070], local_loss=0.02437516488134861, train_loss=0.03217542916536331, time_cost=1.4044225215911865
Steps: 0%| | 3071/1000000 [7:48:52<2774:13:49, 10.02s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [3071], local_loss=0.051305364817380905, train_loss=0.07752823829650879, time_cost=1.2504961490631104
Steps: 0%| | 3072/1000000 [7:48:58<2435:28:45, 8.79s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [3072], local_loss=0.023917904123663902, train_loss=0.05153721943497658, time_cost=1.7325844764709473
Steps: 0%| | 3073/1000000 [7:49:11<2793:09:45, 10.09s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [3073], local_loss=0.04349283501505852, train_loss=0.05873364955186844, time_cost=4.015617370605469
Steps: 0%| | 3074/1000000 [7:49:22<2878:07:22, 10.39s/it, lr=1e-5, step_loss=0.0435][RANK-0]: Step: [3074], local_loss=0.061397865414619446, train_loss=0.0336092971265316, time_cost=4.3670220375061035
Steps: 0%| | 3075/1000000 [7:49:27<2431:02:45, 8.78s/it, lr=1e-5, step_loss=0.0614][RANK-0]: Step: [3075], local_loss=0.022036125883460045, train_loss=0.03236253932118416, time_cost=1.9280776977539062
Steps: 0%| | 3076/1000000 [7:49:39<2678:30:48, 9.67s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [3076], local_loss=0.6745975613594055, train_loss=0.15078096091747284, time_cost=3.6513454914093018
Steps: 0%| | 3077/1000000 [7:49:49<2780:02:36, 10.04s/it, lr=1e-5, step_loss=0.675][RANK-0]: Step: [3077], local_loss=0.01853635534644127, train_loss=0.039208654314279556, time_cost=3.616652727127075
Steps: 0%| | 3078/1000000 [7:50:03<3042:47:14, 10.99s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [3078], local_loss=0.06519665569067001, train_loss=0.06832022964954376, time_cost=5.218007802963257
Steps: 0%| | 3079/1000000 [7:50:14<3059:54:07, 11.05s/it, lr=1e-5, step_loss=0.0652][RANK-0]: Step: [3079], local_loss=0.1567768156528473, train_loss=0.061926230788230896, time_cost=1.2324841022491455
Steps: 0%| | 3080/1000000 [7:50:21<2762:28:15, 9.98s/it, lr=1e-5, step_loss=0.157][RANK-0]: Step: [3080], local_loss=0.08382046967744827, train_loss=0.0315251462161541, time_cost=1.6292743682861328
Steps: 0%| | 3081/1000000 [7:50:32<2828:58:14, 10.22s/it, lr=1e-5, step_loss=0.0838][RANK-0]: Step: [3081], local_loss=0.02603556588292122, train_loss=0.03599892929196358, time_cost=1.702272891998291
Steps: 0%| | 3082/1000000 [7:50:45<3082:47:54, 11.13s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [3082], local_loss=0.03018959052860737, train_loss=0.04121524095535278, time_cost=7.185759782791138
Steps: 0%| | 3083/1000000 [7:50:59<3298:26:57, 11.91s/it, lr=1e-5, step_loss=0.0302][RANK-0]: Step: [3083], local_loss=0.008876418694853783, train_loss=0.05220896005630493, time_cost=6.001250267028809
Steps: 0%| | 3084/1000000 [7:51:07<2974:19:35, 10.74s/it, lr=1e-5, step_loss=0.00888][RANK-0]: Step: [3084], local_loss=0.030985156074166298, train_loss=0.02669048309326172, time_cost=3.360236644744873
Steps: 0%| | 3085/1000000 [7:51:20<3182:03:26, 11.49s/it, lr=1e-5, step_loss=0.031][RANK-0]: Step: [3085], local_loss=0.017574012279510498, train_loss=0.22594711184501648, time_cost=5.559993028640747
Steps: 0%| | 3086/1000000 [7:51:24<2576:07:43, 9.30s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [3086], local_loss=0.0545801967382431, train_loss=0.043998993933200836, time_cost=1.4757153987884521
Steps: 0%| | 3087/1000000 [7:51:31<2348:32:50, 8.48s/it, lr=1e-5, step_loss=0.0546][RANK-0]: Step: [3087], local_loss=0.034287966787815094, train_loss=0.029280545189976692, time_cost=3.007164239883423
Steps: 0%| | 3088/1000000 [7:51:37<2132:55:29, 7.70s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [3088], local_loss=0.01846674084663391, train_loss=0.03177165240049362, time_cost=4.910137414932251
Steps: 0%| | 3089/1000000 [7:51:48<2371:29:44, 8.56s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [3089], local_loss=0.13351604342460632, train_loss=0.07966233789920807, time_cost=5.079134225845337
Steps: 0%| | 3090/1000000 [7:51:55<2251:28:06, 8.13s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [3090], local_loss=0.09494179487228394, train_loss=0.040711913257837296, time_cost=5.999765157699585
Steps: 0%| | 3091/1000000 [7:52:02<2154:36:51, 7.78s/it, lr=1e-5, step_loss=0.0949][RANK-0]: Step: [3091], local_loss=0.023099606856703758, train_loss=0.03166433051228523, time_cost=3.1078803539276123
Steps: 0%| | 3092/1000000 [7:52:11<2262:21:37, 8.17s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [3092], local_loss=0.05874074250459671, train_loss=0.06869688630104065, time_cost=3.1214256286621094
Steps: 0%| | 3093/1000000 [7:52:16<2010:10:11, 7.26s/it, lr=1e-5, step_loss=0.0587][RANK-0]: Step: [3093], local_loss=0.02326294220983982, train_loss=0.025165820494294167, time_cost=2.058185338973999
Steps: 0%| | 3094/1000000 [7:52:25<2178:08:17, 7.87s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [3094], local_loss=0.018281064927577972, train_loss=0.02897038497030735, time_cost=2.2164833545684814
Steps: 0%| | 3095/1000000 [7:52:39<2684:59:46, 9.70s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [3095], local_loss=0.020284311845898628, train_loss=0.050942160189151764, time_cost=1.2224843502044678
Steps: 0%| | 3096/1000000 [7:52:48<2624:42:57, 9.48s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [3096], local_loss=0.02175750397145748, train_loss=0.08627309650182724, time_cost=5.950990438461304
Steps: 0%| | 3097/1000000 [7:52:55<2414:31:54, 8.72s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [3097], local_loss=0.09130708873271942, train_loss=0.051948558539152145, time_cost=2.4995803833007812
Steps: 0%| | 3098/1000000 [7:53:04<2440:04:57, 8.81s/it, lr=1e-5, step_loss=0.0913][RANK-0]: Step: [3098], local_loss=0.01986810937523842, train_loss=0.04403385519981384, time_cost=1.8081958293914795
Steps: 0%| | 3099/1000000 [7:53:13<2464:23:46, 8.90s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [3099], local_loss=0.04528424143791199, train_loss=0.04040123522281647, time_cost=4.04398250579834
Steps: 0%| | 3100/1000000 [7:53:19<2206:10:49, 7.97s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [3100], local_loss=0.014103412628173828, train_loss=0.053015220910310745, time_cost=1.405092477798462
Steps: 0%| | 3101/1000000 [7:53:25<2056:49:14, 7.43s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [3101], local_loss=0.27742961049079895, train_loss=0.08451184630393982, time_cost=5.0820324420928955
Steps: 0%| | 3102/1000000 [7:53:37<2441:16:45, 8.82s/it, lr=1e-5, step_loss=0.277][RANK-0]: Step: [3102], local_loss=0.043900396674871445, train_loss=0.05224555358290672, time_cost=1.9202196598052979
Steps: 0%| | 3103/1000000 [7:53:52<2986:15:34, 10.78s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [3103], local_loss=0.036628641188144684, train_loss=0.039261236786842346, time_cost=2.86924409866333
Steps: 0%| | 3104/1000000 [7:54:03<2999:50:54, 10.83s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [3104], local_loss=0.019556155428290367, train_loss=0.02333064191043377, time_cost=4.5385658740997314
Steps: 0%| | 3105/1000000 [7:54:16<3135:45:22, 11.32s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [3105], local_loss=0.019568799063563347, train_loss=0.056264959275722504, time_cost=5.8151915073394775
Steps: 0%| | 3106/1000000 [7:54:26<3011:30:22, 10.88s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [3106], local_loss=0.02388383448123932, train_loss=0.039812929928302765, time_cost=7.263758659362793
Steps: 0%| | 3107/1000000 [7:54:31<2518:17:34, 9.09s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [3107], local_loss=0.022713640704751015, train_loss=0.0384514182806015, time_cost=2.2270936965942383
Steps: 0%| | 3108/1000000 [7:54:42<2721:48:02, 9.83s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [3108], local_loss=0.012389454059302807, train_loss=0.0419982373714447, time_cost=2.251798629760742
Steps: 0%| | 3109/1000000 [7:54:50<2516:55:23, 9.09s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [3109], local_loss=0.017765304073691368, train_loss=0.024208655580878258, time_cost=1.455902099609375
Steps: 0%| | 3110/1000000 [7:54:59<2513:32:52, 9.08s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [3110], local_loss=0.039830051362514496, train_loss=0.03376740962266922, time_cost=3.097078323364258
Steps: 0%| | 3111/1000000 [7:55:08<2562:37:46, 9.25s/it, lr=1e-5, step_loss=0.0398][RANK-0]: Step: [3111], local_loss=0.01999262347817421, train_loss=0.06913615763187408, time_cost=4.753103733062744
Steps: 0%| | 3112/1000000 [7:55:15<2376:14:26, 8.58s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [3112], local_loss=0.10026731342077255, train_loss=0.054144419729709625, time_cost=3.1987149715423584
Steps: 0%| | 3113/1000000 [7:55:29<2763:35:09, 9.98s/it, lr=1e-5, step_loss=0.1][RANK-0]: Step: [3113], local_loss=0.012238179333508015, train_loss=0.03653555363416672, time_cost=3.0855658054351807
Steps: 0%| | 3114/1000000 [7:55:35<2504:08:22, 9.04s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [3114], local_loss=0.03632145747542381, train_loss=0.02861485257744789, time_cost=1.3938870429992676
Steps: 0%| | 3115/1000000 [7:55:41<2239:14:03, 8.09s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [3115], local_loss=0.03006853722035885, train_loss=0.06524835526943207, time_cost=1.733450174331665
Steps: 0%| | 3116/1000000 [7:55:47<2033:25:34, 7.34s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [3116], local_loss=0.039223991334438324, train_loss=0.029778946191072464, time_cost=1.5609078407287598
Steps: 0%| | 3117/1000000 [7:56:02<2716:34:30, 9.81s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [3117], local_loss=0.048313334584236145, train_loss=0.03361085429787636, time_cost=7.092556476593018
Steps: 0%| | 3118/1000000 [7:56:14<2858:28:18, 10.32s/it, lr=1e-5, step_loss=0.0483][RANK-0]: Step: [3118], local_loss=0.019331270828843117, train_loss=0.029984215274453163, time_cost=1.977881669998169
Steps: 0%| | 3119/1000000 [7:56:19<2444:55:19, 8.83s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [3119], local_loss=0.03313606232404709, train_loss=9.250107765197754, time_cost=2.861722946166992
Steps: 0%| | 3120/1000000 [7:56:28<2442:44:49, 8.82s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [3120], local_loss=0.03953725844621658, train_loss=0.03282864764332771, time_cost=2.821305513381958
Steps: 0%| | 3121/1000000 [7:56:45<3071:36:08, 11.09s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [3121], local_loss=0.03889119252562523, train_loss=0.048663750290870667, time_cost=6.8403480052948
Steps: 0%| | 3122/1000000 [7:56:59<3365:12:56, 12.15s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [3122], local_loss=0.01548898033797741, train_loss=0.151818186044693, time_cost=1.9192266464233398
Steps: 0%| | 3123/1000000 [7:57:10<3238:45:56, 11.70s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [3123], local_loss=0.11189766973257065, train_loss=0.0593823567032814, time_cost=2.519770622253418
Steps: 0%| | 3124/1000000 [7:57:21<3212:39:27, 11.60s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [3124], local_loss=0.03268139064311981, train_loss=0.04527977481484413, time_cost=1.9583427906036377
Steps: 0%| | 3125/1000000 [7:57:29<2862:40:28, 10.34s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [3125], local_loss=0.02001251094043255, train_loss=0.03583497554063797, time_cost=4.382894515991211
Steps: 0%| | 3126/1000000 [7:57:40<2961:39:02, 10.70s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [3126], local_loss=0.018953628838062286, train_loss=0.04196838662028313, time_cost=2.997976064682007
Steps: 0%| | 3127/1000000 [7:57:51<2999:33:18, 10.83s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [3127], local_loss=0.00788895320147276, train_loss=0.05101856216788292, time_cost=2.8025567531585693
Steps: 0%| | 3128/1000000 [7:57:56<2522:23:37, 9.11s/it, lr=1e-5, step_loss=0.00789][RANK-0]: Step: [3128], local_loss=0.03671657294034958, train_loss=0.03255409002304077, time_cost=1.9853456020355225
Steps: 0%| | 3129/1000000 [7:58:08<2743:09:32, 9.91s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [3129], local_loss=0.18500874936580658, train_loss=0.05275797098875046, time_cost=5.532186031341553
Steps: 0%| | 3130/1000000 [7:58:17<2651:38:18, 9.58s/it, lr=1e-5, step_loss=0.185][RANK-0]: Step: [3130], local_loss=0.03194347023963928, train_loss=0.042422983795404434, time_cost=1.877422571182251
Steps: 0%| | 3131/1000000 [7:58:33<3231:43:58, 11.67s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [3131], local_loss=0.0337897464632988, train_loss=0.0787680447101593, time_cost=8.135976076126099
Steps: 0%| | 3132/1000000 [7:58:38<2632:31:07, 9.51s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [3132], local_loss=0.050590354949235916, train_loss=0.03322066366672516, time_cost=3.6640589237213135
Steps: 0%| | 3133/1000000 [7:58:43<2257:04:29, 8.15s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [3133], local_loss=0.05668318644165993, train_loss=0.05301106721162796, time_cost=1.9761366844177246
Steps: 0%| | 3134/1000000 [7:58:52<2350:28:47, 8.49s/it, lr=1e-5, step_loss=0.0567][RANK-0]: Step: [3134], local_loss=0.03432725742459297, train_loss=0.03327397629618645, time_cost=1.2514512538909912
Steps: 0%| | 3135/1000000 [7:59:01<2404:49:46, 8.68s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [3135], local_loss=0.042730752378702164, train_loss=0.06456196308135986, time_cost=1.2490251064300537
Steps: 0%| | 3136/1000000 [7:59:11<2465:57:56, 8.91s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [3136], local_loss=0.027271270751953125, train_loss=0.08066023886203766, time_cost=1.226456642150879
Steps: 0%| | 3137/1000000 [7:59:16<2172:50:32, 7.85s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [3137], local_loss=0.020168043673038483, train_loss=0.08068178594112396, time_cost=2.610039472579956
Steps: 0%| | 3138/1000000 [7:59:23<2087:20:14, 7.54s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [3138], local_loss=0.028928466141223907, train_loss=0.17497117817401886, time_cost=2.5703461170196533
Steps: 0%| | 3139/1000000 [7:59:30<2038:28:50, 7.36s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [3139], local_loss=0.09735621511936188, train_loss=1.17372727394104, time_cost=2.9231579303741455
Steps: 0%| | 3140/1000000 [7:59:39<2191:12:13, 7.91s/it, lr=1e-5, step_loss=0.0974][RANK-0]: Step: [3140], local_loss=0.039887528866529465, train_loss=0.037176743149757385, time_cost=1.257500410079956
Steps: 0%| | 3141/1000000 [7:59:44<1954:11:18, 7.06s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [3141], local_loss=0.06162148714065552, train_loss=0.04733829200267792, time_cost=2.045196533203125
Steps: 0%| | 3142/1000000 [7:59:55<2247:12:36, 8.12s/it, lr=1e-5, step_loss=0.0616][RANK-0]: Step: [3142], local_loss=0.08955144137144089, train_loss=0.0629652664065361, time_cost=2.3810200691223145
Steps: 0%| | 3143/1000000 [7:59:59<1966:01:25, 7.10s/it, lr=1e-5, step_loss=0.0896][RANK-0]: Step: [3143], local_loss=0.021226059645414352, train_loss=0.06321143358945847, time_cost=1.7872607707977295
Steps: 0%| | 3144/1000000 [8:00:05<1854:50:32, 6.70s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [3144], local_loss=0.044897790998220444, train_loss=0.037842489778995514, time_cost=3.34183669090271
Steps: 0%| | 3145/1000000 [8:00:14<2002:25:52, 7.23s/it, lr=1e-5, step_loss=0.0449][RANK-0]: Step: [3145], local_loss=0.02539985068142414, train_loss=0.06286805868148804, time_cost=1.7292113304138184
Steps: 0%| | 3146/1000000 [8:00:24<2285:11:13, 8.25s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [3146], local_loss=0.02186710573732853, train_loss=0.03425139933824539, time_cost=1.228698968887329
Steps: 0%| | 3147/1000000 [8:00:31<2169:20:52, 7.83s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [3147], local_loss=0.012879523448646069, train_loss=0.024560339748859406, time_cost=2.380330801010132
Steps: 0%| | 3148/1000000 [8:00:45<2637:18:03, 9.52s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [3148], local_loss=0.026235921308398247, train_loss=0.09648773074150085, time_cost=4.376644849777222
Steps: 0%| | 3149/1000000 [8:00:54<2606:05:31, 9.41s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [3149], local_loss=0.01774982176721096, train_loss=0.021771155297756195, time_cost=4.851229906082153
Steps: 0%| | 3150/1000000 [8:00:59<2298:01:13, 8.30s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [3150], local_loss=0.06629037857055664, train_loss=0.036129653453826904, time_cost=2.7490055561065674
Steps: 0%| | 3151/1000000 [8:01:07<2237:38:08, 8.08s/it, lr=1e-5, step_loss=0.0663][RANK-0]: Step: [3151], local_loss=0.017450008541345596, train_loss=0.029694590717554092, time_cost=2.007394313812256
Steps: 0%| | 3152/1000000 [8:01:14<2150:12:47, 7.77s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [3152], local_loss=0.06402429938316345, train_loss=0.05160107463598251, time_cost=3.504305362701416
Steps: 0%| | 3153/1000000 [8:01:19<1916:56:29, 6.92s/it, lr=1e-5, step_loss=0.064][RANK-0]: Step: [3153], local_loss=0.012527533806860447, train_loss=0.040000248700380325, time_cost=2.584702968597412
Steps: 0%| | 3154/1000000 [8:01:28<2104:27:20, 7.60s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [3154], local_loss=0.048714496195316315, train_loss=0.05614563822746277, time_cost=2.220372200012207
Steps: 0%| | 3155/1000000 [8:01:36<2106:02:27, 7.61s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [3155], local_loss=0.03112534061074257, train_loss=0.12348145246505737, time_cost=2.6530110836029053
Steps: 0%| | 3156/1000000 [8:01:42<1988:59:32, 7.18s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [3156], local_loss=0.0885159894824028, train_loss=0.07643520087003708, time_cost=2.098400115966797
Steps: 0%| | 3157/1000000 [8:01:56<2553:59:54, 9.22s/it, lr=1e-5, step_loss=0.0885][RANK-0]: Step: [3157], local_loss=0.06311262398958206, train_loss=0.06839900463819504, time_cost=4.764523267745972
Steps: 0%| | 3158/1000000 [8:02:02<2251:49:19, 8.13s/it, lr=1e-5, step_loss=0.0631][RANK-0]: Step: [3158], local_loss=0.017004894092679024, train_loss=0.03897868096828461, time_cost=4.530867099761963
Steps: 0%| | 3159/1000000 [8:02:06<1904:05:29, 6.88s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [3159], local_loss=0.05694659799337387, train_loss=0.04390127956867218, time_cost=1.2157790660858154
Steps: 0%| | 3160/1000000 [8:02:14<1995:37:00, 7.21s/it, lr=1e-5, step_loss=0.0569][RANK-0]: Step: [3160], local_loss=0.03379307687282562, train_loss=0.028464166447520256, time_cost=2.011939525604248
Steps: 0%| | 3161/1000000 [8:02:23<2151:02:59, 7.77s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [3161], local_loss=0.09260906279087067, train_loss=0.03488515317440033, time_cost=2.903834819793701
Steps: 0%| | 3162/1000000 [8:02:29<2069:17:44, 7.47s/it, lr=1e-5, step_loss=0.0926][RANK-0]: Step: [3162], local_loss=0.9950695633888245, train_loss=0.1435178965330124, time_cost=2.948794364929199
Steps: 0%| | 3163/1000000 [8:02:40<2354:38:55, 8.50s/it, lr=1e-5, step_loss=0.995][RANK-0]: Step: [3163], local_loss=0.026302821934223175, train_loss=0.09365709125995636, time_cost=1.941094160079956
Steps: 0%| | 3164/1000000 [8:02:55<2831:59:46, 10.23s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [3164], local_loss=0.017474357038736343, train_loss=0.029799990355968475, time_cost=5.022464990615845
Steps: 0%| | 3165/1000000 [8:03:02<2607:08:04, 9.42s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [3165], local_loss=0.023314014077186584, train_loss=0.03244340419769287, time_cost=5.318179607391357
Steps: 0%| | 3166/1000000 [8:03:07<2198:25:17, 7.94s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [3166], local_loss=0.04187273234128952, train_loss=0.07090476155281067, time_cost=1.3132801055908203
Steps: 0%| | 3167/1000000 [8:03:18<2509:40:35, 9.06s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [3167], local_loss=0.03325631469488144, train_loss=0.05672278627753258, time_cost=3.0844783782958984
Steps: 0%| | 3168/1000000 [8:03:27<2495:50:06, 9.01s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [3168], local_loss=0.08780598640441895, train_loss=0.06116471067070961, time_cost=3.052382230758667
Steps: 0%| | 3169/1000000 [8:03:32<2185:14:14, 7.89s/it, lr=1e-5, step_loss=0.0878][RANK-0]: Step: [3169], local_loss=0.027023358270525932, train_loss=0.07894887775182724, time_cost=2.5121910572052
Steps: 0%| | 3170/1000000 [8:03:44<2465:51:34, 8.91s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [3170], local_loss=0.01829967275261879, train_loss=0.05670449882745743, time_cost=1.9036381244659424
Steps: 0%| | 3171/1000000 [8:03:50<2282:56:18, 8.24s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [3171], local_loss=0.03271523863077164, train_loss=0.031942568719387054, time_cost=2.6171796321868896
Steps: 0%| | 3172/1000000 [8:03:55<2007:35:24, 7.25s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [3172], local_loss=0.024803992360830307, train_loss=0.03420022502541542, time_cost=2.5338287353515625
Steps: 0%| | 3173/1000000 [8:04:02<1954:40:10, 7.06s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [3173], local_loss=0.04545918107032776, train_loss=0.05233694985508919, time_cost=1.2223505973815918
Steps: 0%| | 3174/1000000 [8:04:13<2288:57:08, 8.27s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [3174], local_loss=0.03200783580541611, train_loss=0.09641693532466888, time_cost=8.764496088027954
Steps: 0%| | 3175/1000000 [8:04:23<2426:30:42, 8.76s/it, lr=1e-5, step_loss=0.032][RANK-0]: Step: [3175], local_loss=0.023925792425870895, train_loss=0.1647079885005951, time_cost=4.506069183349609
Steps: 0%| | 3176/1000000 [8:04:33<2565:34:21, 9.27s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [3176], local_loss=0.01870901510119438, train_loss=0.1383359134197235, time_cost=2.487222909927368
+
Steps: 0%| | 3176/1000000 [8:04:33<2565:34:21, 9.27s/it, lr=1e-5, step_loss=0.0187]
Steps: 0%| | 3177/1000000 [8:04:41<2409:39:21, 8.70s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [3177], local_loss=0.26419395208358765, train_loss=0.07028122246265411, time_cost=1.2838819026947021
+
Steps: 0%| | 3177/1000000 [8:04:41<2409:39:21, 8.70s/it, lr=1e-5, step_loss=0.264]
Steps: 0%| | 3178/1000000 [8:04:49<2409:42:47, 8.70s/it, lr=1e-5, step_loss=0.264][RANK-0]: Step: [3178], local_loss=0.05044182762503624, train_loss=0.03588488698005676, time_cost=1.213085651397705
+
Steps: 0%| | 3178/1000000 [8:04:49<2409:42:47, 8.70s/it, lr=1e-5, step_loss=0.0504]
Steps: 0%| | 3179/1000000 [8:05:03<2798:48:46, 10.11s/it, lr=1e-5, step_loss=0.0504][RANK-0]: Step: [3179], local_loss=0.02949882112443447, train_loss=0.1281452476978302, time_cost=1.2267775535583496
+
Steps: 0%| | 3179/1000000 [8:05:03<2798:48:46, 10.11s/it, lr=1e-5, step_loss=0.0295]
Steps: 0%| | 3180/1000000 [8:05:18<3249:45:14, 11.74s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [3180], local_loss=0.024834897369146347, train_loss=0.03704807907342911, time_cost=2.433016538619995
+
Steps: 0%| | 3180/1000000 [8:05:18<3249:45:14, 11.74s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 3181/1000000 [8:05:26<2880:15:14, 10.40s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [3181], local_loss=0.07375729084014893, train_loss=0.07863865792751312, time_cost=1.685354471206665
+
Steps: 0%| | 3181/1000000 [8:05:26<2880:15:14, 10.40s/it, lr=1e-5, step_loss=0.0738]
Steps: 0%| | 3182/1000000 [8:05:33<2589:34:33, 9.35s/it, lr=1e-5, step_loss=0.0738][RANK-0]: Step: [3182], local_loss=0.022931262850761414, train_loss=0.1617451161146164, time_cost=3.152315139770508
+
Steps: 0%| | 3182/1000000 [8:05:33<2589:34:33, 9.35s/it, lr=1e-5, step_loss=0.0229]
Steps: 0%| | 3183/1000000 [8:05:37<2168:13:00, 7.83s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [3183], local_loss=0.030102476477622986, train_loss=0.0856223851442337, time_cost=1.314357042312622
+
Steps: 0%| | 3183/1000000 [8:05:37<2168:13:00, 7.83s/it, lr=1e-5, step_loss=0.0301]
Steps: 0%| | 3184/1000000 [8:05:42<1909:29:25, 6.90s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [3184], local_loss=0.03450493887066841, train_loss=0.04168723523616791, time_cost=1.8112399578094482
+
Steps: 0%| | 3184/1000000 [8:05:42<1909:29:25, 6.90s/it, lr=1e-5, step_loss=0.0345]
Steps: 0%| | 3185/1000000 [8:05:55<2444:45:17, 8.83s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [3185], local_loss=0.012743460945785046, train_loss=0.04041071981191635, time_cost=4.161081075668335
+
Steps: 0%| | 3185/1000000 [8:05:55<2444:45:17, 8.83s/it, lr=1e-5, step_loss=0.0127]
Steps: 0%| | 3186/1000000 [8:05:59<2056:57:36, 7.43s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [3186], local_loss=0.033037327229976654, train_loss=7.1969075202941895, time_cost=1.420947551727295
+
Steps: 0%| | 3186/1000000 [8:05:59<2056:57:36, 7.43s/it, lr=1e-5, step_loss=0.033]
Steps: 0%| | 3187/1000000 [8:06:07<2070:45:12, 7.48s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [3187], local_loss=0.05263684689998627, train_loss=0.059679314494132996, time_cost=3.618457317352295
+
Steps: 0%| | 3187/1000000 [8:06:07<2070:45:12, 7.48s/it, lr=1e-5, step_loss=0.0526]
Steps: 0%| | 3188/1000000 [8:06:20<2596:23:05, 9.38s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [3188], local_loss=0.028096502646803856, train_loss=0.09631418436765671, time_cost=1.2335033416748047
+
Steps: 0%| | 3188/1000000 [8:06:20<2596:23:05, 9.38s/it, lr=1e-5, step_loss=0.0281]
Steps: 0%| | 3189/1000000 [8:06:26<2248:56:09, 8.12s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [3189], local_loss=0.021536223590373993, train_loss=0.024731259793043137, time_cost=2.13484263420105
+
Steps: 0%| | 3189/1000000 [8:06:26<2248:56:09, 8.12s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 3190/1000000 [8:06:30<1949:18:25, 7.04s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [3190], local_loss=0.057271067053079605, train_loss=0.055916547775268555, time_cost=1.6026315689086914
+
Steps: 0%| | 3190/1000000 [8:06:30<1949:18:25, 7.04s/it, lr=1e-5, step_loss=0.0573]
Steps: 0%| | 3191/1000000 [8:06:39<2124:23:59, 7.67s/it, lr=1e-5, step_loss=0.0573][RANK-0]: Step: [3191], local_loss=0.0745362862944603, train_loss=0.1613253951072693, time_cost=2.1599340438842773
+
Steps: 0%| | 3191/1000000 [8:06:39<2124:23:59, 7.67s/it, lr=1e-5, step_loss=0.0745]
Steps: 0%| | 3192/1000000 [8:06:48<2241:25:01, 8.09s/it, lr=1e-5, step_loss=0.0745][RANK-0]: Step: [3192], local_loss=0.02508716657757759, train_loss=43.35103988647461, time_cost=2.12724232673645
+
Steps: 0%| | 3192/1000000 [8:06:48<2241:25:01, 8.09s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 3193/1000000 [8:07:03<2744:16:04, 9.91s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [3193], local_loss=0.014282634481787682, train_loss=0.05791400372982025, time_cost=1.2197952270507812
+
Steps: 0%| | 3193/1000000 [8:07:03<2744:16:04, 9.91s/it, lr=1e-5, step_loss=0.0143]
Steps: 0%| | 3194/1000000 [8:07:08<2385:19:07, 8.61s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [3194], local_loss=0.03580916300415993, train_loss=0.024853402748703957, time_cost=2.7113797664642334
+
Steps: 0%| | 3194/1000000 [8:07:08<2385:19:07, 8.61s/it, lr=1e-5, step_loss=0.0358]
Steps: 0%| | 3195/1000000 [8:07:20<2636:12:17, 9.52s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [3195], local_loss=0.025687865912914276, train_loss=0.08214560896158218, time_cost=2.4032793045043945
+
Steps: 0%| | 3195/1000000 [8:07:20<2636:12:17, 9.52s/it, lr=1e-5, step_loss=0.0257]
Steps: 0%| | 3196/1000000 [8:07:30<2727:38:28, 9.85s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [3196], local_loss=0.019723456352949142, train_loss=0.08415024727582932, time_cost=3.1528425216674805
+
Steps: 0%| | 3196/1000000 [8:07:30<2727:38:28, 9.85s/it, lr=1e-5, step_loss=0.0197]
Steps: 0%| | 3197/1000000 [8:07:35<2329:58:22, 8.41s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [3197], local_loss=0.012208404019474983, train_loss=0.03456024453043938, time_cost=2.044790029525757
+
Steps: 0%| | 3197/1000000 [8:07:35<2329:58:22, 8.41s/it, lr=1e-5, step_loss=0.0122]
Steps: 0%| | 3198/1000000 [8:07:45<2407:48:00, 8.70s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [3198], local_loss=0.09443047642707825, train_loss=0.05615590140223503, time_cost=2.111044406890869
+
Steps: 0%| | 3198/1000000 [8:07:45<2407:48:00, 8.70s/it, lr=1e-5, step_loss=0.0944]
Steps: 0%| | 3199/1000000 [8:07:56<2622:14:37, 9.47s/it, lr=1e-5, step_loss=0.0944][RANK-0]: Step: [3199], local_loss=0.015241997316479683, train_loss=0.14064279198646545, time_cost=6.898184776306152
+
Steps: 0%| | 3199/1000000 [8:07:56<2622:14:37, 9.47s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 3200/1000000 [8:08:03<2406:43:07, 8.69s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [3200], local_loss=0.03614324331283569, train_loss=0.029397200793027878, time_cost=1.2829854488372803
+
Steps: 0%| | 3200/1000000 [8:08:03<2406:43:07, 8.69s/it, lr=1e-5, step_loss=0.0361]
Steps: 0%| | 3201/1000000 [8:08:11<2342:44:06, 8.46s/it, lr=1e-5, step_loss=0.0361][RANK-0]: Step: [3201], local_loss=0.07546444237232208, train_loss=0.04139331728219986, time_cost=1.288532018661499
+
Steps: 0%| | 3201/1000000 [8:08:11<2342:44:06, 8.46s/it, lr=1e-5, step_loss=0.0755]
Steps: 0%| | 3202/1000000 [8:08:18<2253:56:15, 8.14s/it, lr=1e-5, step_loss=0.0755][RANK-0]: Step: [3202], local_loss=0.04031966254115105, train_loss=0.03812012821435928, time_cost=1.559248447418213
+
Steps: 0%| | 3202/1000000 [8:08:18<2253:56:15, 8.14s/it, lr=1e-5, step_loss=0.0403]
Steps: 0%| | 3203/1000000 [8:08:31<2631:41:56, 9.50s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [3203], local_loss=0.045142076909542084, train_loss=0.15576660633087158, time_cost=4.457688570022583
+
Steps: 0%| | 3203/1000000 [8:08:31<2631:41:56, 9.50s/it, lr=1e-5, step_loss=0.0451]
Steps: 0%| | 3204/1000000 [8:08:42<2775:22:00, 10.02s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [3204], local_loss=0.021691536530852318, train_loss=0.07477690279483795, time_cost=6.516289949417114
+
Steps: 0%| | 3204/1000000 [8:08:42<2775:22:00, 10.02s/it, lr=1e-5, step_loss=0.0217]
Steps: 0%| | 3205/1000000 [8:08:53<2851:37:10, 10.30s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [3205], local_loss=0.05451364815235138, train_loss=0.05459502339363098, time_cost=2.019752025604248
+
Steps: 0%| | 3205/1000000 [8:08:53<2851:37:10, 10.30s/it, lr=1e-5, step_loss=0.0545]
Steps: 0%| | 3206/1000000 [8:09:00<2579:14:46, 9.32s/it, lr=1e-5, step_loss=0.0545][RANK-0]: Step: [3206], local_loss=0.02231632173061371, train_loss=0.04493220895528793, time_cost=2.4700138568878174
+
Steps: 0%| | 3206/1000000 [8:09:00<2579:14:46, 9.32s/it, lr=1e-5, step_loss=0.0223]
Steps: 0%| | 3207/1000000 [8:09:05<2195:14:56, 7.93s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [3207], local_loss=0.017503606155514717, train_loss=0.0795382559299469, time_cost=2.2570419311523438
+
Steps: 0%| | 3207/1000000 [8:09:05<2195:14:56, 7.93s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 3208/1000000 [8:09:12<2134:25:19, 7.71s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [3208], local_loss=0.10992298275232315, train_loss=0.09741156548261642, time_cost=1.2989428043365479
+
Steps: 0%| | 3208/1000000 [8:09:12<2134:25:19, 7.71s/it, lr=1e-5, step_loss=0.11]
Steps: 0%| | 3209/1000000 [8:09:24<2481:41:26, 8.96s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [3209], local_loss=0.4657561779022217, train_loss=0.08222761750221252, time_cost=4.69980001449585
+
Steps: 0%| | 3209/1000000 [8:09:24<2481:41:26, 8.96s/it, lr=1e-5, step_loss=0.466]
Steps: 0%| | 3210/1000000 [8:09:39<3015:44:18, 10.89s/it, lr=1e-5, step_loss=0.466][RANK-0]: Step: [3210], local_loss=0.048038728535175323, train_loss=0.02635035291314125, time_cost=6.916625022888184
+
Steps: 0%| | 3210/1000000 [8:09:39<3015:44:18, 10.89s/it, lr=1e-5, step_loss=0.048]
Steps: 0%| | 3211/1000000 [8:09:53<3224:03:56, 11.64s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [3211], local_loss=0.015542617067694664, train_loss=0.07416828721761703, time_cost=6.088035583496094
+
Steps: 0%| | 3211/1000000 [8:09:53<3224:03:56, 11.64s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 3212/1000000 [8:10:08<3496:08:07, 12.63s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [3212], local_loss=0.020520348101854324, train_loss=0.028802797198295593, time_cost=1.2355198860168457
+
Steps: 0%| | 3212/1000000 [8:10:08<3496:08:07, 12.63s/it, lr=1e-5, step_loss=0.0205]
Steps: 0%| | 3213/1000000 [8:10:12<2811:53:02, 10.16s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [3213], local_loss=0.05512970685958862, train_loss=0.03612823784351349, time_cost=1.217527151107788
+
Steps: 0%| | 3213/1000000 [8:10:12<2811:53:02, 10.16s/it, lr=1e-5, step_loss=0.0551]
Steps: 0%| | 3214/1000000 [8:10:26<3088:42:53, 11.16s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [3214], local_loss=170.63812255859375, train_loss=21.375640869140625, time_cost=5.9872565269470215
+
Steps: 0%| | 3214/1000000 [8:10:26<3088:42:53, 11.16s/it, lr=1e-5, step_loss=171]
Steps: 0%| | 3215/1000000 [8:10:35<2975:01:48, 10.74s/it, lr=1e-5, step_loss=171][RANK-0]: Step: [3215], local_loss=0.017990726977586746, train_loss=0.05806553363800049, time_cost=4.7462873458862305
+
Steps: 0%| | 3215/1000000 [8:10:35<2975:01:48, 10.74s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 3216/1000000 [8:10:46<2994:06:21, 10.81s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [3216], local_loss=0.04393136501312256, train_loss=0.04851432889699936, time_cost=2.5506386756896973
+
Steps: 0%| | 3216/1000000 [8:10:46<2994:06:21, 10.81s/it, lr=1e-5, step_loss=0.0439]
Steps: 0%| | 3217/1000000 [8:10:57<3012:33:53, 10.88s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [3217], local_loss=0.05241165682673454, train_loss=0.03395263850688934, time_cost=1.538654088973999
+
Steps: 0%| | 3217/1000000 [8:10:57<3012:33:53, 10.88s/it, lr=1e-5, step_loss=0.0524]
Steps: 0%| | 3218/1000000 [8:11:02<2483:27:26, 8.97s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [3218], local_loss=0.025377795100212097, train_loss=0.03117408975958824, time_cost=1.6204113960266113
+
Steps: 0%| | 3218/1000000 [8:11:02<2483:27:26, 8.97s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 3219/1000000 [8:11:14<2741:56:56, 9.90s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [3219], local_loss=0.03918492794036865, train_loss=0.051667895168066025, time_cost=4.399053335189819
+
Steps: 0%| | 3219/1000000 [8:11:14<2741:56:56, 9.90s/it, lr=1e-5, step_loss=0.0392]
Steps: 0%| | 3220/1000000 [8:11:25<2849:47:23, 10.29s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [3220], local_loss=0.020845944061875343, train_loss=0.02767818048596382, time_cost=1.228135347366333
+
Steps: 0%| | 3220/1000000 [8:11:25<2849:47:23, 10.29s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 3221/1000000 [8:11:40<3205:56:15, 11.58s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [3221], local_loss=0.025843866169452667, train_loss=0.03984881192445755, time_cost=1.234532356262207
+
Steps: 0%| | 3221/1000000 [8:11:40<3205:56:15, 11.58s/it, lr=1e-5, step_loss=0.0258]
Steps: 0%| | 3222/1000000 [8:11:47<2821:59:01, 10.19s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [3222], local_loss=0.016311269253492355, train_loss=0.02403368055820465, time_cost=1.2249329090118408
+
Steps: 0%| | 3222/1000000 [8:11:47<2821:59:01, 10.19s/it, lr=1e-5, step_loss=0.0163]
Steps: 0%| | 3223/1000000 [8:11:56<2755:35:47, 9.95s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [3223], local_loss=0.01438448391854763, train_loss=0.16194911301136017, time_cost=3.2677481174468994
+
Steps: 0%| | 3223/1000000 [8:11:56<2755:35:47, 9.95s/it, lr=1e-5, step_loss=0.0144]
Steps: 0%| | 3224/1000000 [8:12:08<2894:56:14, 10.46s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [3224], local_loss=0.01897023804485798, train_loss=0.07709639519453049, time_cost=1.6138262748718262
+
Steps: 0%| | 3224/1000000 [8:12:08<2894:56:14, 10.46s/it, lr=1e-5, step_loss=0.019]
Steps: 0%| | 3225/1000000 [8:12:16<2733:00:25, 9.87s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [3225], local_loss=0.08988898992538452, train_loss=0.036604106426239014, time_cost=6.001893043518066
+
Steps: 0%| | 3225/1000000 [8:12:16<2733:00:25, 9.87s/it, lr=1e-5, step_loss=0.0899]
Steps: 0%| | 3226/1000000 [8:12:22<2397:05:58, 8.66s/it, lr=1e-5, step_loss=0.0899][RANK-0]: Step: [3226], local_loss=0.13723552227020264, train_loss=0.07086776196956635, time_cost=1.363518476486206
+
Steps: 0%| | 3226/1000000 [8:12:22<2397:05:58, 8.66s/it, lr=1e-5, step_loss=0.137]
Steps: 0%| | 3227/1000000 [8:12:27<2097:39:59, 7.58s/it, lr=1e-5, step_loss=0.137][RANK-0]: Step: [3227], local_loss=0.04601101949810982, train_loss=0.054228682070970535, time_cost=2.3535284996032715
+
Steps: 0%| | 3227/1000000 [8:12:27<2097:39:59, 7.58s/it, lr=1e-5, step_loss=0.046]
Steps: 0%| | 3228/1000000 [8:12:36<2198:37:36, 7.94s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [3228], local_loss=0.015513468533754349, train_loss=0.05329026281833649, time_cost=6.241364479064941
+
Steps: 0%| | 3228/1000000 [8:12:36<2198:37:36, 7.94s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 3229/1000000 [8:12:44<2228:05:40, 8.05s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [3229], local_loss=0.043674103915691376, train_loss=0.03030271828174591, time_cost=4.269312620162964
+
Steps: 0%| | 3229/1000000 [8:12:44<2228:05:40, 8.05s/it, lr=1e-5, step_loss=0.0437]
Steps: 0%| | 3230/1000000 [8:12:48<1912:02:27, 6.91s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [3230], local_loss=0.03503552824258804, train_loss=0.15921594202518463, time_cost=1.617185354232788
+
Steps: 0%| | 3230/1000000 [8:12:48<1912:02:27, 6.91s/it, lr=1e-5, step_loss=0.035]
Steps: 0%| | 3231/1000000 [8:12:57<2042:33:24, 7.38s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [3231], local_loss=0.024496780708432198, train_loss=0.034749895334243774, time_cost=7.17767858505249
+
Steps: 0%| | 3231/1000000 [8:12:57<2042:33:24, 7.38s/it, lr=1e-5, step_loss=0.0245]
Steps: 0%| | 3232/1000000 [8:13:02<1866:28:56, 6.74s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [3232], local_loss=0.022165732458233833, train_loss=0.08258108794689178, time_cost=2.7530288696289062
+
Steps: 0%| | 3232/1000000 [8:13:02<1866:28:56, 6.74s/it, lr=1e-5, step_loss=0.0222]
Steps: 0%| | 3233/1000000 [8:13:12<2095:47:43, 7.57s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [3233], local_loss=0.07434453070163727, train_loss=0.04631881043314934, time_cost=3.0880401134490967
+
Steps: 0%| | 3233/1000000 [8:13:12<2095:47:43, 7.57s/it, lr=1e-5, step_loss=0.0743]
Steps: 0%| | 3234/1000000 [8:13:18<2015:59:02, 7.28s/it, lr=1e-5, step_loss=0.0743][RANK-0]: Step: [3234], local_loss=0.033645834773778915, train_loss=0.028084754943847656, time_cost=1.2334916591644287
+
Steps: 0%| | 3234/1000000 [8:13:18<2015:59:02, 7.28s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 3235/1000000 [8:13:28<2237:28:45, 8.08s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [3235], local_loss=0.06211649999022484, train_loss=0.045363858342170715, time_cost=4.567570447921753
+
Steps: 0%| | 3235/1000000 [8:13:28<2237:28:45, 8.08s/it, lr=1e-5, step_loss=0.0621]
Steps: 0%| | 3236/1000000 [8:13:39<2439:51:29, 8.81s/it, lr=1e-5, step_loss=0.0621][RANK-0]: Step: [3236], local_loss=0.06813624501228333, train_loss=0.05908294394612312, time_cost=4.684152364730835
+
Steps: 0%| | 3236/1000000 [8:13:39<2439:51:29, 8.81s/it, lr=1e-5, step_loss=0.0681]
Steps: 0%| | 3237/1000000 [8:13:43<2045:34:18, 7.39s/it, lr=1e-5, step_loss=0.0681][RANK-0]: Step: [3237], local_loss=0.009154217317700386, train_loss=0.027927156537771225, time_cost=1.241697072982788
+
Steps: 0%| | 3237/1000000 [8:13:43<2045:34:18, 7.39s/it, lr=1e-5, step_loss=0.00915]
Steps: 0%| | 3238/1000000 [8:13:50<2040:03:46, 7.37s/it, lr=1e-5, step_loss=0.00915][RANK-0]: Step: [3238], local_loss=0.028743306174874306, train_loss=0.10623262822628021, time_cost=3.1513845920562744
+
Steps: 0%| | 3238/1000000 [8:13:50<2040:03:46, 7.37s/it, lr=1e-5, step_loss=0.0287]
Steps: 0%| | 3239/1000000 [8:13:57<2003:22:37, 7.24s/it, lr=1e-5, step_loss=0.0287][RANK-0]: Step: [3239], local_loss=0.21071743965148926, train_loss=0.17472559213638306, time_cost=1.2396180629730225
+
Steps: 0%| | 3239/1000000 [8:13:57<2003:22:37, 7.24s/it, lr=1e-5, step_loss=0.211]
Steps: 0%| | 3240/1000000 [8:14:02<1806:50:46, 6.53s/it, lr=1e-5, step_loss=0.211][RANK-0]: Step: [3240], local_loss=0.026791561394929886, train_loss=0.10238195210695267, time_cost=1.7547967433929443
+
Steps: 0%| | 3240/1000000 [8:14:02<1806:50:46, 6.53s/it, lr=1e-5, step_loss=0.0268]
Steps: 0%| | 3241/1000000 [8:14:10<1934:02:23, 6.99s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [3241], local_loss=0.18210558593273163, train_loss=0.04901499301195145, time_cost=3.924264669418335
+
Steps: 0%| | 3241/1000000 [8:14:10<1934:02:23, 6.99s/it, lr=1e-5, step_loss=0.182]
Steps: 0%| | 3242/1000000 [8:14:24<2488:49:02, 8.99s/it, lr=1e-5, step_loss=0.182][RANK-0]: Step: [3242], local_loss=0.019813718274235725, train_loss=0.1498023122549057, time_cost=5.649673938751221
+
Steps: 0%| | 3242/1000000 [8:14:24<2488:49:02, 8.99s/it, lr=1e-5, step_loss=0.0198]
Steps: 0%| | 3243/1000000 [8:14:34<2611:31:26, 9.43s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [3243], local_loss=0.014382394962012768, train_loss=0.02342578023672104, time_cost=1.5045087337493896
+
Steps: 0%| | 3243/1000000 [8:14:34<2611:31:26, 9.43s/it, lr=1e-5, step_loss=0.0144]
Steps: 0%| | 3244/1000000 [8:14:45<2764:04:04, 9.98s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [3244], local_loss=0.01739315502345562, train_loss=0.04063521698117256, time_cost=4.364184379577637
+
Steps: 0%| | 3244/1000000 [8:14:45<2764:04:04, 9.98s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 3245/1000000 [8:14:51<2380:15:48, 8.60s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [3245], local_loss=0.03531569615006447, train_loss=0.06171180307865143, time_cost=1.7259862422943115
+
Steps: 0%| | 3245/1000000 [8:14:51<2380:15:48, 8.60s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 3246/1000000 [8:14:58<2268:48:51, 8.19s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [3246], local_loss=0.08191085606813431, train_loss=0.10231759399175644, time_cost=6.09683632850647
+
Steps: 0%| | 3246/1000000 [8:14:58<2268:48:51, 8.19s/it, lr=1e-5, step_loss=0.0819]
Steps: 0%| | 3247/1000000 [8:15:12<2724:23:36, 9.84s/it, lr=1e-5, step_loss=0.0819][RANK-0]: Step: [3247], local_loss=0.05842966586351395, train_loss=0.07394866645336151, time_cost=1.2288618087768555
+
Steps: 0%| | 3247/1000000 [8:15:12<2724:23:36, 9.84s/it, lr=1e-5, step_loss=0.0584]
Steps: 0%| | 3248/1000000 [8:15:24<2894:08:01, 10.45s/it, lr=1e-5, step_loss=0.0584][RANK-0]: Step: [3248], local_loss=0.015573062002658844, train_loss=0.02049960196018219, time_cost=1.220670461654663
+
Steps: 0%| | 3248/1000000 [8:15:24<2894:08:01, 10.45s/it, lr=1e-5, step_loss=0.0156]
Steps: 0%| | 3249/1000000 [8:15:37<3115:22:14, 11.25s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [3249], local_loss=0.025366537272930145, train_loss=0.030199969187378883, time_cost=4.683387517929077
+
Steps: 0%| | 3249/1000000 [8:15:37<3115:22:14, 11.25s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 3250/1000000 [8:15:44<2755:29:25, 9.95s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [3250], local_loss=0.013316017575562, train_loss=0.04932558164000511, time_cost=1.2219433784484863
+
Steps: 0%| | 3250/1000000 [8:15:44<2755:29:25, 9.95s/it, lr=1e-5, step_loss=0.0133]
Steps: 0%| | 3251/1000000 [8:15:48<2311:35:19, 8.35s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [3251], local_loss=0.07077676802873611, train_loss=0.043888598680496216, time_cost=2.2200310230255127
+
Steps: 0%| | 3251/1000000 [8:15:48<2311:35:19, 8.35s/it, lr=1e-5, step_loss=0.0708]
Steps: 0%| | 3252/1000000 [8:15:53<2020:22:49, 7.30s/it, lr=1e-5, step_loss=0.0708][RANK-0]: Step: [3252], local_loss=0.042407356202602386, train_loss=0.11455563455820084, time_cost=1.2058982849121094
+
Steps: 0%| | 3252/1000000 [8:15:53<2020:22:49, 7.30s/it, lr=1e-5, step_loss=0.0424]
Steps: 0%| | 3253/1000000 [8:16:09<2699:54:14, 9.75s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [3253], local_loss=1.0335524082183838, train_loss=0.22435420751571655, time_cost=6.94118070602417
+
Steps: 0%| | 3253/1000000 [8:16:09<2699:54:14, 9.75s/it, lr=1e-5, step_loss=1.03]
Steps: 0%| | 3254/1000000 [8:16:16<2523:43:57, 9.12s/it, lr=1e-5, step_loss=1.03][RANK-0]: Step: [3254], local_loss=0.031175607815384865, train_loss=0.047056861221790314, time_cost=1.6762619018554688
+
Steps: 0%| | 3254/1000000 [8:16:16<2523:43:57, 9.12s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 3255/1000000 [8:16:27<2698:02:16, 9.74s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [3255], local_loss=0.016267985105514526, train_loss=0.048134945333004, time_cost=2.32422137260437
+
Steps: 0%| | 3255/1000000 [8:16:27<2698:02:16, 9.74s/it, lr=1e-5, step_loss=0.0163]
Steps: 0%| | 3256/1000000 [8:16:32<2281:43:23, 8.24s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [3256], local_loss=0.03486718237400055, train_loss=0.029703710228204727, time_cost=1.2246856689453125
+
Steps: 0%| | 3256/1000000 [8:16:32<2281:43:23, 8.24s/it, lr=1e-5, step_loss=0.0349]
Steps: 0%| | 3257/1000000 [8:16:38<2069:10:00, 7.47s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [3257], local_loss=0.049151111394166946, train_loss=0.1289927214384079, time_cost=2.950530767440796
+
Steps: 0%| | 3257/1000000 [8:16:38<2069:10:00, 7.47s/it, lr=1e-5, step_loss=0.0492]
Steps: 0%| | 3258/1000000 [8:16:49<2355:04:17, 8.51s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [3258], local_loss=0.08794955164194107, train_loss=0.08875562250614166, time_cost=2.814009189605713
+
Steps: 0%| | 3258/1000000 [8:16:49<2355:04:17, 8.51s/it, lr=1e-5, step_loss=0.0879]
Steps: 0%| | 3259/1000000 [8:17:05<2991:07:40, 10.80s/it, lr=1e-5, step_loss=0.0879][RANK-0]: Step: [3259], local_loss=0.034062840044498444, train_loss=0.03868767246603966, time_cost=13.403440237045288
+
Steps: 0%| | 3259/1000000 [8:17:05<2991:07:40, 10.80s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 3260/1000000 [8:17:15<2917:44:32, 10.54s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [3260], local_loss=0.09819494187831879, train_loss=0.07463638484477997, time_cost=1.196751356124878
+
Steps: 0%| | 3260/1000000 [8:17:15<2917:44:32, 10.54s/it, lr=1e-5, step_loss=0.0982]
Steps: 0%| | 3261/1000000 [8:17:28<3150:56:57, 11.38s/it, lr=1e-5, step_loss=0.0982][RANK-0]: Step: [3261], local_loss=0.023790620267391205, train_loss=0.06427716463804245, time_cost=3.1660802364349365
+
Steps: 0%| | 3261/1000000 [8:17:28<3150:56:57, 11.38s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 3262/1000000 [8:17:43<3420:08:01, 12.35s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [3262], local_loss=0.01600792072713375, train_loss=0.03318101912736893, time_cost=1.2098097801208496
+
Steps: 0%| | 3262/1000000 [8:17:43<3420:08:01, 12.35s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 3263/1000000 [8:17:52<3145:44:13, 11.36s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [3263], local_loss=0.31515222787857056, train_loss=0.06816036999225616, time_cost=5.804816484451294
+
Steps: 0%| | 3263/1000000 [8:17:52<3145:44:13, 11.36s/it, lr=1e-5, step_loss=0.315]
Steps: 0%| | 3264/1000000 [8:17:57<2646:27:32, 9.56s/it, lr=1e-5, step_loss=0.315][RANK-0]: Step: [3264], local_loss=0.02059406042098999, train_loss=0.020812034606933594, time_cost=2.6241250038146973
+
Steps: 0%| | 3264/1000000 [8:17:57<2646:27:32, 9.56s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 3265/1000000 [8:18:04<2424:05:11, 8.76s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [3265], local_loss=0.012900921516120434, train_loss=0.04012913256883621, time_cost=2.835749864578247
+
Steps: 0%| | 3265/1000000 [8:18:04<2424:05:11, 8.76s/it, lr=1e-5, step_loss=0.0129]
Steps: 0%| | 3266/1000000 [8:18:09<2137:34:21, 7.72s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [3266], local_loss=0.011494074948132038, train_loss=0.016208652406930923, time_cost=1.5173068046569824
+
Steps: 0%| | 3266/1000000 [8:18:09<2137:34:21, 7.72s/it, lr=1e-5, step_loss=0.0115]
Steps: 0%| | 3267/1000000 [8:18:23<2616:49:55, 9.45s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [3267], local_loss=0.015856266021728516, train_loss=0.023177653551101685, time_cost=5.297594785690308
+
Steps: 0%| | 3267/1000000 [8:18:23<2616:49:55, 9.45s/it, lr=1e-5, step_loss=0.0159]
Steps: 0%| | 3268/1000000 [8:18:29<2317:36:01, 8.37s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [3268], local_loss=0.0130292484536767, train_loss=0.026089955121278763, time_cost=1.238948106765747
+
Steps: 0%| | 3268/1000000 [8:18:29<2317:36:01, 8.37s/it, lr=1e-5, step_loss=0.013]
Steps: 0%| | 3269/1000000 [8:18:43<2838:15:29, 10.25s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [3269], local_loss=0.015619589015841484, train_loss=0.04740285500884056, time_cost=6.12089467048645
+
Steps: 0%| | 3269/1000000 [8:18:43<2838:15:29, 10.25s/it, lr=1e-5, step_loss=0.0156]
Steps: 0%| | 3270/1000000 [8:18:51<2589:54:51, 9.35s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [3270], local_loss=0.043479762971401215, train_loss=0.03277492895722389, time_cost=1.2346153259277344
+
Steps: 0%| | 3270/1000000 [8:18:51<2589:54:51, 9.35s/it, lr=1e-5, step_loss=0.0435]
Steps: 0%| | 3271/1000000 [8:18:56<2231:44:15, 8.06s/it, lr=1e-5, step_loss=0.0435][RANK-0]: Step: [3271], local_loss=0.043874479830265045, train_loss=0.04496951773762703, time_cost=2.2597157955169678
+
Steps: 0%| | 3271/1000000 [8:18:56<2231:44:15, 8.06s/it, lr=1e-5, step_loss=0.0439]
Steps: 0%| | 3272/1000000 [8:19:01<1983:12:22, 7.16s/it, lr=1e-5, step_loss=0.0439][RANK-0]: Step: [3272], local_loss=0.016218245029449463, train_loss=0.05120912939310074, time_cost=2.127591371536255
+
Steps: 0%| | 3272/1000000 [8:19:01<1983:12:22, 7.16s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 3273/1000000 [8:19:16<2628:23:47, 9.49s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [3273], local_loss=0.03933359682559967, train_loss=0.06786612421274185, time_cost=6.743847370147705
+
Steps: 0%| | 3273/1000000 [8:19:16<2628:23:47, 9.49s/it, lr=1e-5, step_loss=0.0393]
Steps: 0%| | 3274/1000000 [8:19:21<2265:44:58, 8.18s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [3274], local_loss=0.042135369032621384, train_loss=0.0419670045375824, time_cost=1.477020263671875
+
Steps: 0%| | 3274/1000000 [8:19:21<2265:44:58, 8.18s/it, lr=1e-5, step_loss=0.0421]
Steps: 0%| | 3275/1000000 [8:19:25<1966:28:06, 7.10s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [3275], local_loss=0.032532598823308945, train_loss=0.09450051188468933, time_cost=3.8787853717803955
+
Steps: 0%| | 3275/1000000 [8:19:25<1966:28:06, 7.10s/it, lr=1e-5, step_loss=0.0325]
Steps: 0%| | 3276/1000000 [8:19:34<2108:50:00, 7.62s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [3276], local_loss=0.059445880353450775, train_loss=0.06973261386156082, time_cost=2.834582805633545
+
Steps: 0%| | 3276/1000000 [8:19:34<2108:50:00, 7.62s/it, lr=1e-5, step_loss=0.0594]
Steps: 0%| | 3277/1000000 [8:19:41<2055:27:11, 7.42s/it, lr=1e-5, step_loss=0.0594][RANK-0]: Step: [3277], local_loss=0.05420558899641037, train_loss=0.03722541779279709, time_cost=3.2619123458862305
+
Steps: 0%| | 3277/1000000 [8:19:41<2055:27:11, 7.42s/it, lr=1e-5, step_loss=0.0542]
Steps: 0%| | 3278/1000000 [8:19:47<1918:09:31, 6.93s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [3278], local_loss=0.12681502103805542, train_loss=0.05264703556895256, time_cost=2.3432395458221436
+
Steps: 0%| | 3278/1000000 [8:19:47<1918:09:31, 6.93s/it, lr=1e-5, step_loss=0.127]
Steps: 0%| | 3279/1000000 [8:19:53<1846:00:26, 6.67s/it, lr=1e-5, step_loss=0.127][RANK-0]: Step: [3279], local_loss=0.2516835629940033, train_loss=0.08548040688037872, time_cost=1.9561586380004883
+
Steps: 0%| | 3279/1000000 [8:19:53<1846:00:26, 6.67s/it, lr=1e-5, step_loss=0.252]
Steps: 0%| | 3280/1000000 [8:20:00<1905:27:23, 6.88s/it, lr=1e-5, step_loss=0.252][RANK-0]: Step: [3280], local_loss=0.02824694849550724, train_loss=0.04286356270313263, time_cost=2.828166961669922
+
Steps: 0%| | 3280/1000000 [8:20:00<1905:27:23, 6.88s/it, lr=1e-5, step_loss=0.0282]
Steps: 0%| | 3281/1000000 [8:20:07<1922:18:23, 6.94s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [3281], local_loss=0.017489051446318626, train_loss=0.04269181191921234, time_cost=5.249188661575317
+
Steps: 0%| | 3281/1000000 [8:20:07<1922:18:23, 6.94s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 3282/1000000 [8:20:19<2274:52:10, 8.22s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [3282], local_loss=0.03779980167746544, train_loss=0.03915204480290413, time_cost=2.1856260299682617
+
Steps: 0%| | 3282/1000000 [8:20:19<2274:52:10, 8.22s/it, lr=1e-5, step_loss=0.0378]
Steps: 0%| | 3283/1000000 [8:20:24<2036:38:00, 7.36s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [3283], local_loss=0.018749669194221497, train_loss=0.03182453289628029, time_cost=1.2640631198883057
+
Steps: 0%| | 3283/1000000 [8:20:24<2036:38:00, 7.36s/it, lr=1e-5, step_loss=0.0187]
Steps: 0%| | 3284/1000000 [8:20:32<2058:28:05, 7.43s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [3284], local_loss=0.0137028144672513, train_loss=0.04676026478409767, time_cost=3.967557191848755
+
Steps: 0%| | 3284/1000000 [8:20:32<2058:28:05, 7.43s/it, lr=1e-5, step_loss=0.0137]
Steps: 0%| | 3285/1000000 [8:20:36<1843:02:56, 6.66s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [3285], local_loss=0.02499309927225113, train_loss=0.042593494057655334, time_cost=2.4546258449554443
+
Steps: 0%| | 3285/1000000 [8:20:36<1843:02:56, 6.66s/it, lr=1e-5, step_loss=0.025]
Steps: 0%| | 3286/1000000 [8:20:44<1898:50:04, 6.86s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [3286], local_loss=0.04577285423874855, train_loss=0.03910071775317192, time_cost=1.4161875247955322
+
Steps: 0%| | 3286/1000000 [8:20:44<1898:50:04, 6.86s/it, lr=1e-5, step_loss=0.0458]
Steps: 0%| | 3287/1000000 [8:20:59<2557:21:51, 9.24s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [3287], local_loss=0.04905819147825241, train_loss=0.04361933842301369, time_cost=6.282364368438721
+
Steps: 0%| | 3287/1000000 [8:20:59<2557:21:51, 9.24s/it, lr=1e-5, step_loss=0.0491]
Steps: 0%| | 3288/1000000 [8:21:15<3152:57:43, 11.39s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [3288], local_loss=0.05141059309244156, train_loss=0.026915032416582108, time_cost=7.2794201374053955
+
Steps: 0%| | 3288/1000000 [8:21:15<3152:57:43, 11.39s/it, lr=1e-5, step_loss=0.0514]
Steps: 0%| | 3289/1000000 [8:21:22<2778:39:49, 10.04s/it, lr=1e-5, step_loss=0.0514][RANK-0]: Step: [3289], local_loss=0.033052958548069, train_loss=0.15465866029262543, time_cost=2.690519094467163
+
Steps: 0%| | 3289/1000000 [8:21:22<2778:39:49, 10.04s/it, lr=1e-5, step_loss=0.0331]
Steps: 0%| | 3290/1000000 [8:21:30<2589:24:17, 9.35s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [3290], local_loss=0.08075709640979767, train_loss=0.08077813684940338, time_cost=1.2508554458618164
+
Steps: 0%| | 3290/1000000 [8:21:30<2589:24:17, 9.35s/it, lr=1e-5, step_loss=0.0808]
Steps: 0%| | 3291/1000000 [8:21:40<2675:58:04, 9.67s/it, lr=1e-5, step_loss=0.0808][RANK-0]: Step: [3291], local_loss=0.01742010936141014, train_loss=0.07033627480268478, time_cost=5.422677516937256
+
Steps: 0%| | 3291/1000000 [8:21:40<2675:58:04, 9.67s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 3292/1000000 [8:21:53<2969:12:44, 10.72s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [3292], local_loss=0.2196696698665619, train_loss=0.08106327801942825, time_cost=4.2433180809021
+
Steps: 0%| | 3292/1000000 [8:21:53<2969:12:44, 10.72s/it, lr=1e-5, step_loss=0.22]
Steps: 0%| | 3293/1000000 [8:21:59<2526:41:30, 9.13s/it, lr=1e-5, step_loss=0.22][RANK-0]: Step: [3293], local_loss=0.30128780007362366, train_loss=0.09794969856739044, time_cost=1.234645128250122
+
Steps: 0%| | 3293/1000000 [8:21:59<2526:41:30, 9.13s/it, lr=1e-5, step_loss=0.301]
Steps: 0%| | 3294/1000000 [8:22:04<2256:16:55, 8.15s/it, lr=1e-5, step_loss=0.301][RANK-0]: Step: [3294], local_loss=0.019529541954398155, train_loss=24.176532745361328, time_cost=1.4363932609558105
+
Steps: 0%| | 3294/1000000 [8:22:04<2256:16:55, 8.15s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 3295/1000000 [8:22:10<2066:55:59, 7.47s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [3295], local_loss=0.04949395731091499, train_loss=0.04537670314311981, time_cost=4.270689249038696
+
Steps: 0%| | 3295/1000000 [8:22:10<2066:55:59, 7.47s/it, lr=1e-5, step_loss=0.0495]
Steps: 0%| | 3296/1000000 [8:22:19<2140:51:10, 7.73s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [3296], local_loss=0.06884055584669113, train_loss=0.04197663068771362, time_cost=1.7863852977752686
+
Steps: 0%| | 3296/1000000 [8:22:19<2140:51:10, 7.73s/it, lr=1e-5, step_loss=0.0688]
Steps: 0%| | 3297/1000000 [8:22:30<2423:38:43, 8.75s/it, lr=1e-5, step_loss=0.0688][RANK-0]: Step: [3297], local_loss=0.045762911438941956, train_loss=0.042668841779232025, time_cost=8.36397099494934
+
Steps: 0%| | 3297/1000000 [8:22:30<2423:38:43, 8.75s/it, lr=1e-5, step_loss=0.0458]
Steps: 0%| | 3298/1000000 [8:22:40<2528:25:13, 9.13s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [3298], local_loss=0.0371132418513298, train_loss=0.03813071548938751, time_cost=1.2171707153320312
+
Steps: 0%| | 3298/1000000 [8:22:40<2528:25:13, 9.13s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 3299/1000000 [8:22:50<2589:30:30, 9.35s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [3299], local_loss=0.0451921783387661, train_loss=0.04595125839114189, time_cost=3.7761781215667725
+
Steps: 0%| | 3299/1000000 [8:22:50<2589:30:30, 9.35s/it, lr=1e-5, step_loss=0.0452]
Steps: 0%| | 3300/1000000 [8:23:00<2670:59:27, 9.65s/it, lr=1e-5, step_loss=0.0452][RANK-0]: Step: [3300], local_loss=0.05123669654130936, train_loss=0.08388632535934448, time_cost=1.2255439758300781
+
Steps: 0%| | 3300/1000000 [8:23:00<2670:59:27, 9.65s/it, lr=1e-5, step_loss=0.0512]
Steps: 0%| | 3301/1000000 [8:23:11<2798:14:45, 10.11s/it, lr=1e-5, step_loss=0.0512][RANK-0]: Step: [3301], local_loss=0.010162406601011753, train_loss=0.0545315258204937, time_cost=1.2170381546020508
+
Steps: 0%| | 3301/1000000 [8:23:11<2798:14:45, 10.11s/it, lr=1e-5, step_loss=0.0102]
Steps: 0%| | 3302/1000000 [8:23:27<3264:53:59, 11.79s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [3302], local_loss=0.03128058463335037, train_loss=0.03893844783306122, time_cost=5.928025960922241
+
Steps: 0%| | 3302/1000000 [8:23:27<3264:53:59, 11.79s/it, lr=1e-5, step_loss=0.0313]
Steps: 0%| | 3303/1000000 [8:23:33<2776:48:57, 10.03s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [3303], local_loss=0.012108701281249523, train_loss=0.06368209421634674, time_cost=4.23870062828064
+
Steps: 0%| | 3303/1000000 [8:23:33<2776:48:57, 10.03s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 3304/1000000 [8:23:51<3421:36:12, 12.36s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [3304], local_loss=0.032332394272089005, train_loss=0.04955708235502243, time_cost=14.3004150390625
+
Steps: 0%| | 3304/1000000 [8:23:51<3421:36:12, 12.36s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 3305/1000000 [8:23:56<2836:28:23, 10.25s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [3305], local_loss=0.034126315265893936, train_loss=0.038732580840587616, time_cost=1.2359790802001953
+
Steps: 0%| | 3305/1000000 [8:23:56<2836:28:23, 10.25s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 3306/1000000 [8:24:01<2428:36:23, 8.77s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [3306], local_loss=0.0595097616314888, train_loss=0.06114517152309418, time_cost=2.174734115600586
+
Steps: 0%| | 3306/1000000 [8:24:01<2428:36:23, 8.77s/it, lr=1e-5, step_loss=0.0595]
Steps: 0%| | 3307/1000000 [8:24:13<2694:50:20, 9.73s/it, lr=1e-5, step_loss=0.0595][RANK-0]: Step: [3307], local_loss=0.015294159762561321, train_loss=0.09338022023439407, time_cost=4.798076868057251
+
Steps: 0%| | 3307/1000000 [8:24:13<2694:50:20, 9.73s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 3308/1000000 [8:24:24<2775:28:49, 10.02s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [3308], local_loss=0.017623327672481537, train_loss=0.020590942353010178, time_cost=1.5216364860534668
+
Steps: 0%| | 3308/1000000 [8:24:24<2775:28:49, 10.02s/it, lr=1e-5, step_loss=0.0176]
Steps: 0%| | 3309/1000000 [8:24:37<2995:31:37, 10.82s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [3309], local_loss=0.025683315470814705, train_loss=0.027038760483264923, time_cost=5.348963499069214
+
Steps: 0%| | 3309/1000000 [8:24:37<2995:31:37, 10.82s/it, lr=1e-5, step_loss=0.0257]
Steps: 0%| | 3310/1000000 [8:24:50<3242:11:31, 11.71s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [3310], local_loss=0.3929520845413208, train_loss=0.15574710071086884, time_cost=3.9135231971740723
+
Steps: 0%| | 3310/1000000 [8:24:50<3242:11:31, 11.71s/it, lr=1e-5, step_loss=0.393]
Steps: 0%| | 3311/1000000 [8:24:56<2700:22:53, 9.75s/it, lr=1e-5, step_loss=0.393][RANK-0]: Step: [3311], local_loss=0.059537407010793686, train_loss=0.02884018048644066, time_cost=3.253955364227295
+
Steps: 0%| | 3311/1000000 [8:24:56<2700:22:53, 9.75s/it, lr=1e-5, step_loss=0.0595]
Steps: 0%| | 3312/1000000 [8:25:07<2867:38:41, 10.36s/it, lr=1e-5, step_loss=0.0595][RANK-0]: Step: [3312], local_loss=0.024215644225478172, train_loss=0.044107601046562195, time_cost=9.2679443359375
+
Steps: 0%| | 3312/1000000 [8:25:07<2867:38:41, 10.36s/it, lr=1e-5, step_loss=0.0242]
Steps: 0%| | 3313/1000000 [8:25:17<2815:48:26, 10.17s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [3313], local_loss=0.05573764070868492, train_loss=0.044321514666080475, time_cost=7.783771276473999
+
Steps: 0%| | 3313/1000000 [8:25:17<2815:48:26, 10.17s/it, lr=1e-5, step_loss=0.0557]
Steps: 0%| | 3314/1000000 [8:25:23<2445:42:50, 8.83s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [3314], local_loss=0.1199340969324112, train_loss=0.1751781404018402, time_cost=3.2911150455474854
+
Steps: 0%| | 3314/1000000 [8:25:23<2445:42:50, 8.83s/it, lr=1e-5, step_loss=0.12]
Steps: 0%| | 3315/1000000 [8:25:28<2183:25:17, 7.89s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [3315], local_loss=0.016013741493225098, train_loss=0.03962506726384163, time_cost=2.98976731300354
+
Steps: 0%| | 3315/1000000 [8:25:28<2183:25:17, 7.89s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 3316/1000000 [8:25:45<2869:09:35, 10.36s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [3316], local_loss=0.05583431199193001, train_loss=0.05524813383817673, time_cost=7.211406946182251
+
Steps: 0%| | 3316/1000000 [8:25:45<2869:09:35, 10.36s/it, lr=1e-5, step_loss=0.0558]
Steps: 0%| | 3317/1000000 [8:25:50<2434:32:41, 8.79s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [3317], local_loss=0.024096636101603508, train_loss=0.029802266508340836, time_cost=1.5422618389129639
+
Steps: 0%| | 3317/1000000 [8:25:50<2434:32:41, 8.79s/it, lr=1e-5, step_loss=0.0241]
Steps: 0%| | 3318/1000000 [8:25:55<2138:08:38, 7.72s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [3318], local_loss=0.016139784827828407, train_loss=12.303934097290039, time_cost=2.2562410831451416
+
Steps: 0%| | 3318/1000000 [8:25:55<2138:08:38, 7.72s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 3319/1000000 [8:26:09<2658:43:20, 9.60s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [3319], local_loss=0.03786972165107727, train_loss=0.47497090697288513, time_cost=5.486952781677246
+
Steps: 0%| | 3319/1000000 [8:26:09<2658:43:20, 9.60s/it, lr=1e-5, step_loss=0.0379]
Steps: 0%| | 3320/1000000 [8:26:23<2996:14:02, 10.82s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [3320], local_loss=0.01887901872396469, train_loss=0.0427607037127018, time_cost=1.2333462238311768
+
Steps: 0%| | 3320/1000000 [8:26:23<2996:14:02, 10.82s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 3321/1000000 [8:26:34<3012:21:56, 10.88s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [3321], local_loss=0.015166735276579857, train_loss=0.022648721933364868, time_cost=2.5204224586486816
+
Steps: 0%| | 3321/1000000 [8:26:34<3012:21:56, 10.88s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 3322/1000000 [8:26:46<3124:33:58, 11.29s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [3322], local_loss=0.016349148005247116, train_loss=0.03121432289481163, time_cost=4.592052221298218
+
Steps: 0%| | 3322/1000000 [8:26:46<3124:33:58, 11.29s/it, lr=1e-5, step_loss=0.0163]
Steps: 0%| | 3323/1000000 [8:26:52<2675:26:26, 9.66s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [3323], local_loss=0.016818774864077568, train_loss=15.266578674316406, time_cost=1.2221558094024658
+
Steps: 0%| | 3323/1000000 [8:26:52<2675:26:26, 9.66s/it, lr=1e-5, step_loss=0.0168]
Steps: 0%| | 3324/1000000 [8:27:06<3040:28:12, 10.98s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [3324], local_loss=139.38966369628906, train_loss=17.449201583862305, time_cost=4.575874090194702
+
Steps: 0%| | 3324/1000000 [8:27:06<3040:28:12, 10.98s/it, lr=1e-5, step_loss=139]
Steps: 0%| | 3325/1000000 [8:27:11<2548:35:16, 9.21s/it, lr=1e-5, step_loss=139][RANK-0]: Step: [3325], local_loss=0.11541596055030823, train_loss=0.05307187885046005, time_cost=2.246889591217041
+
Steps: 0%| | 3325/1000000 [8:27:11<2548:35:16, 9.21s/it, lr=1e-5, step_loss=0.115]
Steps: 0%| | 3326/1000000 [8:27:22<2689:57:51, 9.72s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [3326], local_loss=0.013552060350775719, train_loss=0.1495121419429779, time_cost=3.5951011180877686
+
Steps: 0%| | 3326/1000000 [8:27:22<2689:57:51, 9.72s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 3327/1000000 [8:27:29<2464:59:56, 8.90s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [3327], local_loss=0.04908354952931404, train_loss=0.154204323887825, time_cost=2.6119213104248047
+
Steps: 0%| | 3327/1000000 [8:27:29<2464:59:56, 8.90s/it, lr=1e-5, step_loss=0.0491]
Steps: 0%| | 3328/1000000 [8:27:44<2953:29:47, 10.67s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [3328], local_loss=0.2026163637638092, train_loss=0.06335968524217606, time_cost=11.868080139160156
+
Steps: 0%| | 3328/1000000 [8:27:44<2953:29:47, 10.67s/it, lr=1e-5, step_loss=0.203]
Steps: 0%| | 3329/1000000 [8:27:50<2571:21:04, 9.29s/it, lr=1e-5, step_loss=0.203][RANK-0]: Step: [3329], local_loss=0.05259421467781067, train_loss=0.036296188831329346, time_cost=2.3100907802581787
+
Steps: 0%| | 3329/1000000 [8:27:50<2571:21:04, 9.29s/it, lr=1e-5, step_loss=0.0526]
Steps: 0%| | 3330/1000000 [8:28:02<2839:21:51, 10.26s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [3330], local_loss=0.013307873159646988, train_loss=0.07709743082523346, time_cost=4.680156946182251
+
Steps: 0%| | 3330/1000000 [8:28:02<2839:21:51, 10.26s/it, lr=1e-5, step_loss=0.0133]
Steps: 0%| | 3331/1000000 [8:28:16<3130:13:04, 11.31s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [3331], local_loss=0.01819310151040554, train_loss=0.027058478444814682, time_cost=5.980935573577881
+
Steps: 0%| | 3331/1000000 [8:28:16<3130:13:04, 11.31s/it, lr=1e-5, step_loss=0.0182]
Steps: 0%| | 3332/1000000 [8:28:20<2547:07:32, 9.20s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [3332], local_loss=0.03684272617101669, train_loss=0.05802667886018753, time_cost=2.0901830196380615
+
Steps: 0%| | 3332/1000000 [8:28:20<2547:07:32, 9.20s/it, lr=1e-5, step_loss=0.0368]
Steps: 0%| | 3333/1000000 [8:28:28<2403:37:40, 8.68s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [3333], local_loss=0.06243172287940979, train_loss=0.0572376549243927, time_cost=1.218937873840332
+
Steps: 0%| | 3333/1000000 [8:28:28<2403:37:40, 8.68s/it, lr=1e-5, step_loss=0.0624]
Steps: 0%| | 3334/1000000 [8:28:35<2256:32:13, 8.15s/it, lr=1e-5, step_loss=0.0624][RANK-0]: Step: [3334], local_loss=0.032007671892642975, train_loss=0.06395363062620163, time_cost=2.9380598068237305
+
Steps: 0%| | 3334/1000000 [8:28:35<2256:32:13, 8.15s/it, lr=1e-5, step_loss=0.032]
Steps: 0%| | 3335/1000000 [8:28:39<1983:26:11, 7.16s/it, lr=1e-5, step_loss=0.032][RANK-0]: Step: [3335], local_loss=0.012395801022648811, train_loss=0.026380470022559166, time_cost=1.9662809371948242
+
Steps: 0%| | 3335/1000000 [8:28:39<1983:26:11, 7.16s/it, lr=1e-5, step_loss=0.0124]
Steps: 0%| | 3336/1000000 [8:28:45<1829:17:56, 6.61s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [3336], local_loss=0.06654807925224304, train_loss=0.06258866935968399, time_cost=3.901888608932495
+
Steps: 0%| | 3336/1000000 [8:28:45<1829:17:56, 6.61s/it, lr=1e-5, step_loss=0.0665]
Steps: 0%| | 3337/1000000 [8:28:52<1855:13:31, 6.70s/it, lr=1e-5, step_loss=0.0665][RANK-0]: Step: [3337], local_loss=0.026699813082814217, train_loss=0.06026356667280197, time_cost=1.2987520694732666
+
Steps: 0%| | 3337/1000000 [8:28:52<1855:13:31, 6.70s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 3338/1000000 [8:29:02<2116:21:38, 7.64s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [3338], local_loss=0.04117468744516373, train_loss=0.16549895703792572, time_cost=6.90658164024353
+
Steps: 0%| | 3338/1000000 [8:29:02<2116:21:38, 7.64s/it, lr=1e-5, step_loss=0.0412]
Steps: 0%| | 3339/1000000 [8:29:13<2399:20:52, 8.67s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [3339], local_loss=0.05335381254553795, train_loss=0.07456812262535095, time_cost=1.9495162963867188
+
Steps: 0%| | 3339/1000000 [8:29:13<2399:20:52, 8.67s/it, lr=1e-5, step_loss=0.0534]
Steps: 0%| | 3340/1000000 [8:29:28<2989:47:55, 10.80s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [3340], local_loss=0.013153879903256893, train_loss=0.08104047179222107, time_cost=4.905426979064941
+
Steps: 0%| | 3340/1000000 [8:29:28<2989:47:55, 10.80s/it, lr=1e-5, step_loss=0.0132]
Steps: 0%| | 3341/1000000 [8:29:42<3222:20:07, 11.64s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [3341], local_loss=0.17537623643875122, train_loss=0.09796109795570374, time_cost=4.252265214920044
+
Steps: 0%| | 3341/1000000 [8:29:42<3222:20:07, 11.64s/it, lr=1e-5, step_loss=0.175]
Steps: 0%| | 3342/1000000 [8:29:49<2827:32:37, 10.21s/it, lr=1e-5, step_loss=0.175][RANK-0]: Step: [3342], local_loss=0.017808062955737114, train_loss=0.06429591774940491, time_cost=5.034790754318237
+
Steps: 0%| | 3342/1000000 [8:29:49<2827:32:37, 10.21s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 3343/1000000 [8:29:59<2789:13:13, 10.07s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [3343], local_loss=0.06237893924117088, train_loss=0.03172094374895096, time_cost=1.2259125709533691
+
Steps: 0%| | 3343/1000000 [8:29:59<2789:13:13, 10.07s/it, lr=1e-5, step_loss=0.0624]
Steps: 0%| | 3344/1000000 [8:30:12<3084:25:17, 11.14s/it, lr=1e-5, step_loss=0.0624][RANK-0]: Step: [3344], local_loss=0.01167629286646843, train_loss=0.062275420874357224, time_cost=11.279526233673096
+
Steps: 0%| | 3344/1000000 [8:30:12<3084:25:17, 11.14s/it, lr=1e-5, step_loss=0.0117]
Steps: 0%| | 3345/1000000 [8:30:22<2948:14:01, 10.65s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [3345], local_loss=0.017703348770737648, train_loss=0.020848635584115982, time_cost=3.5578079223632812
+
Steps: 0%| | 3345/1000000 [8:30:22<2948:14:01, 10.65s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 3346/1000000 [8:30:36<3210:50:23, 11.60s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [3346], local_loss=0.09986984729766846, train_loss=0.28692948818206787, time_cost=5.82232666015625
+
Steps: 0%| | 3346/1000000 [8:30:36<3210:50:23, 11.60s/it, lr=1e-5, step_loss=0.0999]
Steps: 0%| | 3347/1000000 [8:30:40<2615:38:59, 9.45s/it, lr=1e-5, step_loss=0.0999][RANK-0]: Step: [3347], local_loss=0.015250416472554207, train_loss=0.03187251463532448, time_cost=1.710437536239624
+
Steps: 0%| | 3347/1000000 [8:30:40<2615:38:59, 9.45s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 3348/1000000 [8:30:51<2750:49:06, 9.94s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [3348], local_loss=0.10911056399345398, train_loss=0.10379558801651001, time_cost=9.42520546913147
+
Steps: 0%| | 3348/1000000 [8:30:51<2750:49:06, 9.94s/it, lr=1e-5, step_loss=0.109]
Steps: 0%| | 3349/1000000 [8:30:57<2390:21:40, 8.63s/it, lr=1e-5, step_loss=0.109][RANK-0]: Step: [3349], local_loss=0.09487305581569672, train_loss=0.13341189920902252, time_cost=1.206904411315918
+
Steps: 0%| | 3349/1000000 [8:30:57<2390:21:40, 8.63s/it, lr=1e-5, step_loss=0.0949]
Steps: 0%| | 3350/1000000 [8:31:01<2030:31:49, 7.33s/it, lr=1e-5, step_loss=0.0949][RANK-0]: Step: [3350], local_loss=0.025063106790184975, train_loss=0.0833359807729721, time_cost=1.2812132835388184
+
Steps: 0%| | 3350/1000000 [8:31:01<2030:31:49, 7.33s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 3351/1000000 [8:31:12<2301:01:43, 8.31s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [3351], local_loss=0.016935961320996284, train_loss=0.15924711525440216, time_cost=5.238463640213013
+
Steps: 0%| | 3351/1000000 [8:31:12<2301:01:43, 8.31s/it, lr=1e-5, step_loss=0.0169]
Steps: 0%| | 3352/1000000 [8:31:18<2165:57:20, 7.82s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [3352], local_loss=0.019474191591143608, train_loss=0.03382027894258499, time_cost=2.3339881896972656
+
Steps: 0%| | 3352/1000000 [8:31:18<2165:57:20, 7.82s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 3353/1000000 [8:31:24<2026:57:55, 7.32s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [3353], local_loss=0.015242405235767365, train_loss=0.046433303505182266, time_cost=2.546253204345703
+
Steps: 0%| | 3353/1000000 [8:31:24<2026:57:55, 7.32s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 3354/1000000 [8:31:29<1825:56:08, 6.60s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [3354], local_loss=0.06786108016967773, train_loss=0.05443952977657318, time_cost=1.3810458183288574
+
Steps: 0%| | 3354/1000000 [8:31:29<1825:56:08, 6.60s/it, lr=1e-5, step_loss=0.0679]
Steps: 0%| | 3355/1000000 [8:31:44<2488:34:56, 8.99s/it, lr=1e-5, step_loss=0.0679][RANK-0]: Step: [3355], local_loss=0.053769126534461975, train_loss=0.03532664477825165, time_cost=1.2179789543151855
+
Steps: 0%| | 3355/1000000 [8:31:44<2488:34:56, 8.99s/it, lr=1e-5, step_loss=0.0538]
Steps: 0%| | 3356/1000000 [8:31:57<2850:56:29, 10.30s/it, lr=1e-5, step_loss=0.0538][RANK-0]: Step: [3356], local_loss=0.016709405928850174, train_loss=0.07795452326536179, time_cost=3.5066449642181396
+
Steps: 0%| | 3356/1000000 [8:31:57<2850:56:29, 10.30s/it, lr=1e-5, step_loss=0.0167]
Steps: 0%| | 3357/1000000 [8:32:02<2419:06:59, 8.74s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [3357], local_loss=0.10727217048406601, train_loss=0.04094203561544418, time_cost=2.0234880447387695
+
Steps: 0%| | 3357/1000000 [8:32:02<2419:06:59, 8.74s/it, lr=1e-5, step_loss=0.107]
Steps: 0%| | 3358/1000000 [8:32:10<2303:42:37, 8.32s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [3358], local_loss=0.05364362522959709, train_loss=0.03934301435947418, time_cost=3.6254680156707764
+
Steps: 0%| | 3358/1000000 [8:32:10<2303:42:37, 8.32s/it, lr=1e-5, step_loss=0.0536]
Steps: 0%| | 3359/1000000 [8:32:14<1976:03:29, 7.14s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [3359], local_loss=0.06779676675796509, train_loss=0.0609564483165741, time_cost=1.5608251094818115
+
Steps: 0%| | 3359/1000000 [8:32:14<1976:03:29, 7.14s/it, lr=1e-5, step_loss=0.0678]
Steps: 0%| | 3360/1000000 [8:32:23<2165:50:29, 7.82s/it, lr=1e-5, step_loss=0.0678][RANK-0]: Step: [3360], local_loss=0.027519872412085533, train_loss=0.05474837124347687, time_cost=1.3638689517974854
+
Steps: 0%| | 3360/1000000 [8:32:23<2165:50:29, 7.82s/it, lr=1e-5, step_loss=0.0275]
Steps: 0%| | 3361/1000000 [8:32:36<2581:50:36, 9.33s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [3361], local_loss=0.011684310622513294, train_loss=0.06024907901883125, time_cost=3.4222583770751953
+
Steps: 0%| | 3361/1000000 [8:32:36<2581:50:36, 9.33s/it, lr=1e-5, step_loss=0.0117]
Steps: 0%| | 3362/1000000 [8:32:43<2380:51:25, 8.60s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [3362], local_loss=0.06154216080904007, train_loss=0.16183750331401825, time_cost=2.5296387672424316
+
Steps: 0%| | 3362/1000000 [8:32:43<2380:51:25, 8.60s/it, lr=1e-5, step_loss=0.0615]
Steps: 0%| | 3363/1000000 [8:32:50<2252:53:52, 8.14s/it, lr=1e-5, step_loss=0.0615][RANK-0]: Step: [3363], local_loss=0.027869494631886482, train_loss=0.07760129868984222, time_cost=1.5311672687530518
+
Steps: 0%| | 3363/1000000 [8:32:50<2252:53:52, 8.14s/it, lr=1e-5, step_loss=0.0279]
Steps: 0%| | 3364/1000000 [8:32:57<2164:20:59, 7.82s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [3364], local_loss=0.9940809011459351, train_loss=0.19053441286087036, time_cost=2.3750524520874023
+
Steps: 0%| | 3364/1000000 [8:32:57<2164:20:59, 7.82s/it, lr=1e-5, step_loss=0.994]
Steps: 0%| | 3365/1000000 [8:33:02<1875:58:22, 6.78s/it, lr=1e-5, step_loss=0.994][RANK-0]: Step: [3365], local_loss=0.01807643100619316, train_loss=0.0201499555259943, time_cost=1.473038911819458
+
Steps: 0%| | 3365/1000000 [8:33:02<1875:58:22, 6.78s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 3366/1000000 [8:33:07<1742:26:20, 6.29s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [3366], local_loss=0.015793295577168465, train_loss=0.07070877403020859, time_cost=2.594597339630127
+
Steps: 0%| | 3366/1000000 [8:33:07<1742:26:20, 6.29s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 3367/1000000 [8:33:13<1713:13:33, 6.19s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [3367], local_loss=0.01390753872692585, train_loss=0.022026201710104942, time_cost=1.7695622444152832
+
Steps: 0%| | 3367/1000000 [8:33:13<1713:13:33, 6.19s/it, lr=1e-5, step_loss=0.0139]
Steps: 0%| | 3368/1000000 [8:33:19<1724:50:59, 6.23s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [3368], local_loss=0.0343753956258297, train_loss=0.0695062130689621, time_cost=3.336643934249878
+
Steps: 0%| | 3368/1000000 [8:33:19<1724:50:59, 6.23s/it, lr=1e-5, step_loss=0.0344]
Steps: 0%| | 3369/1000000 [8:33:26<1790:27:29, 6.47s/it, lr=1e-5, step_loss=0.0344][RANK-0]: Step: [3369], local_loss=0.019162410870194435, train_loss=0.051860466599464417, time_cost=1.2892670631408691
+
Steps: 0%| | 3369/1000000 [8:33:26<1790:27:29, 6.47s/it, lr=1e-5, step_loss=0.0192]
Steps: 0%| | 3370/1000000 [8:33:31<1693:48:45, 6.12s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [3370], local_loss=0.03929290920495987, train_loss=0.03998710960149765, time_cost=1.5904150009155273
+
Steps: 0%| | 3370/1000000 [8:33:31<1693:48:45, 6.12s/it, lr=1e-5, step_loss=0.0393]
Steps: 0%| | 3371/1000000 [8:33:47<2500:32:15, 9.03s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [3371], local_loss=0.01904541626572609, train_loss=0.0843081995844841, time_cost=6.894887447357178
+
Steps: 0%| | 3371/1000000 [8:33:47<2500:32:15, 9.03s/it, lr=1e-5, step_loss=0.019]
Steps: 0%| | 3372/1000000 [8:33:58<2606:04:46, 9.41s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [3372], local_loss=0.026027698069810867, train_loss=0.035388536751270294, time_cost=2.71866774559021
+
Steps: 0%| | 3372/1000000 [8:33:58<2606:04:46, 9.41s/it, lr=1e-5, step_loss=0.026]
Steps: 0%| | 3373/1000000 [8:34:02<2177:15:29, 7.86s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [3373], local_loss=0.013615632429718971, train_loss=0.053427886217832565, time_cost=1.6106476783752441
+
Steps: 0%| | 3373/1000000 [8:34:02<2177:15:29, 7.86s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 3374/1000000 [8:34:08<2051:05:39, 7.41s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [3374], local_loss=0.05173899605870247, train_loss=0.0364375114440918, time_cost=2.847649097442627
+
Steps: 0%| | 3374/1000000 [8:34:08<2051:05:39, 7.41s/it, lr=1e-5, step_loss=0.0517]
Steps: 0%| | 3375/1000000 [8:34:12<1792:04:06, 6.47s/it, lr=1e-5, step_loss=0.0517][RANK-0]: Step: [3375], local_loss=0.018863534554839134, train_loss=0.04530065506696701, time_cost=1.474902868270874
+
Steps: 0%| | 3375/1000000 [8:34:12<1792:04:06, 6.47s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 3376/1000000 [8:34:18<1720:06:05, 6.21s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [3376], local_loss=0.014635995961725712, train_loss=0.04764946550130844, time_cost=1.2149841785430908
+
Steps: 0%| | 3376/1000000 [8:34:18<1720:06:05, 6.21s/it, lr=1e-5, step_loss=0.0146]
Steps: 0%| | 3377/1000000 [8:34:31<2308:30:23, 8.34s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [3377], local_loss=67.39790344238281, train_loss=8.578948974609375, time_cost=5.6053876876831055
+
Steps: 0%| | 3377/1000000 [8:34:31<2308:30:23, 8.34s/it, lr=1e-5, step_loss=67.4]
Steps: 0%| | 3378/1000000 [8:34:36<2035:20:48, 7.35s/it, lr=1e-5, step_loss=67.4][RANK-0]: Step: [3378], local_loss=0.012245915830135345, train_loss=0.02512577921152115, time_cost=2.3449742794036865
+
Steps: 0%| | 3378/1000000 [8:34:36<2035:20:48, 7.35s/it, lr=1e-5, step_loss=0.0122]
Steps: 0%| | 3379/1000000 [8:34:42<1916:26:25, 6.92s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [3379], local_loss=0.030924618244171143, train_loss=0.052680566906929016, time_cost=1.5226433277130127
+
Steps: 0%| | 3379/1000000 [8:34:42<1916:26:25, 6.92s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 3380/1000000 [8:34:50<1969:37:42, 7.11s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [3380], local_loss=0.09226825833320618, train_loss=0.038628339767456055, time_cost=3.6780312061309814
+
Steps: 0%| | 3380/1000000 [8:34:50<1969:37:42, 7.11s/it, lr=1e-5, step_loss=0.0923]
Steps: 0%| | 3381/1000000 [8:35:04<2578:26:33, 9.31s/it, lr=1e-5, step_loss=0.0923][RANK-0]: Step: [3381], local_loss=0.08451250195503235, train_loss=0.037195559591054916, time_cost=6.331456899642944
+
Steps: 0%| | 3381/1000000 [8:35:04<2578:26:33, 9.31s/it, lr=1e-5, step_loss=0.0845]
Steps: 0%| | 3382/1000000 [8:35:10<2311:51:38, 8.35s/it, lr=1e-5, step_loss=0.0845][RANK-0]: Step: [3382], local_loss=0.04274008050560951, train_loss=0.059016164392232895, time_cost=1.7750592231750488
+
Steps: 0%| | 3382/1000000 [8:35:10<2311:51:38, 8.35s/it, lr=1e-5, step_loss=0.0427]
Steps: 0%| | 3383/1000000 [8:35:16<2118:47:10, 7.65s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [3383], local_loss=0.035186149179935455, train_loss=0.0384206622838974, time_cost=1.5007107257843018
+
Steps: 0%| | 3383/1000000 [8:35:16<2118:47:10, 7.65s/it, lr=1e-5, step_loss=0.0352]
Steps: 0%| | 3384/1000000 [8:35:24<2072:23:49, 7.49s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [3384], local_loss=0.07374927401542664, train_loss=0.0407559871673584, time_cost=1.2363715171813965
+
Steps: 0%| | 3384/1000000 [8:35:24<2072:23:49, 7.49s/it, lr=1e-5, step_loss=0.0737]
Steps: 0%| | 3385/1000000 [8:35:29<1864:37:02, 6.74s/it, lr=1e-5, step_loss=0.0737][RANK-0]: Step: [3385], local_loss=0.02300139144062996, train_loss=0.05010340362787247, time_cost=3.9480977058410645
+
Steps: 0%| | 3385/1000000 [8:35:29<1864:37:02, 6.74s/it, lr=1e-5, step_loss=0.023]
Steps: 0%| | 3386/1000000 [8:35:38<2049:30:47, 7.40s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [3386], local_loss=0.03845422714948654, train_loss=0.030769070610404015, time_cost=2.732511281967163
+
Steps: 0%| | 3386/1000000 [8:35:38<2049:30:47, 7.40s/it, lr=1e-5, step_loss=0.0385]
Steps: 0%| | 3387/1000000 [8:35:47<2203:57:28, 7.96s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [3387], local_loss=0.014240737073123455, train_loss=0.19197320938110352, time_cost=1.2219901084899902
+
Steps: 0%| | 3387/1000000 [8:35:47<2203:57:28, 7.96s/it, lr=1e-5, step_loss=0.0142]
Steps: 0%| | 3388/1000000 [8:35:51<1915:26:19, 6.92s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [3388], local_loss=0.9839746356010437, train_loss=0.28497442603111267, time_cost=1.4336626529693604
+
Steps: 0%| | 3388/1000000 [8:35:51<1915:26:19, 6.92s/it, lr=1e-5, step_loss=0.984]
Steps: 0%| | 3389/1000000 [8:35:56<1749:04:41, 6.32s/it, lr=1e-5, step_loss=0.984][RANK-0]: Step: [3389], local_loss=0.025884998962283134, train_loss=0.0776614099740982, time_cost=3.681691884994507
+
Steps: 0%| | 3389/1000000 [8:35:56<1749:04:41, 6.32s/it, lr=1e-5, step_loss=0.0259]
Steps: 0%| | 3390/1000000 [8:36:08<2207:31:12, 7.97s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [3390], local_loss=0.029096147045493126, train_loss=0.04243740439414978, time_cost=4.0220489501953125
+
Steps: 0%| | 3390/1000000 [8:36:08<2207:31:12, 7.97s/it, lr=1e-5, step_loss=0.0291]
Steps: 0%| | 3391/1000000 [8:36:19<2449:21:44, 8.85s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [3391], local_loss=0.050203993916511536, train_loss=0.027059942483901978, time_cost=4.308856010437012
+
Steps: 0%| | 3391/1000000 [8:36:19<2449:21:44, 8.85s/it, lr=1e-5, step_loss=0.0502]
Steps: 0%| | 3392/1000000 [8:36:33<2862:40:56, 10.34s/it, lr=1e-5, step_loss=0.0502][RANK-0]: Step: [3392], local_loss=0.46770501136779785, train_loss=0.14288181066513062, time_cost=6.438949823379517
+
Steps: 0%| | 3392/1000000 [8:36:33<2862:40:56, 10.34s/it, lr=1e-5, step_loss=0.468]
Steps: 0%| | 3393/1000000 [8:36:38<2405:34:56, 8.69s/it, lr=1e-5, step_loss=0.468][RANK-0]: Step: [3393], local_loss=0.018358509987592697, train_loss=0.046475328505039215, time_cost=1.8600225448608398
+
Steps: 0%| | 3393/1000000 [8:36:38<2405:34:56, 8.69s/it, lr=1e-5, step_loss=0.0184]
Steps: 0%| | 3394/1000000 [8:36:43<2106:18:00, 7.61s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [3394], local_loss=0.11472180485725403, train_loss=0.039100587368011475, time_cost=2.74945068359375
+
Steps: 0%| | 3394/1000000 [8:36:43<2106:18:00, 7.61s/it, lr=1e-5, step_loss=0.115]
Steps: 0%| | 3395/1000000 [8:36:50<2073:51:50, 7.49s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [3395], local_loss=0.08245210349559784, train_loss=0.04827922582626343, time_cost=2.6905527114868164
+
Steps: 0%| | 3395/1000000 [8:36:50<2073:51:50, 7.49s/it, lr=1e-5, step_loss=0.0825]
Steps: 0%| | 3396/1000000 [8:37:04<2654:25:58, 9.59s/it, lr=1e-5, step_loss=0.0825][RANK-0]: Step: [3396], local_loss=0.03639720380306244, train_loss=0.05047110840678215, time_cost=5.269892454147339
+
Steps: 0%| | 3396/1000000 [8:37:04<2654:25:58, 9.59s/it, lr=1e-5, step_loss=0.0364]
Steps: 0%| | 3397/1000000 [8:37:09<2240:17:26, 8.09s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [3397], local_loss=0.03152336925268173, train_loss=0.05929271876811981, time_cost=1.26985502243042
+
Steps: 0%| | 3397/1000000 [8:37:09<2240:17:26, 8.09s/it, lr=1e-5, step_loss=0.0315]
Steps: 0%| | 3398/1000000 [8:37:15<2042:36:15, 7.38s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [3398], local_loss=0.028199590742588043, train_loss=0.08936063200235367, time_cost=1.5781059265136719
+
Steps: 0%| | 3398/1000000 [8:37:15<2042:36:15, 7.38s/it, lr=1e-5, step_loss=0.0282]
Steps: 0%| | 3399/1000000 [8:37:19<1817:58:03, 6.57s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [3399], local_loss=0.020820528268814087, train_loss=0.03504562005400658, time_cost=2.3024790287017822
+
Steps: 0%| | 3399/1000000 [8:37:19<1817:58:03, 6.57s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 3400/1000000 [8:37:31<2279:54:51, 8.24s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [3400], local_loss=0.02547704428434372, train_loss=0.07838019728660583, time_cost=1.2096397876739502
+
Steps: 0%| | 3400/1000000 [8:37:31<2279:54:51, 8.24s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 3401/1000000 [8:37:43<2544:18:50, 9.19s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [3401], local_loss=0.06121879070997238, train_loss=0.04034113883972168, time_cost=1.8869588375091553
+
Steps: 0%| | 3401/1000000 [8:37:43<2544:18:50, 9.19s/it, lr=1e-5, step_loss=0.0612]
Steps: 0%| | 3402/1000000 [8:37:52<2517:39:02, 9.09s/it, lr=1e-5, step_loss=0.0612][RANK-0]: Step: [3402], local_loss=0.020334480330348015, train_loss=0.030498333275318146, time_cost=6.640599727630615
+
Steps: 0%| | 3402/1000000 [8:37:52<2517:39:02, 9.09s/it, lr=1e-5, step_loss=0.0203]
Steps: 0%| | 3403/1000000 [8:38:05<2896:46:10, 10.46s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [3403], local_loss=0.06408810615539551, train_loss=0.042315490543842316, time_cost=1.2231569290161133
+
Steps: 0%| | 3403/1000000 [8:38:05<2896:46:10, 10.46s/it, lr=1e-5, step_loss=0.0641]
Steps: 0%| | 3404/1000000 [8:38:09<2367:10:41, 8.55s/it, lr=1e-5, step_loss=0.0641][RANK-0]: Step: [3404], local_loss=0.04106086120009422, train_loss=0.02469760924577713, time_cost=1.3702361583709717
+
Steps: 0%| | 3404/1000000 [8:38:09<2367:10:41, 8.55s/it, lr=1e-5, step_loss=0.0411]
Steps: 0%| | 3405/1000000 [8:38:21<2578:17:33, 9.31s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [3405], local_loss=0.05892724543809891, train_loss=0.03476101532578468, time_cost=1.2252697944641113
+
Steps: 0%| | 3405/1000000 [8:38:21<2578:17:33, 9.31s/it, lr=1e-5, step_loss=0.0589]
Steps: 0%| | 3406/1000000 [8:38:25<2181:43:58, 7.88s/it, lr=1e-5, step_loss=0.0589][RANK-0]: Step: [3406], local_loss=0.07950518280267715, train_loss=0.18701356649398804, time_cost=1.9502735137939453
+
Steps: 0%| | 3406/1000000 [8:38:25<2181:43:58, 7.88s/it, lr=1e-5, step_loss=0.0795]
Steps: 0%| | 3407/1000000 [8:38:33<2214:59:24, 8.00s/it, lr=1e-5, step_loss=0.0795][RANK-0]: Step: [3407], local_loss=0.023174680769443512, train_loss=0.13798478245735168, time_cost=4.683525085449219
+
Steps: 0%| | 3407/1000000 [8:38:33<2214:59:24, 8.00s/it, lr=1e-5, step_loss=0.0232]
Steps: 0%| | 3408/1000000 [8:38:39<2035:33:46, 7.35s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [3408], local_loss=0.1660090982913971, train_loss=0.18505708873271942, time_cost=1.7552542686462402
+
Steps: 0%| | 3408/1000000 [8:38:39<2035:33:46, 7.35s/it, lr=1e-5, step_loss=0.166]
Steps: 0%| | 3409/1000000 [8:38:45<1903:20:59, 6.88s/it, lr=1e-5, step_loss=0.166][RANK-0]: Step: [3409], local_loss=0.04062442481517792, train_loss=0.06445026397705078, time_cost=3.473259449005127
+
Steps: 0%| | 3409/1000000 [8:38:45<1903:20:59, 6.88s/it, lr=1e-5, step_loss=0.0406]
Steps: 0%| | 3410/1000000 [8:38:59<2484:09:35, 8.97s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [3410], local_loss=0.027673669159412384, train_loss=0.15188702940940857, time_cost=1.20582914352417
+
Steps: 0%| | 3410/1000000 [8:38:59<2484:09:35, 8.97s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 3411/1000000 [8:39:04<2151:15:14, 7.77s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [3411], local_loss=0.0243183933198452, train_loss=0.025174513459205627, time_cost=1.3384995460510254
+
Steps: 0%| | 3411/1000000 [8:39:04<2151:15:14, 7.77s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 3412/1000000 [8:39:09<1949:57:07, 7.04s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [3412], local_loss=0.06159336492419243, train_loss=0.0687936469912529, time_cost=2.4064362049102783
+
Steps: 0%| | 3412/1000000 [8:39:09<1949:57:07, 7.04s/it, lr=1e-5, step_loss=0.0616]
Steps: 0%| | 3413/1000000 [8:39:23<2518:21:48, 9.10s/it, lr=1e-5, step_loss=0.0616][RANK-0]: Step: [3413], local_loss=0.3353382647037506, train_loss=0.11877971887588501, time_cost=4.917224884033203
+
Steps: 0%| | 3413/1000000 [8:39:23<2518:21:48, 9.10s/it, lr=1e-5, step_loss=0.335]
Steps: 0%| | 3414/1000000 [8:39:34<2705:02:06, 9.77s/it, lr=1e-5, step_loss=0.335][RANK-0]: Step: [3414], local_loss=0.05112133175134659, train_loss=0.041803859174251556, time_cost=1.927844762802124
+
Steps: 0%| | 3414/1000000 [8:39:34<2705:02:06, 9.77s/it, lr=1e-5, step_loss=0.0511]
Steps: 0%| | 3415/1000000 [8:39:44<2716:35:54, 9.81s/it, lr=1e-5, step_loss=0.0511][RANK-0]: Step: [3415], local_loss=0.01162561122328043, train_loss=0.05149094760417938, time_cost=1.6549227237701416
+
Steps: 0%| | 3415/1000000 [8:39:44<2716:35:54, 9.81s/it, lr=1e-5, step_loss=0.0116]
Steps: 0%| | 3416/1000000 [8:39:53<2637:28:53, 9.53s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [3416], local_loss=0.015451465733349323, train_loss=0.02222488448023796, time_cost=1.3256275653839111
+
Steps: 0%| | 3416/1000000 [8:39:53<2637:28:53, 9.53s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 3417/1000000 [8:39:59<2309:23:42, 8.34s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [3417], local_loss=0.01780698448419571, train_loss=0.07240079343318939, time_cost=1.4797437191009521
+
Steps: 0%| | 3417/1000000 [8:39:59<2309:23:42, 8.34s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 3418/1000000 [8:40:05<2141:25:53, 7.74s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [3418], local_loss=0.06272036582231522, train_loss=0.07332921773195267, time_cost=4.888473987579346
+
Steps: 0%| | 3418/1000000 [8:40:05<2141:25:53, 7.74s/it, lr=1e-5, step_loss=0.0627]
Steps: 0%| | 3419/1000000 [8:40:11<1991:15:52, 7.19s/it, lr=1e-5, step_loss=0.0627][RANK-0]: Step: [3419], local_loss=0.039827801287174225, train_loss=0.05854010209441185, time_cost=2.2357022762298584
+
Steps: 0%| | 3419/1000000 [8:40:11<1991:15:52, 7.19s/it, lr=1e-5, step_loss=0.0398]
Steps: 0%| | 3420/1000000 [8:40:22<2320:58:26, 8.38s/it, lr=1e-5, step_loss=0.0398][RANK-0]: Step: [3420], local_loss=0.021164027974009514, train_loss=0.03299606218934059, time_cost=5.584777593612671
+
Steps: 0%| | 3420/1000000 [8:40:22<2320:58:26, 8.38s/it, lr=1e-5, step_loss=0.0212]
Steps: 0%| | 3421/1000000 [8:40:33<2504:08:09, 9.05s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [3421], local_loss=0.10901796817779541, train_loss=0.062308356165885925, time_cost=5.534416675567627
+
Steps: 0%| | 3421/1000000 [8:40:33<2504:08:09, 9.05s/it, lr=1e-5, step_loss=0.109]
Steps: 0%| | 3422/1000000 [8:40:42<2515:25:42, 9.09s/it, lr=1e-5, step_loss=0.109][RANK-0]: Step: [3422], local_loss=0.10388584434986115, train_loss=0.12684543430805206, time_cost=3.8150734901428223
+
Steps: 0%| | 3422/1000000 [8:40:42<2515:25:42, 9.09s/it, lr=1e-5, step_loss=0.104]
Steps: 0%| | 3423/1000000 [8:40:51<2540:12:23, 9.18s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [3423], local_loss=0.014205679297447205, train_loss=0.08892939984798431, time_cost=1.3409669399261475
+
Steps: 0%| | 3423/1000000 [8:40:51<2540:12:23, 9.18s/it, lr=1e-5, step_loss=0.0142]
Steps: 0%| | 3424/1000000 [8:41:02<2704:35:21, 9.77s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [3424], local_loss=0.07288903743028641, train_loss=0.06503474712371826, time_cost=3.5666871070861816
+
Steps: 0%| | 3424/1000000 [8:41:02<2704:35:21, 9.77s/it, lr=1e-5, step_loss=0.0729]
Steps: 0%| | 3425/1000000 [8:41:14<2820:29:36, 10.19s/it, lr=1e-5, step_loss=0.0729][RANK-0]: Step: [3425], local_loss=0.19424313306808472, train_loss=0.07144007086753845, time_cost=1.341646671295166
+
Steps: 0%| | 3425/1000000 [8:41:14<2820:29:36, 10.19s/it, lr=1e-5, step_loss=0.194]
Steps: 0%| | 3426/1000000 [8:41:19<2403:58:42, 8.68s/it, lr=1e-5, step_loss=0.194][RANK-0]: Step: [3426], local_loss=0.00999630056321621, train_loss=0.0595521405339241, time_cost=2.409489393234253
+
Steps: 0%| | 3426/1000000 [8:41:19<2403:58:42, 8.68s/it, lr=1e-5, step_loss=0.01]
Steps: 0%| | 3427/1000000 [8:41:30<2592:04:53, 9.36s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [3427], local_loss=0.08214406669139862, train_loss=0.07955446094274521, time_cost=1.8117077350616455
+
Steps: 0%| | 3427/1000000 [8:41:30<2592:04:53, 9.36s/it, lr=1e-5, step_loss=0.0821]
Steps: 0%| | 3428/1000000 [8:41:35<2235:43:45, 8.08s/it, lr=1e-5, step_loss=0.0821][RANK-0]: Step: [3428], local_loss=0.05350640043616295, train_loss=0.10758700966835022, time_cost=2.0075597763061523
+
Steps: 0%| | 3428/1000000 [8:41:35<2235:43:45, 8.08s/it, lr=1e-5, step_loss=0.0535]
Steps: 0%| | 3429/1000000 [8:41:46<2464:07:33, 8.90s/it, lr=1e-5, step_loss=0.0535][RANK-0]: Step: [3429], local_loss=0.22543689608573914, train_loss=0.10533547401428223, time_cost=2.4269235134124756
+
Steps: 0%| | 3429/1000000 [8:41:46<2464:07:33, 8.90s/it, lr=1e-5, step_loss=0.225]
Steps: 0%| | 3430/1000000 [8:41:53<2309:28:22, 8.34s/it, lr=1e-5, step_loss=0.225][RANK-0]: Step: [3430], local_loss=0.053374022245407104, train_loss=0.088409423828125, time_cost=5.109272718429565
+
Steps: 0%| | 3430/1000000 [8:41:53<2309:28:22, 8.34s/it, lr=1e-5, step_loss=0.0534]
Steps: 0%| | 3431/1000000 [8:41:57<1971:02:45, 7.12s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [3431], local_loss=0.02770686335861683, train_loss=0.0402505025267601, time_cost=1.2185711860656738
+
Steps: 0%| | 3431/1000000 [8:41:57<1971:02:45, 7.12s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 3432/1000000 [8:42:07<2213:36:35, 8.00s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [3432], local_loss=0.049027957022190094, train_loss=0.05442951247096062, time_cost=1.8084943294525146
+
Steps: 0%| | 3432/1000000 [8:42:07<2213:36:35, 8.00s/it, lr=1e-5, step_loss=0.049]
Steps: 0%| | 3433/1000000 [8:42:11<1919:59:06, 6.94s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [3433], local_loss=0.024358371272683144, train_loss=0.023817364126443863, time_cost=3.330615520477295
+
Steps: 0%| | 3433/1000000 [8:42:11<1919:59:06, 6.94s/it, lr=1e-5, step_loss=0.0244]
Steps: 0%| | 3434/1000000 [8:42:28<2710:32:44, 9.79s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [3434], local_loss=0.03140672296285629, train_loss=0.02653970569372177, time_cost=8.3403160572052
+
Steps: 0%| | 3434/1000000 [8:42:28<2710:32:44, 9.79s/it, lr=1e-5, step_loss=0.0314]
Steps: 0%| | 3435/1000000 [8:42:38<2750:39:46, 9.94s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [3435], local_loss=0.03829089552164078, train_loss=0.043202128261327744, time_cost=1.2069337368011475
+
Steps: 0%| | 3435/1000000 [8:42:38<2750:39:46, 9.94s/it, lr=1e-5, step_loss=0.0383]
Steps: 0%| | 3436/1000000 [8:42:43<2307:36:22, 8.34s/it, lr=1e-5, step_loss=0.0383][RANK-0]: Step: [3436], local_loss=0.03644388169050217, train_loss=0.03290265053510666, time_cost=1.9747819900512695
+
Steps: 0%| | 3436/1000000 [8:42:43<2307:36:22, 8.34s/it, lr=1e-5, step_loss=0.0364]
Steps: 0%| | 3437/1000000 [8:42:53<2485:15:23, 8.98s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [3437], local_loss=0.10451607406139374, train_loss=0.06221088767051697, time_cost=1.670198678970337
+
Steps: 0%| | 3437/1000000 [8:42:53<2485:15:23, 8.98s/it, lr=1e-5, step_loss=0.105]
Steps: 0%| | 3438/1000000 [8:42:59<2174:46:03, 7.86s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [3438], local_loss=0.05666060000658035, train_loss=0.07895441353321075, time_cost=1.5409550666809082
+
Steps: 0%| | 3438/1000000 [8:42:59<2174:46:03, 7.86s/it, lr=1e-5, step_loss=0.0567]
Steps: 0%| | 3439/1000000 [8:43:03<1857:15:12, 6.71s/it, lr=1e-5, step_loss=0.0567][RANK-0]: Step: [3439], local_loss=0.047564174979925156, train_loss=0.057667918503284454, time_cost=3.0637588500976562
+
Steps: 0%| | 3439/1000000 [8:43:03<1857:15:12, 6.71s/it, lr=1e-5, step_loss=0.0476]
Steps: 0%| | 3440/1000000 [8:43:08<1728:20:31, 6.24s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [3440], local_loss=0.016328420490026474, train_loss=0.05178527161478996, time_cost=1.2381610870361328
+
Steps: 0%| | 3440/1000000 [8:43:08<1728:20:31, 6.24s/it, lr=1e-5, step_loss=0.0163]
Steps: 0%| | 3441/1000000 [8:43:15<1831:57:11, 6.62s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [3441], local_loss=0.04737379029393196, train_loss=0.03426579385995865, time_cost=3.4067370891571045
+
Steps: 0%| | 3441/1000000 [8:43:15<1831:57:11, 6.62s/it, lr=1e-5, step_loss=0.0474]
Steps: 0%| | 3442/1000000 [8:43:20<1673:18:37, 6.04s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [3442], local_loss=0.0208548866212368, train_loss=8.525713920593262, time_cost=1.9287426471710205
+
Steps: 0%| | 3442/1000000 [8:43:20<1673:18:37, 6.04s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 3443/1000000 [8:43:25<1566:57:03, 5.66s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [3443], local_loss=0.03831246867775917, train_loss=0.04300557076931, time_cost=1.9355947971343994
+
Steps: 0%| | 3443/1000000 [8:43:25<1566:57:03, 5.66s/it, lr=1e-5, step_loss=0.0383]
Steps: 0%| | 3444/1000000 [8:43:32<1670:26:58, 6.03s/it, lr=1e-5, step_loss=0.0383][RANK-0]: Step: [3444], local_loss=0.031166909262537956, train_loss=0.06501492857933044, time_cost=1.2456467151641846
+
Steps: 0%| | 3444/1000000 [8:43:32<1670:26:58, 6.03s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 3445/1000000 [8:43:38<1723:12:06, 6.22s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [3445], local_loss=0.020122159272432327, train_loss=0.042213208973407745, time_cost=1.9337494373321533
+
Steps: 0%| | 3445/1000000 [8:43:38<1723:12:06, 6.22s/it, lr=1e-5, step_loss=0.0201]
Steps: 0%| | 3446/1000000 [8:43:47<1969:09:26, 7.11s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [3446], local_loss=32.84366989135742, train_loss=4.2653303146362305, time_cost=3.666374444961548
+
Steps: 0%| | 3446/1000000 [8:43:47<1969:09:26, 7.11s/it, lr=1e-5, step_loss=32.8]
Steps: 0%| | 3447/1000000 [8:43:57<2161:29:39, 7.81s/it, lr=1e-5, step_loss=32.8][RANK-0]: Step: [3447], local_loss=0.021292025223374367, train_loss=0.035765428096055984, time_cost=7.130367755889893
+
Steps: 0%| | 3447/1000000 [8:43:57<2161:29:39, 7.81s/it, lr=1e-5, step_loss=0.0213]
Steps: 0%| | 3448/1000000 [8:44:06<2298:46:50, 8.30s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [3448], local_loss=0.09819084405899048, train_loss=0.05813048779964447, time_cost=1.2534990310668945
+
Steps: 0%| | 3448/1000000 [8:44:06<2298:46:50, 8.30s/it, lr=1e-5, step_loss=0.0982]
Steps: 0%| | 3449/1000000 [8:44:11<1979:00:59, 7.15s/it, lr=1e-5, step_loss=0.0982][RANK-0]: Step: [3449], local_loss=0.019531406462192535, train_loss=0.09648434072732925, time_cost=1.7299532890319824
+
Steps: 0%| | 3449/1000000 [8:44:11<1979:00:59, 7.15s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 3450/1000000 [8:44:17<1875:33:21, 6.78s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [3450], local_loss=0.05757904052734375, train_loss=0.07862862199544907, time_cost=3.4467718601226807
+
Steps: 0%| | 3450/1000000 [8:44:17<1875:33:21, 6.78s/it, lr=1e-5, step_loss=0.0576]
Steps: 0%| | 3451/1000000 [8:44:22<1752:56:21, 6.33s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [3451], local_loss=0.05175270140171051, train_loss=0.04730217531323433, time_cost=1.4791572093963623
+
Steps: 0%| | 3451/1000000 [8:44:22<1752:56:21, 6.33s/it, lr=1e-5, step_loss=0.0518]
Steps: 0%| | 3452/1000000 [8:44:35<2300:23:41, 8.31s/it, lr=1e-5, step_loss=0.0518][RANK-0]: Step: [3452], local_loss=0.04845167323946953, train_loss=0.1585320085287094, time_cost=4.7133355140686035
+
Steps: 0%| | 3452/1000000 [8:44:35<2300:23:41, 8.31s/it, lr=1e-5, step_loss=0.0485]
Steps: 0%| | 3453/1000000 [8:44:43<2243:48:30, 8.11s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [3453], local_loss=0.024347221478819847, train_loss=0.041851237416267395, time_cost=1.8430240154266357
+
Steps: 0%| | 3453/1000000 [8:44:43<2243:48:30, 8.11s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 3454/1000000 [8:44:53<2475:01:33, 8.94s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [3454], local_loss=0.01655418798327446, train_loss=0.17235776782035828, time_cost=4.121024131774902
+
Steps: 0%| | 3454/1000000 [8:44:53<2475:01:33, 8.94s/it, lr=1e-5, step_loss=0.0166]
Steps: 0%| | 3455/1000000 [8:45:02<2479:57:22, 8.96s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [3455], local_loss=0.04516025632619858, train_loss=0.054201070219278336, time_cost=3.2231850624084473
+
Steps: 0%| | 3455/1000000 [8:45:02<2479:57:22, 8.96s/it, lr=1e-5, step_loss=0.0452]
Steps: 0%| | 3456/1000000 [8:45:07<2084:37:44, 7.53s/it, lr=1e-5, step_loss=0.0452][RANK-0]: Step: [3456], local_loss=0.029828008264303207, train_loss=0.06421539187431335, time_cost=1.464176893234253
+
Steps: 0%| | 3456/1000000 [8:45:07<2084:37:44, 7.53s/it, lr=1e-5, step_loss=0.0298]
Steps: 0%| | 3457/1000000 [8:45:11<1856:59:27, 6.71s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [3457], local_loss=0.04843825101852417, train_loss=0.042976751923561096, time_cost=1.956632375717163
+
Steps: 0%| | 3457/1000000 [8:45:11<1856:59:27, 6.71s/it, lr=1e-5, step_loss=0.0484]
Steps: 0%| | 3458/1000000 [8:45:18<1866:17:28, 6.74s/it, lr=1e-5, step_loss=0.0484][RANK-0]: Step: [3458], local_loss=0.03357252851128578, train_loss=0.060569215565919876, time_cost=2.69958758354187
+
Steps: 0%| | 3458/1000000 [8:45:18<1866:17:28, 6.74s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 3459/1000000 [8:45:32<2415:09:40, 8.72s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [3459], local_loss=0.012122506275773048, train_loss=0.02965167909860611, time_cost=5.882936477661133
+
Steps: 0%| | 3459/1000000 [8:45:32<2415:09:40, 8.72s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 3460/1000000 [8:45:37<2133:02:36, 7.71s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [3460], local_loss=0.01848457008600235, train_loss=0.12897169589996338, time_cost=1.5030620098114014
+
Steps: 0%| | 3460/1000000 [8:45:37<2133:02:36, 7.71s/it, lr=1e-5, step_loss=0.0185]
Steps: 0%| | 3461/1000000 [8:45:41<1850:20:42, 6.68s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [3461], local_loss=0.014186017215251923, train_loss=0.04940872639417648, time_cost=1.2233526706695557
+
Steps: 0%| | 3461/1000000 [8:45:41<1850:20:42, 6.68s/it, lr=1e-5, step_loss=0.0142]
Steps: 0%| | 3462/1000000 [8:45:50<2054:18:27, 7.42s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [3462], local_loss=0.022682450711727142, train_loss=0.18031617999076843, time_cost=3.96116042137146
+
Steps: 0%| | 3462/1000000 [8:45:50<2054:18:27, 7.42s/it, lr=1e-5, step_loss=0.0227]
Steps: 0%| | 3463/1000000 [8:45:59<2194:32:43, 7.93s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [3463], local_loss=0.04514038562774658, train_loss=0.05974818766117096, time_cost=1.4927077293395996
+
Steps: 0%| | 3463/1000000 [8:45:59<2194:32:43, 7.93s/it, lr=1e-5, step_loss=0.0451]
Steps: 0%| | 3464/1000000 [8:46:13<2673:09:28, 9.66s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [3464], local_loss=0.08322951942682266, train_loss=0.06563340872526169, time_cost=7.454133749008179
+
Steps: 0%| | 3464/1000000 [8:46:13<2673:09:28, 9.66s/it, lr=1e-5, step_loss=0.0832]
Steps: 0%| | 3465/1000000 [8:46:23<2660:15:33, 9.61s/it, lr=1e-5, step_loss=0.0832][RANK-0]: Step: [3465], local_loss=0.06776773929595947, train_loss=0.041534677147865295, time_cost=1.6308894157409668
+
Steps: 0%| | 3465/1000000 [8:46:23<2660:15:33, 9.61s/it, lr=1e-5, step_loss=0.0678]
Steps: 0%| | 3466/1000000 [8:46:29<2361:25:21, 8.53s/it, lr=1e-5, step_loss=0.0678][RANK-0]: Step: [3466], local_loss=0.01133608166128397, train_loss=0.030963584780693054, time_cost=1.20947265625
+
Steps: 0%| | 3466/1000000 [8:46:29<2361:25:21, 8.53s/it, lr=1e-5, step_loss=0.0113]
Steps: 0%| | 3467/1000000 [8:46:36<2255:39:49, 8.15s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [3467], local_loss=0.04985203966498375, train_loss=0.05694662034511566, time_cost=1.4694418907165527
+
Steps: 0%| | 3467/1000000 [8:46:36<2255:39:49, 8.15s/it, lr=1e-5, step_loss=0.0499]
Steps: 0%| | 3468/1000000 [8:46:45<2312:43:31, 8.35s/it, lr=1e-5, step_loss=0.0499][RANK-0]: Step: [3468], local_loss=0.021605296060442924, train_loss=0.07276614010334015, time_cost=1.559206247329712
+
Steps: 0%| | 3468/1000000 [8:46:45<2312:43:31, 8.35s/it, lr=1e-5, step_loss=0.0216]
Steps: 0%| | 3469/1000000 [8:46:50<2018:32:47, 7.29s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [3469], local_loss=0.029935412108898163, train_loss=0.022455299273133278, time_cost=1.8489160537719727
+
Steps: 0%| | 3469/1000000 [8:46:50<2018:32:47, 7.29s/it, lr=1e-5, step_loss=0.0299]
Steps: 0%| | 3470/1000000 [8:46:57<2039:50:32, 7.37s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [3470], local_loss=0.026797588914632797, train_loss=0.06150238960981369, time_cost=4.035168409347534
+
Steps: 0%| | 3470/1000000 [8:46:57<2039:50:32, 7.37s/it, lr=1e-5, step_loss=0.0268]
Steps: 0%| | 3471/1000000 [8:47:06<2140:30:33, 7.73s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [3471], local_loss=0.02476552128791809, train_loss=0.08588646352291107, time_cost=2.6534523963928223
+
Steps: 0%| | 3471/1000000 [8:47:06<2140:30:33, 7.73s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 3472/1000000 [8:47:18<2518:42:23, 9.10s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [3472], local_loss=0.07770270109176636, train_loss=0.056281279772520065, time_cost=5.512389659881592
+
Steps: 0%| | 3472/1000000 [8:47:18<2518:42:23, 9.10s/it, lr=1e-5, step_loss=0.0777]
Steps: 0%| | 3473/1000000 [8:47:33<3018:16:38, 10.90s/it, lr=1e-5, step_loss=0.0777][RANK-0]: Step: [3473], local_loss=0.018675200641155243, train_loss=0.04102929309010506, time_cost=6.030207872390747
+
Steps: 0%| | 3473/1000000 [8:47:33<3018:16:38, 10.90s/it, lr=1e-5, step_loss=0.0187]
Steps: 0%| | 3474/1000000 [8:47:40<2694:41:30, 9.73s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [3474], local_loss=0.02377822808921337, train_loss=0.06069839745759964, time_cost=2.9781837463378906
+
Steps: 0%| | 3474/1000000 [8:47:40<2694:41:30, 9.73s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 3475/1000000 [8:47:48<2539:24:43, 9.17s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [3475], local_loss=0.010582626797258854, train_loss=0.04742119461297989, time_cost=2.4070756435394287
+
Steps: 0%| | 3475/1000000 [8:47:48<2539:24:43, 9.17s/it, lr=1e-5, step_loss=0.0106]
Steps: 0%| | 3476/1000000 [8:47:57<2503:15:43, 9.04s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [3476], local_loss=0.025205181911587715, train_loss=0.02543911710381508, time_cost=2.405099868774414
+
Steps: 0%| | 3476/1000000 [8:47:57<2503:15:43, 9.04s/it, lr=1e-5, step_loss=0.0252]
Steps: 0%| | 3477/1000000 [8:48:03<2262:02:47, 8.17s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [3477], local_loss=0.01744488626718521, train_loss=0.08368891477584839, time_cost=1.75978684425354
+
Steps: 0%| | 3477/1000000 [8:48:03<2262:02:47, 8.17s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 3478/1000000 [8:48:10<2183:49:05, 7.89s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [3478], local_loss=0.10635069757699966, train_loss=0.06125103309750557, time_cost=2.664804220199585
+
Steps: 0%| | 3478/1000000 [8:48:10<2183:49:05, 7.89s/it, lr=1e-5, step_loss=0.106]
Steps: 0%| | 3479/1000000 [8:48:21<2393:20:46, 8.65s/it, lr=1e-5, step_loss=0.106][RANK-0]: Step: [3479], local_loss=0.012044009752571583, train_loss=0.05025431513786316, time_cost=1.6240742206573486
+
Steps: 0%| | 3479/1000000 [8:48:21<2393:20:46, 8.65s/it, lr=1e-5, step_loss=0.012]
Steps: 0%| | 3480/1000000 [8:48:34<2781:42:33, 10.05s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [3480], local_loss=0.11970151960849762, train_loss=0.049780383706092834, time_cost=4.863812208175659
+
Steps: 0%| | 3480/1000000 [8:48:34<2781:42:33, 10.05s/it, lr=1e-5, step_loss=0.12]
Steps: 0%| | 3481/1000000 [8:48:40<2421:53:54, 8.75s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [3481], local_loss=0.03413568064570427, train_loss=0.039283089339733124, time_cost=1.7606041431427002
+
Steps: 0%| | 3481/1000000 [8:48:40<2421:53:54, 8.75s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 3482/1000000 [8:48:44<2049:01:23, 7.40s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [3482], local_loss=0.03551584482192993, train_loss=0.028497925028204918, time_cost=3.2157580852508545
+
Steps: 0%| | 3482/1000000 [8:48:44<2049:01:23, 7.40s/it, lr=1e-5, step_loss=0.0355]
Steps: 0%| | 3483/1000000 [8:48:48<1800:30:43, 6.50s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [3483], local_loss=0.03542240709066391, train_loss=0.07801760733127594, time_cost=1.2771718502044678
+
Steps: 0%| | 3483/1000000 [8:48:48<1800:30:43, 6.50s/it, lr=1e-5, step_loss=0.0354]
Steps: 0%| | 3484/1000000 [8:49:03<2520:58:18, 9.11s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [3484], local_loss=0.31384924054145813, train_loss=0.07632195204496384, time_cost=6.8229429721832275
+
Steps: 0%| | 3484/1000000 [8:49:03<2520:58:18, 9.11s/it, lr=1e-5, step_loss=0.314]
Steps: 0%| | 3485/1000000 [8:49:14<2668:50:45, 9.64s/it, lr=1e-5, step_loss=0.314][RANK-0]: Step: [3485], local_loss=0.02909272350370884, train_loss=0.04492553323507309, time_cost=2.1748921871185303
+
Steps: 0%| | 3485/1000000 [8:49:14<2668:50:45, 9.64s/it, lr=1e-5, step_loss=0.0291]
Steps: 0%| | 3486/1000000 [8:49:24<2667:21:41, 9.64s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [3486], local_loss=0.015960492193698883, train_loss=0.03174632787704468, time_cost=1.7404775619506836
+
Steps: 0%| | 3486/1000000 [8:49:24<2667:21:41, 9.64s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 3487/1000000 [8:49:30<2357:39:41, 8.52s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [3487], local_loss=0.028262421488761902, train_loss=0.08059005439281464, time_cost=1.2106826305389404
+
Steps: 0%| | 3487/1000000 [8:49:30<2357:39:41, 8.52s/it, lr=1e-5, step_loss=0.0283]
Steps: 0%| | 3488/1000000 [8:49:34<2039:03:12, 7.37s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [3488], local_loss=0.05683015286922455, train_loss=0.05089350417256355, time_cost=1.7412433624267578
+
Steps: 0%| | 3488/1000000 [8:49:35<2039:03:12, 7.37s/it, lr=1e-5, step_loss=0.0568]
Steps: 0%| | 3489/1000000 [8:49:40<1890:21:04, 6.83s/it, lr=1e-5, step_loss=0.0568][RANK-0]: Step: [3489], local_loss=0.05335086211562157, train_loss=0.18385110795497894, time_cost=3.2755889892578125
+
Steps: 0%| | 3489/1000000 [8:49:40<1890:21:04, 6.83s/it, lr=1e-5, step_loss=0.0534]
Steps: 0%| | 3490/1000000 [8:49:53<2376:43:24, 8.59s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [3490], local_loss=0.08785530924797058, train_loss=0.04061455279588699, time_cost=5.197603940963745
+
Steps: 0%| | 3490/1000000 [8:49:53<2376:43:24, 8.59s/it, lr=1e-5, step_loss=0.0879]
Steps: 0%| | 3491/1000000 [8:50:02<2444:19:21, 8.83s/it, lr=1e-5, step_loss=0.0879][RANK-0]: Step: [3491], local_loss=0.028137363493442535, train_loss=0.025418002158403397, time_cost=1.547912836074829
+
Steps: 0%| | 3491/1000000 [8:50:02<2444:19:21, 8.83s/it, lr=1e-5, step_loss=0.0281]
Steps: 0%| | 3492/1000000 [8:50:11<2475:51:26, 8.94s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [3492], local_loss=0.02203076146543026, train_loss=0.1294333040714264, time_cost=3.1822619438171387
+
Steps: 0%| | 3492/1000000 [8:50:11<2475:51:26, 8.94s/it, lr=1e-5, step_loss=0.022]
Steps: 0%| | 3493/1000000 [8:50:16<2098:33:06, 7.58s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [3493], local_loss=0.05324485898017883, train_loss=0.05618325248360634, time_cost=1.441584587097168
+
Steps: 0%| | 3493/1000000 [8:50:16<2098:33:06, 7.58s/it, lr=1e-5, step_loss=0.0532]
Steps: 0%| | 3494/1000000 [8:50:21<1932:31:00, 6.98s/it, lr=1e-5, step_loss=0.0532][RANK-0]: Step: [3494], local_loss=0.02261868491768837, train_loss=0.04632851108908653, time_cost=2.0149495601654053
+
Steps: 0%| | 3494/1000000 [8:50:21<1932:31:00, 6.98s/it, lr=1e-5, step_loss=0.0226]
Steps: 0%| | 3495/1000000 [8:50:27<1806:42:35, 6.53s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [3495], local_loss=0.02277509681880474, train_loss=0.0715290904045105, time_cost=2.407911539077759
+
Steps: 0%| | 3495/1000000 [8:50:27<1806:42:35, 6.53s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 3496/1000000 [8:50:38<2225:05:08, 8.04s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [3496], local_loss=0.3173542618751526, train_loss=0.12279729545116425, time_cost=5.265875339508057
+
Steps: 0%| | 3496/1000000 [8:50:38<2225:05:08, 8.04s/it, lr=1e-5, step_loss=0.317]
Steps: 0%| | 3497/1000000 [8:50:52<2658:36:36, 9.60s/it, lr=1e-5, step_loss=0.317][RANK-0]: Step: [3497], local_loss=0.01666433736681938, train_loss=0.09309183061122894, time_cost=4.705126047134399
+
Steps: 0%| | 3497/1000000 [8:50:52<2658:36:36, 9.60s/it, lr=1e-5, step_loss=0.0167]
Steps: 0%| | 3498/1000000 [8:50:57<2284:20:52, 8.25s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [3498], local_loss=0.02842288464307785, train_loss=0.021842114627361298, time_cost=2.552603006362915
+
Steps: 0%| | 3498/1000000 [8:50:57<2284:20:52, 8.25s/it, lr=1e-5, step_loss=0.0284]
Steps: 0%| | 3499/1000000 [8:51:02<2018:43:19, 7.29s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [3499], local_loss=0.08242326229810715, train_loss=0.16998204588890076, time_cost=2.0991978645324707
+
Steps: 0%| | 3499/1000000 [8:51:02<2018:43:19, 7.29s/it, lr=1e-5, step_loss=0.0824]
Steps: 0%| | 3500/1000000 [8:51:07<1848:59:53, 6.68s/it, lr=1e-5, step_loss=0.0824][RANK-0]: Step: [3500], local_loss=0.014789454638957977, train_loss=0.26140522956848145, time_cost=1.2727138996124268
+
Steps: 0%| | 3500/1000000 [8:51:07<1848:59:53, 6.68s/it, lr=1e-5, step_loss=0.0148]
Steps: 0%| | 3501/1000000 [8:51:12<1700:31:13, 6.14s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [3501], local_loss=0.12134665250778198, train_loss=0.04402308538556099, time_cost=1.9313080310821533
+
Steps: 0%| | 3501/1000000 [8:51:12<1700:31:13, 6.14s/it, lr=1e-5, step_loss=0.121]
Steps: 0%| | 3502/1000000 [8:51:22<2019:29:01, 7.30s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [3502], local_loss=0.0436968058347702, train_loss=0.039786696434020996, time_cost=2.022758722305298
+
Steps: 0%| | 3502/1000000 [8:51:22<2019:29:01, 7.30s/it, lr=1e-5, step_loss=0.0437]
Steps: 0%| | 3503/1000000 [8:51:40<2876:57:41, 10.39s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [3503], local_loss=0.03655977547168732, train_loss=0.03976959362626076, time_cost=9.120368719100952
+
Steps: 0%| | 3503/1000000 [8:51:40<2876:57:41, 10.39s/it, lr=1e-5, step_loss=0.0366]
Steps: 0%| | 3504/1000000 [8:51:52<3026:08:58, 10.93s/it, lr=1e-5, step_loss=0.0366][RANK-0]: Step: [3504], local_loss=0.023485561832785606, train_loss=0.03927011042833328, time_cost=5.3154542446136475
+
Steps: 0%| | 3504/1000000 [8:51:52<3026:08:58, 10.93s/it, lr=1e-5, step_loss=0.0235]
Steps: 0%| | 3505/1000000 [8:51:58<2614:31:14, 9.45s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [3505], local_loss=0.056938402354717255, train_loss=0.09550406038761139, time_cost=1.5551166534423828
+
Steps: 0%| | 3505/1000000 [8:51:58<2614:31:14, 9.45s/it, lr=1e-5, step_loss=0.0569]
Steps: 0%| | 3506/1000000 [8:52:06<2531:45:03, 9.15s/it, lr=1e-5, step_loss=0.0569][RANK-0]: Step: [3506], local_loss=0.0592515766620636, train_loss=0.07803124189376831, time_cost=4.251593112945557
+
Steps: 0%| | 3506/1000000 [8:52:06<2531:45:03, 9.15s/it, lr=1e-5, step_loss=0.0593]
Steps: 0%| | 3507/1000000 [8:52:15<2524:13:40, 9.12s/it, lr=1e-5, step_loss=0.0593][RANK-0]: Step: [3507], local_loss=0.02204268053174019, train_loss=0.05049612373113632, time_cost=1.4308526515960693
+
Steps: 0%| | 3507/1000000 [8:52:15<2524:13:40, 9.12s/it, lr=1e-5, step_loss=0.022]
Steps: 0%| | 3508/1000000 [8:52:27<2711:22:01, 9.80s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [3508], local_loss=0.03795243054628372, train_loss=0.039695583283901215, time_cost=1.2287859916687012
+
Steps: 0%| | 3508/1000000 [8:52:27<2711:22:01, 9.80s/it, lr=1e-5, step_loss=0.038]
Steps: 0%| | 3509/1000000 [8:52:34<2508:29:48, 9.06s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [3509], local_loss=0.1062769964337349, train_loss=0.08421832323074341, time_cost=3.1574575901031494
+
Steps: 0%| | 3509/1000000 [8:52:34<2508:29:48, 9.06s/it, lr=1e-5, step_loss=0.106]
Steps: 0%| | 3510/1000000 [8:52:40<2270:16:27, 8.20s/it, lr=1e-5, step_loss=0.106][RANK-0]: Step: [3510], local_loss=0.04263801872730255, train_loss=0.05030182749032974, time_cost=1.7945067882537842
+
Steps: 0%| | 3510/1000000 [8:52:40<2270:16:27, 8.20s/it, lr=1e-5, step_loss=0.0426]
Steps: 0%| | 3511/1000000 [8:52:54<2740:53:00, 9.90s/it, lr=1e-5, step_loss=0.0426][RANK-0]: Step: [3511], local_loss=0.02530556172132492, train_loss=0.04124702140688896, time_cost=4.4695024490356445
+
Steps: 0%| | 3511/1000000 [8:52:54<2740:53:00, 9.90s/it, lr=1e-5, step_loss=0.0253]
Steps: 0%| | 3512/1000000 [8:53:09<3198:19:24, 11.55s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [3512], local_loss=0.025124413892626762, train_loss=0.024428430944681168, time_cost=1.2216296195983887
+
Steps: 0%| | 3512/1000000 [8:53:09<3198:19:24, 11.55s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 3513/1000000 [8:53:14<2604:57:32, 9.41s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [3513], local_loss=0.0345853716135025, train_loss=0.04093695059418678, time_cost=2.3374087810516357
+
Steps: 0%| | 3513/1000000 [8:53:14<2604:57:32, 9.41s/it, lr=1e-5, step_loss=0.0346]
Steps: 0%| | 3514/1000000 [8:53:20<2377:32:37, 8.59s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [3514], local_loss=0.021933505311608315, train_loss=0.10183163732290268, time_cost=2.3417491912841797
+
Steps: 0%| | 3514/1000000 [8:53:20<2377:32:37, 8.59s/it, lr=1e-5, step_loss=0.0219]
Steps: 0%| | 3515/1000000 [8:53:31<2574:00:04, 9.30s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [3515], local_loss=0.22747905552387238, train_loss=0.0640401691198349, time_cost=1.2275314331054688
+
Steps: 0%| | 3515/1000000 [8:53:31<2574:00:04, 9.30s/it, lr=1e-5, step_loss=0.227]
Steps: 0%| | 3516/1000000 [8:53:38<2385:02:42, 8.62s/it, lr=1e-5, step_loss=0.227][RANK-0]: Step: [3516], local_loss=0.03552500531077385, train_loss=0.05623335391283035, time_cost=1.2179555892944336
+
Steps: 0%| | 3516/1000000 [8:53:38<2385:02:42, 8.62s/it, lr=1e-5, step_loss=0.0355]
Steps: 0%| | 3517/1000000 [8:53:48<2441:27:42, 8.82s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [3517], local_loss=0.11469902098178864, train_loss=0.04021967574954033, time_cost=3.7043235301971436
+
Steps: 0%| | 3517/1000000 [8:53:48<2441:27:42, 8.82s/it, lr=1e-5, step_loss=0.115]
Steps: 0%| | 3518/1000000 [8:53:58<2565:39:14, 9.27s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [3518], local_loss=0.10711756348609924, train_loss=0.04540364071726799, time_cost=1.647867202758789
+
Steps: 0%| | 3518/1000000 [8:53:58<2565:39:14, 9.27s/it, lr=1e-5, step_loss=0.107]
Steps: 0%| | 3519/1000000 [8:54:10<2759:29:10, 9.97s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [3519], local_loss=0.01413641031831503, train_loss=0.016621988266706467, time_cost=1.2069239616394043
+
Steps: 0%| | 3519/1000000 [8:54:10<2759:29:10, 9.97s/it, lr=1e-5, step_loss=0.0141]
Steps: 0%| | 3520/1000000 [8:54:21<2857:09:40, 10.32s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [3520], local_loss=0.0379413440823555, train_loss=0.030574962496757507, time_cost=4.218746900558472
+
Steps: 0%| | 3520/1000000 [8:54:21<2857:09:40, 10.32s/it, lr=1e-5, step_loss=0.0379]
Steps: 0%| | 3521/1000000 [8:54:35<3150:28:45, 11.38s/it, lr=1e-5, step_loss=0.0379][RANK-0]: Step: [3521], local_loss=0.08022551983594894, train_loss=0.035607706755399704, time_cost=5.090507507324219
+
Steps: 0%| | 3521/1000000 [8:54:35<3150:28:45, 11.38s/it, lr=1e-5, step_loss=0.0802]
Steps: 0%| | 3522/1000000 [8:54:49<3373:01:27, 12.19s/it, lr=1e-5, step_loss=0.0802][RANK-0]: Step: [3522], local_loss=0.04473422095179558, train_loss=0.06316597759723663, time_cost=4.974416971206665
+
Steps: 0%| | 3522/1000000 [8:54:49<3373:01:27, 12.19s/it, lr=1e-5, step_loss=0.0447]
Steps: 0%| | 3523/1000000 [8:54:54<2768:27:09, 10.00s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [3523], local_loss=0.014674251899123192, train_loss=0.14740680158138275, time_cost=1.2220263481140137
+
Steps: 0%| | 3523/1000000 [8:54:54<2768:27:09, 10.00s/it, lr=1e-5, step_loss=0.0147]
Steps: 0%| | 3524/1000000 [8:55:02<2633:37:43, 9.51s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [3524], local_loss=0.02232917584478855, train_loss=0.04576539248228073, time_cost=2.8817732334136963
+
Steps: 0%| | 3524/1000000 [8:55:02<2633:37:43, 9.51s/it, lr=1e-5, step_loss=0.0223]
Steps: 0%| | 3525/1000000 [8:55:11<2600:11:49, 9.39s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [3525], local_loss=0.08620769530534744, train_loss=14.562715530395508, time_cost=3.4979989528656006
+
Steps: 0%| | 3525/1000000 [8:55:11<2600:11:49, 9.39s/it, lr=1e-5, step_loss=0.0862]
Steps: 0%| | 3526/1000000 [8:55:17<2324:46:11, 8.40s/it, lr=1e-5, step_loss=0.0862][RANK-0]: Step: [3526], local_loss=0.012987971305847168, train_loss=0.03098393976688385, time_cost=1.6198530197143555
+
Steps: 0%| | 3526/1000000 [8:55:17<2324:46:11, 8.40s/it, lr=1e-5, step_loss=0.013]
Steps: 0%| | 3527/1000000 [8:55:28<2556:00:55, 9.23s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [3527], local_loss=0.02929767221212387, train_loss=0.06447924673557281, time_cost=2.6435563564300537
+
Steps: 0%| | 3527/1000000 [8:55:28<2556:00:55, 9.23s/it, lr=1e-5, step_loss=0.0293]
Steps: 0%| | 3528/1000000 [8:55:38<2614:21:40, 9.45s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [3528], local_loss=0.016388937830924988, train_loss=0.04970888793468475, time_cost=4.501751184463501
+
Steps: 0%| | 3528/1000000 [8:55:38<2614:21:40, 9.45s/it, lr=1e-5, step_loss=0.0164]
Steps: 0%| | 3529/1000000 [8:55:48<2631:16:49, 9.51s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [3529], local_loss=0.11881048232316971, train_loss=0.07533587515354156, time_cost=3.862278699874878
+
Steps: 0%| | 3529/1000000 [8:55:48<2631:16:49, 9.51s/it, lr=1e-5, step_loss=0.119]
Steps: 0%| | 3530/1000000 [8:55:53<2271:46:47, 8.21s/it, lr=1e-5, step_loss=0.119][RANK-0]: Step: [3530], local_loss=0.029381301254034042, train_loss=0.031247053295373917, time_cost=1.5423357486724854
+
Steps: 0%| | 3530/1000000 [8:55:53<2271:46:47, 8.21s/it, lr=1e-5, step_loss=0.0294]
Steps: 0%| | 3531/1000000 [8:56:08<2784:44:07, 10.06s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [3531], local_loss=0.024757569655776024, train_loss=0.052331022918224335, time_cost=1.2455363273620605
+
Steps: 0%| | 3531/1000000 [8:56:08<2784:44:07, 10.06s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 3532/1000000 [8:56:19<2883:33:09, 10.42s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [3532], local_loss=0.018087834119796753, train_loss=0.04108111932873726, time_cost=9.449519872665405
+
Steps: 0%| | 3532/1000000 [8:56:19<2883:33:09, 10.42s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 3533/1000000 [8:56:24<2487:04:19, 8.99s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [3533], local_loss=0.131130188703537, train_loss=17.286521911621094, time_cost=2.9804840087890625
+
Steps: 0%| | 3533/1000000 [8:56:24<2487:04:19, 8.99s/it, lr=1e-5, step_loss=0.131]
Steps: 0%| | 3534/1000000 [8:56:29<2104:54:27, 7.60s/it, lr=1e-5, step_loss=0.131][RANK-0]: Step: [3534], local_loss=0.01587352156639099, train_loss=0.04209887236356735, time_cost=1.197232961654663
+
Steps: 0%| | 3534/1000000 [8:56:29<2104:54:27, 7.60s/it, lr=1e-5, step_loss=0.0159]
Steps: 0%| | 3535/1000000 [8:56:40<2388:45:03, 8.63s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [3535], local_loss=0.0371055081486702, train_loss=0.19253933429718018, time_cost=3.142096519470215
+
Steps: 0%| | 3535/1000000 [8:56:40<2388:45:03, 8.63s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 3536/1000000 [8:56:45<2110:10:20, 7.62s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [3536], local_loss=0.026240088045597076, train_loss=0.07516953349113464, time_cost=2.6276443004608154
+
Steps: 0%| | 3536/1000000 [8:56:45<2110:10:20, 7.62s/it, lr=1e-5, step_loss=0.0262]
Steps: 0%| | 3537/1000000 [8:56:54<2227:00:05, 8.05s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [3537], local_loss=0.08134736865758896, train_loss=0.08843708038330078, time_cost=1.270477294921875
+
Steps: 0%| | 3537/1000000 [8:56:54<2227:00:05, 8.05s/it, lr=1e-5, step_loss=0.0813]
Steps: 0%| | 3538/1000000 [8:57:04<2392:51:43, 8.64s/it, lr=1e-5, step_loss=0.0813][RANK-0]: Step: [3538], local_loss=0.02148994617164135, train_loss=0.01967860572040081, time_cost=1.326073169708252
+
Steps: 0%| | 3538/1000000 [8:57:04<2392:51:43, 8.64s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 3539/1000000 [8:57:13<2445:01:24, 8.83s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [3539], local_loss=0.018038146197795868, train_loss=0.02115170657634735, time_cost=1.673987865447998
+
Steps: 0%| | 3539/1000000 [8:57:13<2445:01:24, 8.83s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 3540/1000000 [8:57:25<2632:30:29, 9.51s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [3540], local_loss=0.0320809930562973, train_loss=0.037258848547935486, time_cost=9.091476678848267
+
Steps: 0%| | 3540/1000000 [8:57:25<2632:30:29, 9.51s/it, lr=1e-5, step_loss=0.0321]
Steps: 0%| | 3541/1000000 [8:57:37<2852:49:58, 10.31s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [3541], local_loss=0.047774530947208405, train_loss=0.04310692846775055, time_cost=4.895910978317261
+
Steps: 0%| | 3541/1000000 [8:57:37<2852:49:58, 10.31s/it, lr=1e-5, step_loss=0.0478]
Steps: 0%| | 3542/1000000 [8:57:47<2830:11:46, 10.22s/it, lr=1e-5, step_loss=0.0478][RANK-0]: Step: [3542], local_loss=0.02285519242286682, train_loss=0.02305571362376213, time_cost=2.492955207824707
+
Steps: 0%| | 3542/1000000 [8:57:47<2830:11:46, 10.22s/it, lr=1e-5, step_loss=0.0229]
Steps: 0%| | 3543/1000000 [8:57:56<2786:19:36, 10.07s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [3543], local_loss=0.019262054935097694, train_loss=0.03437655419111252, time_cost=1.7109761238098145
+
Steps: 0%| | 3543/1000000 [8:57:56<2786:19:36, 10.07s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 3544/1000000 [8:58:07<2837:19:24, 10.25s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [3544], local_loss=0.013578721322119236, train_loss=0.05322222784161568, time_cost=2.1575841903686523
+
Steps: 0%| | 3544/1000000 [8:58:07<2837:19:24, 10.25s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 3545/1000000 [8:58:18<2901:52:53, 10.48s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [3545], local_loss=0.02288004942238331, train_loss=0.07558770477771759, time_cost=2.0359747409820557
+
Steps: 0%| | 3545/1000000 [8:58:18<2901:52:53, 10.48s/it, lr=1e-5, step_loss=0.0229]
Steps: 0%| | 3546/1000000 [8:58:24<2519:55:58, 9.10s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [3546], local_loss=0.04064381122589111, train_loss=0.037487782537937164, time_cost=1.8488028049468994
+
Steps: 0%| | 3546/1000000 [8:58:24<2519:55:58, 9.10s/it, lr=1e-5, step_loss=0.0406]
Steps: 0%| | 3547/1000000 [8:58:33<2548:10:24, 9.21s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [3547], local_loss=0.03415950387716293, train_loss=0.04450604319572449, time_cost=1.8901939392089844
+
Steps: 0%| | 3547/1000000 [8:58:33<2548:10:24, 9.21s/it, lr=1e-5, step_loss=0.0342]
Steps: 0%| | 3548/1000000 [8:58:38<2139:47:53, 7.73s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [3548], local_loss=0.0725848600268364, train_loss=0.04480251669883728, time_cost=1.2153558731079102
+
Steps: 0%| | 3548/1000000 [8:58:38<2139:47:53, 7.73s/it, lr=1e-5, step_loss=0.0726]
Steps: 0%| | 3549/1000000 [8:58:43<1905:25:23, 6.88s/it, lr=1e-5, step_loss=0.0726][RANK-0]: Step: [3549], local_loss=0.6875330209732056, train_loss=0.11854206770658493, time_cost=1.761859655380249
+
Steps: 0%| | 3549/1000000 [8:58:43<1905:25:23, 6.88s/it, lr=1e-5, step_loss=0.688]
Steps: 0%| | 3550/1000000 [8:58:54<2262:40:11, 8.17s/it, lr=1e-5, step_loss=0.688][RANK-0]: Step: [3550], local_loss=0.4337303936481476, train_loss=0.07912065833806992, time_cost=2.515117645263672
+
Steps: 0%| | 3550/1000000 [8:58:54<2262:40:11, 8.17s/it, lr=1e-5, step_loss=0.434]
Steps: 0%| | 3551/1000000 [8:59:05<2485:02:25, 8.98s/it, lr=1e-5, step_loss=0.434][RANK-0]: Step: [3551], local_loss=0.05155806243419647, train_loss=0.06143723055720329, time_cost=3.313876152038574
+
Steps: 0%| | 3551/1000000 [8:59:05<2485:02:25, 8.98s/it, lr=1e-5, step_loss=0.0516]
Steps: 0%| | 3552/1000000 [8:59:15<2609:18:42, 9.43s/it, lr=1e-5, step_loss=0.0516][RANK-0]: Step: [3552], local_loss=0.08832982927560806, train_loss=0.043717123568058014, time_cost=2.1457998752593994
+
Steps: 0%| | 3552/1000000 [8:59:15<2609:18:42, 9.43s/it, lr=1e-5, step_loss=0.0883]
Steps: 0%| | 3553/1000000 [8:59:27<2821:12:04, 10.19s/it, lr=1e-5, step_loss=0.0883][RANK-0]: Step: [3553], local_loss=0.051030248403549194, train_loss=0.037598613649606705, time_cost=4.340456247329712
+
Steps: 0%| | 3553/1000000 [8:59:27<2821:12:04, 10.19s/it, lr=1e-5, step_loss=0.051]
Steps: 0%| | 3554/1000000 [8:59:41<3089:09:23, 11.16s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [3554], local_loss=0.017219044268131256, train_loss=0.03009922429919243, time_cost=4.0873847007751465
+
Steps: 0%| | 3554/1000000 [8:59:41<3089:09:23, 11.16s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 3555/1000000 [8:59:51<3042:26:19, 10.99s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [3555], local_loss=0.03546302020549774, train_loss=0.024746956303715706, time_cost=3.1615819931030273
+
Steps: 0%| | 3555/1000000 [8:59:51<3042:26:19, 10.99s/it, lr=1e-5, step_loss=0.0355]
Steps: 0%| | 3556/1000000 [8:59:58<2680:39:55, 9.68s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [3556], local_loss=0.01234041340649128, train_loss=0.03318022936582565, time_cost=2.0887444019317627
+
Steps: 0%| | 3556/1000000 [8:59:58<2680:39:55, 9.68s/it, lr=1e-5, step_loss=0.0123]
Steps: 0%| | 3557/1000000 [9:00:06<2523:24:38, 9.12s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [3557], local_loss=0.029693379998207092, train_loss=0.032075319439172745, time_cost=3.6845014095306396
+
Steps: 0%| | 3557/1000000 [9:00:06<2523:24:38, 9.12s/it, lr=1e-5, step_loss=0.0297]
Steps: 0%| | 3558/1000000 [9:00:17<2692:51:16, 9.73s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [3558], local_loss=0.039009299129247665, train_loss=0.043796274811029434, time_cost=1.4857139587402344
+
Steps: 0%| | 3558/1000000 [9:00:17<2692:51:16, 9.73s/it, lr=1e-5, step_loss=0.039]
Steps: 0%| | 3559/1000000 [9:00:21<2249:22:20, 8.13s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [3559], local_loss=0.011629895307123661, train_loss=0.02521331049501896, time_cost=1.2372162342071533
+
Steps: 0%| | 3559/1000000 [9:00:21<2249:22:20, 8.13s/it, lr=1e-5, step_loss=0.0116]
Steps: 0%| | 3560/1000000 [9:00:30<2325:47:00, 8.40s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [3560], local_loss=0.08186817914247513, train_loss=0.04899677634239197, time_cost=2.9335238933563232
+
Steps: 0%| | 3560/1000000 [9:00:30<2325:47:00, 8.40s/it, lr=1e-5, step_loss=0.0819]
Steps: 0%| | 3561/1000000 [9:00:41<2527:17:51, 9.13s/it, lr=1e-5, step_loss=0.0819][RANK-0]: Step: [3561], local_loss=0.01782013289630413, train_loss=0.06322077661752701, time_cost=1.2195241451263428
+
Steps: 0%| | 3561/1000000 [9:00:41<2527:17:51, 9.13s/it, lr=1e-5, step_loss=0.0178]
[training-log excerpt condensed: rank-0 tqdm progress output for steps 3562-3787 of 1000000, lr=1e-5, wall clock 9:00:47 to 9:35:22; each step is reported once below — the duplicated progress-bar refresh lines and the empty `+` diff lines have been dropped.]

Representative entry:
Steps: 0%| | 3562/1000000 [9:00:47<2255:09:42, 8.15s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [3562], local_loss=0.1281629502773285, train_loss=0.050330694764852524, time_cost=1.5796020030975342

Over this span, local_loss mostly sits in roughly 0.008-0.15, with isolated spikes near 1.0 (steps 3577, 3579, 3620, 3631, 3650, 3661, 3665, 3714), several smaller jumps (e.g. 0.28-0.61 at steps 3637, 3654, 3675, 3699, 3742), and one extreme outlier at step 3584 (local_loss=134.566, train_loss=16.841); train_loss also spikes at steps 3583 (6.889) and 3655 (5.584). Per-iteration time fluctuates between ~5.5 s/it and ~13.5 s/it, with per-step time_cost ranging from ~1.2 s to ~10.6 s.
+
Steps: 0%| | 3787/1000000 [9:35:22<3013:16:21, 10.89s/it, lr=1e-5, step_loss=0.0815]
Steps: 0%| | 3788/1000000 [9:35:36<3236:37:38, 11.70s/it, lr=1e-5, step_loss=0.0815][RANK-0]: Step: [3788], local_loss=0.04979795962572098, train_loss=0.023065952584147453, time_cost=4.204732656478882
+
Steps: 0%| | 3788/1000000 [9:35:36<3236:37:38, 11.70s/it, lr=1e-5, step_loss=0.0498]
Steps: 0%| | 3789/1000000 [9:35:41<2666:19:29, 9.64s/it, lr=1e-5, step_loss=0.0498][RANK-0]: Step: [3789], local_loss=0.01373983919620514, train_loss=0.06338281184434891, time_cost=3.84022855758667
+
Steps: 0%| | 3789/1000000 [9:35:41<2666:19:29, 9.64s/it, lr=1e-5, step_loss=0.0137]
Steps: 0%| | 3790/1000000 [9:35:52<2760:47:22, 9.98s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [3790], local_loss=0.029387060552835464, train_loss=0.1787889301776886, time_cost=2.804671049118042
+
Steps: 0%| | 3790/1000000 [9:35:52<2760:47:22, 9.98s/it, lr=1e-5, step_loss=0.0294]
Steps: 0%| | 3791/1000000 [9:35:56<2340:04:35, 8.46s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [3791], local_loss=0.24960190057754517, train_loss=0.0931587815284729, time_cost=2.009690999984741
+
Steps: 0%| | 3791/1000000 [9:35:56<2340:04:35, 8.46s/it, lr=1e-5, step_loss=0.25]
Steps: 0%| | 3792/1000000 [9:36:06<2393:09:37, 8.65s/it, lr=1e-5, step_loss=0.25][RANK-0]: Step: [3792], local_loss=0.035936154425144196, train_loss=0.029629921540617943, time_cost=3.1938531398773193
+
Steps: 0%| | 3792/1000000 [9:36:06<2393:09:37, 8.65s/it, lr=1e-5, step_loss=0.0359]
Steps: 0%| | 3793/1000000 [9:36:13<2264:39:55, 8.18s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [3793], local_loss=0.022789571434259415, train_loss=0.0365503691136837, time_cost=2.425231456756592
+
Steps: 0%| | 3793/1000000 [9:36:13<2264:39:55, 8.18s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 3794/1000000 [9:36:21<2238:02:06, 8.09s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [3794], local_loss=0.013343504630029202, train_loss=0.026985377073287964, time_cost=1.4957315921783447
+
Steps: 0%| | 3794/1000000 [9:36:21<2238:02:06, 8.09s/it, lr=1e-5, step_loss=0.0133]
Steps: 0%| | 3795/1000000 [9:36:28<2195:22:40, 7.93s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [3795], local_loss=0.16069349646568298, train_loss=0.07636886835098267, time_cost=1.545783281326294
+
Steps: 0%| | 3795/1000000 [9:36:28<2195:22:40, 7.93s/it, lr=1e-5, step_loss=0.161]
Steps: 0%| | 3796/1000000 [9:36:35<2078:51:30, 7.51s/it, lr=1e-5, step_loss=0.161][RANK-0]: Step: [3796], local_loss=0.013112233020365238, train_loss=0.04460565373301506, time_cost=3.026080846786499
+
Steps: 0%| | 3796/1000000 [9:36:35<2078:51:30, 7.51s/it, lr=1e-5, step_loss=0.0131]
Steps: 0%| | 3797/1000000 [9:36:49<2681:53:49, 9.69s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [3797], local_loss=0.11126043647527695, train_loss=8.13072681427002, time_cost=5.498655557632446
+
Steps: 0%| | 3797/1000000 [9:36:49<2681:53:49, 9.69s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 3798/1000000 [9:36:59<2686:43:24, 9.71s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [3798], local_loss=0.013961933553218842, train_loss=0.06390510499477386, time_cost=3.6956875324249268
+
Steps: 0%| | 3798/1000000 [9:36:59<2686:43:24, 9.71s/it, lr=1e-5, step_loss=0.014]
Steps: 0%| | 3799/1000000 [9:37:11<2827:32:24, 10.22s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [3799], local_loss=0.03526289761066437, train_loss=0.04257804900407791, time_cost=5.194286108016968
+
Steps: 0%| | 3799/1000000 [9:37:11<2827:32:24, 10.22s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 3800/1000000 [9:37:17<2545:00:51, 9.20s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [3800], local_loss=0.18900638818740845, train_loss=0.06286723911762238, time_cost=2.503740072250366
+
Steps: 0%| | 3800/1000000 [9:37:17<2545:00:51, 9.20s/it, lr=1e-5, step_loss=0.189]
Steps: 0%| | 3801/1000000 [9:37:25<2402:16:09, 8.68s/it, lr=1e-5, step_loss=0.189][RANK-0]: Step: [3801], local_loss=0.028488367795944214, train_loss=0.043176572769880295, time_cost=3.406649112701416
+
Steps: 0%| | 3801/1000000 [9:37:25<2402:16:09, 8.68s/it, lr=1e-5, step_loss=0.0285]
Steps: 0%| | 3802/1000000 [9:37:29<2047:35:25, 7.40s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [3802], local_loss=0.017448995262384415, train_loss=0.02895880863070488, time_cost=3.0827486515045166
+
Steps: 0%| | 3802/1000000 [9:37:29<2047:35:25, 7.40s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 3803/1000000 [9:37:35<1871:08:17, 6.76s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [3803], local_loss=0.05729418620467186, train_loss=0.29332682490348816, time_cost=2.483346700668335
+
Steps: 0%| | 3803/1000000 [9:37:35<1871:08:17, 6.76s/it, lr=1e-5, step_loss=0.0573]
Steps: 0%| | 3804/1000000 [9:37:46<2251:12:59, 8.14s/it, lr=1e-5, step_loss=0.0573][RANK-0]: Step: [3804], local_loss=0.23928265273571014, train_loss=0.12628524005413055, time_cost=2.3704111576080322
+
Steps: 0%| | 3804/1000000 [9:37:46<2251:12:59, 8.14s/it, lr=1e-5, step_loss=0.239]
Steps: 0%| | 3805/1000000 [9:37:56<2413:04:20, 8.72s/it, lr=1e-5, step_loss=0.239][RANK-0]: Step: [3805], local_loss=0.03764169663190842, train_loss=0.035199761390686035, time_cost=1.9156568050384521
+
Steps: 0%| | 3805/1000000 [9:37:56<2413:04:20, 8.72s/it, lr=1e-5, step_loss=0.0376]
Steps: 0%| | 3806/1000000 [9:38:07<2625:08:26, 9.49s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [3806], local_loss=0.0671050101518631, train_loss=0.13554011285305023, time_cost=4.129619359970093
+
Steps: 0%| | 3806/1000000 [9:38:07<2625:08:26, 9.49s/it, lr=1e-5, step_loss=0.0671]
Steps: 0%| | 3807/1000000 [9:38:24<3194:46:53, 11.55s/it, lr=1e-5, step_loss=0.0671][RANK-0]: Step: [3807], local_loss=0.014713311567902565, train_loss=0.03741812705993652, time_cost=2.262164831161499
+
Steps: 0%| | 3807/1000000 [9:38:24<3194:46:53, 11.55s/it, lr=1e-5, step_loss=0.0147]
Steps: 0%| | 3808/1000000 [9:38:29<2726:33:37, 9.85s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [3808], local_loss=0.03111492283642292, train_loss=0.04374384880065918, time_cost=1.4816148281097412
+
Steps: 0%| | 3808/1000000 [9:38:29<2726:33:37, 9.85s/it, lr=1e-5, step_loss=0.0311]
Steps: 0%| | 3809/1000000 [9:38:38<2627:33:07, 9.50s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [3809], local_loss=0.010423481464385986, train_loss=0.05220990628004074, time_cost=2.614983320236206
+
Steps: 0%| | 3809/1000000 [9:38:38<2627:33:07, 9.50s/it, lr=1e-5, step_loss=0.0104]
Steps: 0%| | 3810/1000000 [9:38:45<2395:16:16, 8.66s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [3810], local_loss=0.031923823058605194, train_loss=0.036945585161447525, time_cost=1.5696203708648682
+
Steps: 0%| | 3810/1000000 [9:38:45<2395:16:16, 8.66s/it, lr=1e-5, step_loss=0.0319]
Steps: 0%| | 3811/1000000 [9:38:56<2600:51:12, 9.40s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [3811], local_loss=0.04260842129588127, train_loss=0.07422240823507309, time_cost=1.2275736331939697
+
Steps: 0%| | 3811/1000000 [9:38:56<2600:51:12, 9.40s/it, lr=1e-5, step_loss=0.0426]
Steps: 0%| | 3812/1000000 [9:39:09<2904:05:13, 10.49s/it, lr=1e-5, step_loss=0.0426][RANK-0]: Step: [3812], local_loss=0.024701183661818504, train_loss=0.06222929060459137, time_cost=3.3706889152526855
+
Steps: 0%| | 3812/1000000 [9:39:09<2904:05:13, 10.49s/it, lr=1e-5, step_loss=0.0247]
Steps: 0%| | 3813/1000000 [9:39:18<2756:53:17, 9.96s/it, lr=1e-5, step_loss=0.0247][RANK-0]: Step: [3813], local_loss=0.029113013297319412, train_loss=0.1896292269229889, time_cost=2.635301113128662
+
Steps: 0%| | 3813/1000000 [9:39:18<2756:53:17, 9.96s/it, lr=1e-5, step_loss=0.0291]
Steps: 0%| | 3814/1000000 [9:39:33<3163:00:20, 11.43s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [3814], local_loss=0.03960815444588661, train_loss=0.04259645566344261, time_cost=5.170651912689209
+
Steps: 0%| | 3814/1000000 [9:39:33<3163:00:20, 11.43s/it, lr=1e-5, step_loss=0.0396]
Steps: 0%| | 3815/1000000 [9:39:40<2829:01:13, 10.22s/it, lr=1e-5, step_loss=0.0396][RANK-0]: Step: [3815], local_loss=0.01578686758875847, train_loss=0.03259587287902832, time_cost=1.6147255897521973
+
Steps: 0%| | 3815/1000000 [9:39:40<2829:01:13, 10.22s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 3816/1000000 [9:39:44<2338:23:16, 8.45s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [3816], local_loss=0.08096077293157578, train_loss=0.0948430746793747, time_cost=1.8027901649475098
+
Steps: 0%| | 3816/1000000 [9:39:44<2338:23:16, 8.45s/it, lr=1e-5, step_loss=0.081]
Steps: 0%| | 3817/1000000 [9:39:54<2399:44:26, 8.67s/it, lr=1e-5, step_loss=0.081][RANK-0]: Step: [3817], local_loss=0.017243074253201485, train_loss=0.03022472932934761, time_cost=2.1673450469970703
+
Steps: 0%| | 3817/1000000 [9:39:54<2399:44:26, 8.67s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 3818/1000000 [9:40:05<2598:26:56, 9.39s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [3818], local_loss=0.02007574774324894, train_loss=0.06899909675121307, time_cost=7.727483510971069
+
Steps: 0%| | 3818/1000000 [9:40:05<2598:26:56, 9.39s/it, lr=1e-5, step_loss=0.0201]
Steps: 0%| | 3819/1000000 [9:40:16<2757:13:17, 9.96s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [3819], local_loss=0.03231125324964523, train_loss=0.026580480858683586, time_cost=3.462569236755371
+
Steps: 0%| | 3819/1000000 [9:40:16<2757:13:17, 9.96s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 3820/1000000 [9:40:23<2495:06:44, 9.02s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [3820], local_loss=0.06353086233139038, train_loss=0.07586531341075897, time_cost=2.42931866645813
+
Steps: 0%| | 3820/1000000 [9:40:23<2495:06:44, 9.02s/it, lr=1e-5, step_loss=0.0635]
Steps: 0%| | 3821/1000000 [9:40:29<2231:08:02, 8.06s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [3821], local_loss=0.011205310933291912, train_loss=0.03182751685380936, time_cost=1.9095396995544434
+
Steps: 0%| | 3821/1000000 [9:40:29<2231:08:02, 8.06s/it, lr=1e-5, step_loss=0.0112]
Steps: 0%| | 3822/1000000 [9:40:41<2612:46:39, 9.44s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [3822], local_loss=0.011076743714511395, train_loss=0.027059968560934067, time_cost=4.849165678024292
+
Steps: 0%| | 3822/1000000 [9:40:41<2612:46:39, 9.44s/it, lr=1e-5, step_loss=0.0111]
Steps: 0%| | 3823/1000000 [9:40:55<2974:30:45, 10.75s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [3823], local_loss=0.041681427508592606, train_loss=0.07217782735824585, time_cost=4.878587961196899
+
Steps: 0%| | 3823/1000000 [9:40:55<2974:30:45, 10.75s/it, lr=1e-5, step_loss=0.0417]
Steps: 0%| | 3824/1000000 [9:41:07<3055:48:18, 11.04s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [3824], local_loss=0.11227727681398392, train_loss=0.052923619747161865, time_cost=3.8997445106506348
+
Steps: 0%| | 3824/1000000 [9:41:07<3055:48:18, 11.04s/it, lr=1e-5, step_loss=0.112]
Steps: 0%| | 3825/1000000 [9:41:12<2579:23:42, 9.32s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [3825], local_loss=0.018564963713288307, train_loss=0.1460554599761963, time_cost=1.6700091361999512
+
Steps: 0%| | 3825/1000000 [9:41:12<2579:23:42, 9.32s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 3826/1000000 [9:41:19<2392:50:35, 8.65s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [3826], local_loss=0.011566612869501114, train_loss=0.034475259482860565, time_cost=5.668655157089233
+
Steps: 0%| | 3826/1000000 [9:41:19<2392:50:35, 8.65s/it, lr=1e-5, step_loss=0.0116]
Steps: 0%| | 3827/1000000 [9:41:25<2152:10:06, 7.78s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [3827], local_loss=0.017473869025707245, train_loss=0.04486723244190216, time_cost=3.309898853302002
+
Steps: 0%| | 3827/1000000 [9:41:25<2152:10:06, 7.78s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 3828/1000000 [9:41:36<2439:55:38, 8.82s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [3828], local_loss=0.043261103332042694, train_loss=0.0934523344039917, time_cost=1.9608473777770996
+
Steps: 0%| | 3828/1000000 [9:41:36<2439:55:38, 8.82s/it, lr=1e-5, step_loss=0.0433]
Steps: 0%| | 3829/1000000 [9:41:43<2307:16:04, 8.34s/it, lr=1e-5, step_loss=0.0433][RANK-0]: Step: [3829], local_loss=0.013355745933949947, train_loss=0.023424429818987846, time_cost=1.7104103565216064
+
Steps: 0%| | 3829/1000000 [9:41:43<2307:16:04, 8.34s/it, lr=1e-5, step_loss=0.0134]
Steps: 0%| | 3830/1000000 [9:41:59<2890:59:50, 10.45s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [3830], local_loss=0.04614488035440445, train_loss=0.05423417687416077, time_cost=7.624083518981934
+
Steps: 0%| | 3830/1000000 [9:41:59<2890:59:50, 10.45s/it, lr=1e-5, step_loss=0.0461]
Steps: 0%| | 3831/1000000 [9:42:10<2945:38:09, 10.65s/it, lr=1e-5, step_loss=0.0461][RANK-0]: Step: [3831], local_loss=0.031843382865190506, train_loss=0.07040838897228241, time_cost=3.441885232925415
+
Steps: 0%| | 3831/1000000 [9:42:10<2945:38:09, 10.65s/it, lr=1e-5, step_loss=0.0318]
Steps: 0%| | 3832/1000000 [9:42:21<2950:59:48, 10.66s/it, lr=1e-5, step_loss=0.0318][RANK-0]: Step: [3832], local_loss=0.012137477286159992, train_loss=0.04009640961885452, time_cost=3.705669403076172
+
Steps: 0%| | 3832/1000000 [9:42:21<2950:59:48, 10.66s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 3833/1000000 [9:42:27<2641:47:51, 9.55s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [3833], local_loss=0.0421304851770401, train_loss=0.035820167511701584, time_cost=3.1041972637176514
+
Steps: 0%| | 3833/1000000 [9:42:27<2641:47:51, 9.55s/it, lr=1e-5, step_loss=0.0421]
Steps: 0%| | 3834/1000000 [9:42:37<2659:55:47, 9.61s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [3834], local_loss=0.015915334224700928, train_loss=0.08319562673568726, time_cost=3.8145899772644043
+
Steps: 0%| | 3834/1000000 [9:42:37<2659:55:47, 9.61s/it, lr=1e-5, step_loss=0.0159]
Steps: 0%| | 3835/1000000 [9:42:44<2449:51:17, 8.85s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [3835], local_loss=0.01070672832429409, train_loss=0.05678970366716385, time_cost=1.253871202468872
+
Steps: 0%| | 3835/1000000 [9:42:44<2449:51:17, 8.85s/it, lr=1e-5, step_loss=0.0107]
Steps: 0%| | 3836/1000000 [9:42:52<2336:34:08, 8.44s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [3836], local_loss=0.06725900620222092, train_loss=0.042396847158670425, time_cost=1.4050710201263428
+
Steps: 0%| | 3836/1000000 [9:42:52<2336:34:08, 8.44s/it, lr=1e-5, step_loss=0.0673]
Steps: 0%| | 3837/1000000 [9:42:58<2121:56:51, 7.67s/it, lr=1e-5, step_loss=0.0673][RANK-0]: Step: [3837], local_loss=0.03224271908402443, train_loss=0.024619143456220627, time_cost=1.4668927192687988
+
Steps: 0%| | 3837/1000000 [9:42:58<2121:56:51, 7.67s/it, lr=1e-5, step_loss=0.0322]
Steps: 0%| | 3838/1000000 [9:43:08<2386:53:29, 8.63s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [3838], local_loss=0.032425280660390854, train_loss=0.06623650342226028, time_cost=3.231565475463867
+
Steps: 0%| | 3838/1000000 [9:43:08<2386:53:29, 8.63s/it, lr=1e-5, step_loss=0.0324]
Steps: 0%| | 3839/1000000 [9:43:17<2402:47:21, 8.68s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [3839], local_loss=0.03802609071135521, train_loss=0.08262205123901367, time_cost=2.88761305809021
+
Steps: 0%| | 3839/1000000 [9:43:17<2402:47:21, 8.68s/it, lr=1e-5, step_loss=0.038]
Steps: 0%| | 3840/1000000 [9:43:28<2549:46:33, 9.21s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [3840], local_loss=0.050038233399391174, train_loss=0.08262352645397186, time_cost=5.147456407546997
+
Steps: 0%| | 3840/1000000 [9:43:28<2549:46:33, 9.21s/it, lr=1e-5, step_loss=0.05]
Steps: 0%| | 3841/1000000 [9:43:37<2526:33:02, 9.13s/it, lr=1e-5, step_loss=0.05][RANK-0]: Step: [3841], local_loss=0.04409908875823021, train_loss=0.04693370312452316, time_cost=2.854383945465088
+
Steps: 0%| | 3841/1000000 [9:43:37<2526:33:02, 9.13s/it, lr=1e-5, step_loss=0.0441]
Steps: 0%| | 3842/1000000 [9:43:45<2494:08:43, 9.01s/it, lr=1e-5, step_loss=0.0441][RANK-0]: Step: [3842], local_loss=0.21331414580345154, train_loss=0.06125880405306816, time_cost=2.8965649604797363
+
Steps: 0%| | 3842/1000000 [9:43:45<2494:08:43, 9.01s/it, lr=1e-5, step_loss=0.213]
Steps: 0%| | 3843/1000000 [9:43:52<2331:09:24, 8.42s/it, lr=1e-5, step_loss=0.213][RANK-0]: Step: [3843], local_loss=0.10548041015863419, train_loss=0.03942546620965004, time_cost=1.4099605083465576
+
Steps: 0%| | 3843/1000000 [9:43:52<2331:09:24, 8.42s/it, lr=1e-5, step_loss=0.105]
Steps: 0%| | 3844/1000000 [9:43:57<2043:40:40, 7.39s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [3844], local_loss=0.019379109144210815, train_loss=0.037008266896009445, time_cost=1.7809967994689941
+
Steps: 0%| | 3844/1000000 [9:43:57<2043:40:40, 7.39s/it, lr=1e-5, step_loss=0.0194]
Steps: 0%| | 3845/1000000 [9:44:03<1862:26:54, 6.73s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [3845], local_loss=56.2648811340332, train_loss=7.08853006362915, time_cost=2.023113965988159
+
Steps: 0%| | 3845/1000000 [9:44:03<1862:26:54, 6.73s/it, lr=1e-5, step_loss=56.3]
Steps: 0%| | 3846/1000000 [9:44:18<2596:12:48, 9.38s/it, lr=1e-5, step_loss=56.3][RANK-0]: Step: [3846], local_loss=83.35948181152344, train_loss=10.460919380187988, time_cost=12.662963390350342
+
Steps: 0%| | 3846/1000000 [9:44:18<2596:12:48, 9.38s/it, lr=1e-5, step_loss=83.4]
Steps: 0%| | 3847/1000000 [9:44:23<2182:37:22, 7.89s/it, lr=1e-5, step_loss=83.4][RANK-0]: Step: [3847], local_loss=0.0843786746263504, train_loss=0.0815972164273262, time_cost=1.9085757732391357
+
Steps: 0%| | 3847/1000000 [9:44:23<2182:37:22, 7.89s/it, lr=1e-5, step_loss=0.0844]
Steps: 0%| | 3848/1000000 [9:44:37<2753:52:42, 9.95s/it, lr=1e-5, step_loss=0.0844][RANK-0]: Step: [3848], local_loss=0.017887240275740623, train_loss=0.08085082471370697, time_cost=5.660665988922119
+
Steps: 0%| | 3848/1000000 [9:44:37<2753:52:42, 9.95s/it, lr=1e-5, step_loss=0.0179]
Steps: 0%| | 3849/1000000 [9:44:46<2635:36:42, 9.52s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [3849], local_loss=0.02178753912448883, train_loss=0.0247202031314373, time_cost=2.388493537902832
+
Steps: 0%| | 3849/1000000 [9:44:46<2635:36:42, 9.52s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 3850/1000000 [9:44:58<2886:51:08, 10.43s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [3850], local_loss=0.016911273822188377, train_loss=0.0347171425819397, time_cost=3.5252466201782227
+
Steps: 0%| | 3850/1000000 [9:44:58<2886:51:08, 10.43s/it, lr=1e-5, step_loss=0.0169]
Steps: 0%| | 3851/1000000 [9:45:03<2424:08:04, 8.76s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [3851], local_loss=0.028977904468774796, train_loss=0.08010561764240265, time_cost=2.152945041656494
+
Steps: 0%| | 3851/1000000 [9:45:03<2424:08:04, 8.76s/it, lr=1e-5, step_loss=0.029]
Steps: 0%| | 3852/1000000 [9:45:15<2639:46:16, 9.54s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [3852], local_loss=0.01643414795398712, train_loss=0.12372918426990509, time_cost=1.770843505859375
+
Steps: 0%| | 3852/1000000 [9:45:15<2639:46:16, 9.54s/it, lr=1e-5, step_loss=0.0164]
Steps: 0%| | 3853/1000000 [9:45:28<2980:20:46, 10.77s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [3853], local_loss=0.023400284349918365, train_loss=4.691117286682129, time_cost=4.570939779281616
+
Steps: 0%| | 3853/1000000 [9:45:28<2980:20:46, 10.77s/it, lr=1e-5, step_loss=0.0234]
Steps: 0%| | 3854/1000000 [9:45:41<3105:46:29, 11.22s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [3854], local_loss=0.06363072246313095, train_loss=0.07342445850372314, time_cost=2.4092884063720703
+
Steps: 0%| | 3854/1000000 [9:45:41<3105:46:29, 11.22s/it, lr=1e-5, step_loss=0.0636]
Steps: 0%| | 3855/1000000 [9:45:51<3053:31:38, 11.04s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [3855], local_loss=0.028326071798801422, train_loss=0.03404458612203598, time_cost=1.7776391506195068
+
Steps: 0%| | 3855/1000000 [9:45:51<3053:31:38, 11.04s/it, lr=1e-5, step_loss=0.0283]
Steps: 0%| | 3856/1000000 [9:46:06<3351:15:55, 12.11s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [3856], local_loss=0.029662448912858963, train_loss=0.14707832038402557, time_cost=5.504828453063965
+
Steps: 0%| | 3856/1000000 [9:46:06<3351:15:55, 12.11s/it, lr=1e-5, step_loss=0.0297]
Steps: 0%| | 3857/1000000 [9:46:17<3278:39:49, 11.85s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [3857], local_loss=0.01796431839466095, train_loss=0.034500397741794586, time_cost=1.9047999382019043
+
Steps: 0%| | 3857/1000000 [9:46:17<3278:39:49, 11.85s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 3858/1000000 [9:46:26<3029:53:30, 10.95s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [3858], local_loss=0.046782270073890686, train_loss=0.050222352147102356, time_cost=2.3739914894104004
+
Steps: 0%| | 3858/1000000 [9:46:26<3029:53:30, 10.95s/it, lr=1e-5, step_loss=0.0468]
Steps: 0%| | 3859/1000000 [9:46:31<2530:55:19, 9.15s/it, lr=1e-5, step_loss=0.0468][RANK-0]: Step: [3859], local_loss=0.012861285358667374, train_loss=0.03787253797054291, time_cost=1.8947324752807617
+
Steps: 0%| | 3859/1000000 [9:46:31<2530:55:19, 9.15s/it, lr=1e-5, step_loss=0.0129]
Steps: 0%| | 3860/1000000 [9:46:38<2372:06:52, 8.57s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [3860], local_loss=0.26270678639411926, train_loss=0.05748854950070381, time_cost=1.5670957565307617
+
Steps: 0%| | 3860/1000000 [9:46:38<2372:06:52, 8.57s/it, lr=1e-5, step_loss=0.263]
Steps: 0%| | 3861/1000000 [9:46:49<2593:30:05, 9.37s/it, lr=1e-5, step_loss=0.263][RANK-0]: Step: [3861], local_loss=0.04968617856502533, train_loss=0.032416924834251404, time_cost=5.187297821044922
+
Steps: 0%| | 3861/1000000 [9:46:49<2593:30:05, 9.37s/it, lr=1e-5, step_loss=0.0497]
Steps: 0%| | 3862/1000000 [9:47:05<3154:23:19, 11.40s/it, lr=1e-5, step_loss=0.0497][RANK-0]: Step: [3862], local_loss=0.05278700217604637, train_loss=0.024918409064412117, time_cost=7.304707765579224
+
Steps: 0%| | 3862/1000000 [9:47:05<3154:23:19, 11.40s/it, lr=1e-5, step_loss=0.0528]
Steps: 0%| | 3863/1000000 [9:47:12<2778:56:33, 10.04s/it, lr=1e-5, step_loss=0.0528][RANK-0]: Step: [3863], local_loss=0.02981831133365631, train_loss=0.08822167664766312, time_cost=1.3503286838531494
+
Steps: 0%| | 3863/1000000 [9:47:12<2778:56:33, 10.04s/it, lr=1e-5, step_loss=0.0298]
Steps: 0%| | 3864/1000000 [9:47:18<2437:42:01, 8.81s/it, lr=1e-5, step_loss=0.0298][RANK-0]: Step: [3864], local_loss=0.026522936299443245, train_loss=0.050383731722831726, time_cost=1.500744342803955
+
Steps: 0%| | 3864/1000000 [9:47:18<2437:42:01, 8.81s/it, lr=1e-5, step_loss=0.0265]
Steps: 0%| | 3865/1000000 [9:47:31<2735:28:00, 9.89s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [3865], local_loss=0.011232269927859306, train_loss=0.1581668108701706, time_cost=5.518108606338501
+
Steps: 0%| | 3865/1000000 [9:47:31<2735:28:00, 9.89s/it, lr=1e-5, step_loss=0.0112]
Steps: 0%| | 3866/1000000 [9:47:38<2500:49:27, 9.04s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [3866], local_loss=0.03533672168850899, train_loss=0.027532922104001045, time_cost=1.2129206657409668
+
Steps: 0%| | 3866/1000000 [9:47:38<2500:49:27, 9.04s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 3867/1000000 [9:47:47<2507:55:12, 9.06s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [3867], local_loss=0.0645744800567627, train_loss=0.04308057203888893, time_cost=1.2850618362426758
+
Steps: 0%| | 3867/1000000 [9:47:47<2507:55:12, 9.06s/it, lr=1e-5, step_loss=0.0646]
Steps: 0%| | 3868/1000000 [9:47:57<2581:16:21, 9.33s/it, lr=1e-5, step_loss=0.0646][RANK-0]: Step: [3868], local_loss=0.027487313374876976, train_loss=0.07726562768220901, time_cost=3.8907084465026855
+
Steps: 0%| | 3868/1000000 [9:47:57<2581:16:21, 9.33s/it, lr=1e-5, step_loss=0.0275]
Steps: 0%| | 3869/1000000 [9:48:07<2680:42:42, 9.69s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [3869], local_loss=0.018071018159389496, train_loss=0.10567794740200043, time_cost=2.5402591228485107
+
Steps: 0%| | 3869/1000000 [9:48:07<2680:42:42, 9.69s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 3870/1000000 [9:48:12<2247:13:07, 8.12s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [3870], local_loss=0.021121134981513023, train_loss=0.15043054521083832, time_cost=1.7617566585540771
+
Steps: 0%| | 3870/1000000 [9:48:12<2247:13:07, 8.12s/it, lr=1e-5, step_loss=0.0211]
Steps: 0%| | 3871/1000000 [9:48:25<2633:38:50, 9.52s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [3871], local_loss=0.009568829089403152, train_loss=0.03471404314041138, time_cost=4.76004695892334
+
Steps: 0%| | 3871/1000000 [9:48:25<2633:38:50, 9.52s/it, lr=1e-5, step_loss=0.00957]
Steps: 0%| | 3872/1000000 [9:48:32<2486:20:49, 8.99s/it, lr=1e-5, step_loss=0.00957][RANK-0]: Step: [3872], local_loss=0.023869208991527557, train_loss=0.05252739042043686, time_cost=1.2199196815490723
+
Steps: 0%| | 3872/1000000 [9:48:32<2486:20:49, 8.99s/it, lr=1e-5, step_loss=0.0239]
Steps: 0%| | 3873/1000000 [9:48:37<2169:00:58, 7.84s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [3873], local_loss=0.045376479625701904, train_loss=0.1249479353427887, time_cost=1.38089919090271
+
Steps: 0%| | 3873/1000000 [9:48:37<2169:00:58, 7.84s/it, lr=1e-5, step_loss=0.0454]
Steps: 0%| | 3874/1000000 [9:48:47<2303:30:48, 8.32s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [3874], local_loss=0.008427994325757027, train_loss=0.03809845820069313, time_cost=3.709486722946167
+
Steps: 0%| | 3874/1000000 [9:48:47<2303:30:48, 8.32s/it, lr=1e-5, step_loss=0.00843]
Steps: 0%| | 3875/1000000 [9:48:58<2519:31:26, 9.11s/it, lr=1e-5, step_loss=0.00843][RANK-0]: Step: [3875], local_loss=0.028935417532920837, train_loss=0.038695551455020905, time_cost=1.3275177478790283
+
Steps: 0%| | 3875/1000000 [9:48:58<2519:31:26, 9.11s/it, lr=1e-5, step_loss=0.0289]
Steps: 0%| | 3876/1000000 [9:49:03<2178:18:35, 7.87s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [3876], local_loss=0.01971384510397911, train_loss=0.025749925523996353, time_cost=1.540400505065918
+
Steps: 0%| | 3876/1000000 [9:49:03<2178:18:35, 7.87s/it, lr=1e-5, step_loss=0.0197]
Steps: 0%| | 3877/1000000 [9:49:10<2111:29:32, 7.63s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [3877], local_loss=0.020483069121837616, train_loss=0.035997793078422546, time_cost=1.2292978763580322
+
Steps: 0%| | 3877/1000000 [9:49:10<2111:29:32, 7.63s/it, lr=1e-5, step_loss=0.0205]
Steps: 0%| | 3878/1000000 [9:49:14<1827:08:19, 6.60s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [3878], local_loss=0.016086703166365623, train_loss=0.03804005682468414, time_cost=1.4121167659759521
+
Steps: 0%| | 3878/1000000 [9:49:14<1827:08:19, 6.60s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 3879/1000000 [9:49:21<1884:05:14, 6.81s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [3879], local_loss=0.017679039388895035, train_loss=0.024632493034005165, time_cost=4.475239038467407
+
Steps: 0%| | 3879/1000000 [9:49:21<1884:05:14, 6.81s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 3880/1000000 [9:49:31<2154:21:37, 7.79s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [3880], local_loss=0.01877407357096672, train_loss=0.053583044558763504, time_cost=4.499675750732422
+
Steps: 0%| | 3880/1000000 [9:49:31<2154:21:37, 7.79s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 3881/1000000 [9:49:41<2326:34:39, 8.41s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [3881], local_loss=0.015207668766379356, train_loss=0.18083399534225464, time_cost=1.3932170867919922
+
Steps: 0%| | 3881/1000000 [9:49:41<2326:34:39, 8.41s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 3882/1000000 [9:49:47<2129:24:51, 7.70s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [3882], local_loss=0.04668399691581726, train_loss=0.03910082206130028, time_cost=1.1976978778839111
+
Steps: 0%| | 3882/1000000 [9:49:47<2129:24:51, 7.70s/it, lr=1e-5, step_loss=0.0467]
Steps: 0%| | 3883/1000000 [9:49:54<2061:55:15, 7.45s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [3883], local_loss=0.0265605840831995, train_loss=0.05350761115550995, time_cost=1.3531556129455566
+
Steps: 0%| | 3883/1000000 [9:49:54<2061:55:15, 7.45s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 3884/1000000 [9:49:58<1793:36:27, 6.48s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [3884], local_loss=0.039997000247240067, train_loss=0.0760982483625412, time_cost=1.3751225471496582
+
Steps: 0%| | 3884/1000000 [9:49:58<1793:36:27, 6.48s/it, lr=1e-5, step_loss=0.04]
Steps: 0%| | 3885/1000000 [9:50:12<2385:25:24, 8.62s/it, lr=1e-5, step_loss=0.04][RANK-0]: Step: [3885], local_loss=0.03640254586935043, train_loss=0.03129421919584274, time_cost=5.342226266860962
+
Steps: 0%| | 3885/1000000 [9:50:12<2385:25:24, 8.62s/it, lr=1e-5, step_loss=0.0364]
Steps: 0%| | 3886/1000000 [9:50:26<2813:08:19, 10.17s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [3886], local_loss=0.05290835723280907, train_loss=0.02403993532061577, time_cost=1.2351696491241455
+
Steps: 0%| | 3886/1000000 [9:50:26<2813:08:19, 10.17s/it, lr=1e-5, step_loss=0.0529]
Steps: 0%| | 3887/1000000 [9:50:33<2563:30:29, 9.26s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [3887], local_loss=0.013424885459244251, train_loss=0.07282812893390656, time_cost=5.386416912078857
+
Steps: 0%| | 3887/1000000 [9:50:33<2563:30:29, 9.26s/it, lr=1e-5, step_loss=0.0134]
Steps: 0%| | 3888/1000000 [9:50:44<2685:25:22, 9.71s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [3888], local_loss=0.07210379838943481, train_loss=0.048080772161483765, time_cost=5.723632335662842
+
Steps: 0%| | 3888/1000000 [9:50:44<2685:25:22, 9.71s/it, lr=1e-5, step_loss=0.0721]
Steps: 0%| | 3889/1000000 [9:50:50<2363:08:16, 8.54s/it, lr=1e-5, step_loss=0.0721][RANK-0]: Step: [3889], local_loss=0.04421710968017578, train_loss=0.043443888425827026, time_cost=1.6050357818603516
+
Steps: 0%| | 3889/1000000 [9:50:50<2363:08:16, 8.54s/it, lr=1e-5, step_loss=0.0442]
Steps: 0%| | 3890/1000000 [9:51:05<2909:15:41, 10.51s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [3890], local_loss=0.026140859350562096, train_loss=0.11478649079799652, time_cost=3.141751527786255
+
Steps: 0%| | 3890/1000000 [9:51:05<2909:15:41, 10.51s/it, lr=1e-5, step_loss=0.0261]
Steps: 0%| | 3891/1000000 [9:51:14<2788:49:38, 10.08s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [3891], local_loss=0.009879152290523052, train_loss=0.03698486089706421, time_cost=3.7129616737365723
+
Steps: 0%| | 3891/1000000 [9:51:14<2788:49:38, 10.08s/it, lr=1e-5, step_loss=0.00988]
Steps: 0%| | 3892/1000000 [9:51:19<2419:19:09, 8.74s/it, lr=1e-5, step_loss=0.00988][RANK-0]: Step: [3892], local_loss=0.04403278976678848, train_loss=0.038969166576862335, time_cost=3.5935158729553223
+
Steps: 0%| | 3892/1000000 [9:51:19<2419:19:09, 8.74s/it, lr=1e-5, step_loss=0.044]
Steps: 0%| | 3893/1000000 [9:51:30<2593:33:43, 9.37s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [3893], local_loss=0.054195381700992584, train_loss=0.05784621462225914, time_cost=1.3246829509735107
+
Steps: 0%| | 3893/1000000 [9:51:30<2593:33:43, 9.37s/it, lr=1e-5, step_loss=0.0542]
Steps: 0%| | 3894/1000000 [9:51:45<3042:52:13, 11.00s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [3894], local_loss=0.015747224912047386, train_loss=0.1477796584367752, time_cost=7.361023664474487
+
Steps: 0%| | 3894/1000000 [9:51:45<3042:52:13, 11.00s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 3895/1000000 [9:51:56<3063:38:34, 11.07s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [3895], local_loss=0.029579848051071167, train_loss=0.035008855164051056, time_cost=3.520582675933838
+
Steps: 0%| | 3895/1000000 [9:51:56<3063:38:34, 11.07s/it, lr=1e-5, step_loss=0.0296]
Steps: 0%| | 3896/1000000 [9:52:06<2918:58:32, 10.55s/it, lr=1e-5, step_loss=0.0296][RANK-0]: Step: [3896], local_loss=0.02522343024611473, train_loss=0.03753069043159485, time_cost=4.122686862945557
+
Steps: 0%| | 3896/1000000 [9:52:06<2918:58:32, 10.55s/it, lr=1e-5, step_loss=0.0252]
Steps: 0%| | 3897/1000000 [9:52:16<2897:28:35, 10.47s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [3897], local_loss=1.002977728843689, train_loss=0.16106157004833221, time_cost=3.799367904663086
+
Steps: 0%| | 3897/1000000 [9:52:16<2897:28:35, 10.47s/it, lr=1e-5, step_loss=1]
Steps: 0%| | 3898/1000000 [9:52:22<2568:16:05, 9.28s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [3898], local_loss=0.01701975427567959, train_loss=0.024680502712726593, time_cost=1.7925560474395752
+
Steps: 0%| | 3898/1000000 [9:52:22<2568:16:05, 9.28s/it, lr=1e-5, step_loss=0.017]
Steps: 0%| | 3899/1000000 [9:52:28<2256:42:25, 8.16s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [3899], local_loss=0.016808871179819107, train_loss=0.07188285142183304, time_cost=1.6633379459381104
+
Steps: 0%| | 3899/1000000 [9:52:28<2256:42:25, 8.16s/it, lr=1e-5, step_loss=0.0168]
Steps: 0%| | 3900/1000000 [9:52:37<2294:06:00, 8.29s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [3900], local_loss=0.05901738628745079, train_loss=0.07709901034832001, time_cost=2.7314324378967285
+
Steps: 0%| | 3900/1000000 [9:52:37<2294:06:00, 8.29s/it, lr=1e-5, step_loss=0.059]
Steps: 0%| | 3901/1000000 [9:52:44<2205:45:26, 7.97s/it, lr=1e-5, step_loss=0.059][RANK-0]: Step: [3901], local_loss=0.020564306527376175, train_loss=0.1531803160905838, time_cost=3.680405855178833
+
Steps: 0%| | 3901/1000000 [9:52:44<2205:45:26, 7.97s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 3902/1000000 [9:53:01<2975:09:17, 10.75s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [3902], local_loss=0.0495433583855629, train_loss=0.029249301180243492, time_cost=6.285080432891846
+
Steps: 0%| | 3902/1000000 [9:53:01<2975:09:17, 10.75s/it, lr=1e-5, step_loss=0.0495]
Steps: 0%| | 3903/1000000 [9:53:12<2993:02:17, 10.82s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [3903], local_loss=0.026866314932703972, train_loss=0.041725218296051025, time_cost=2.3167834281921387
+
Steps: 0%| | 3903/1000000 [9:53:12<2993:02:17, 10.82s/it, lr=1e-5, step_loss=0.0269]
Steps: 0%| | 3904/1000000 [9:53:16<2459:40:51, 8.89s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [3904], local_loss=0.032829657196998596, train_loss=0.02649242803454399, time_cost=1.5795049667358398
+
Steps: 0%| | 3904/1000000 [9:53:16<2459:40:51, 8.89s/it, lr=1e-5, step_loss=0.0328]
Steps: 0%| | 3905/1000000 [9:53:21<2141:08:41, 7.74s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [3905], local_loss=0.0442768856883049, train_loss=0.030211899429559708, time_cost=1.242297649383545
+
Steps: 0%| | 3905/1000000 [9:53:21<2141:08:41, 7.74s/it, lr=1e-5, step_loss=0.0443]
Steps: 0%| | 3906/1000000 [9:53:34<2572:00:24, 9.30s/it, lr=1e-5, step_loss=0.0443][RANK-0]: Step: [3906], local_loss=0.3317054808139801, train_loss=0.10738292336463928, time_cost=5.982814073562622
+
Steps: 0%| | 3906/1000000 [9:53:34<2572:00:24, 9.30s/it, lr=1e-5, step_loss=0.332]
Steps: 0%| | 3907/1000000 [9:53:47<2885:57:14, 10.43s/it, lr=1e-5, step_loss=0.332][RANK-0]: Step: [3907], local_loss=0.05947403982281685, train_loss=0.06747519969940186, time_cost=1.1993494033813477
+
Steps: 0%| | 3907/1000000 [9:53:47<2885:57:14, 10.43s/it, lr=1e-5, step_loss=0.0595]
Steps: 0%| | 3908/1000000 [9:54:02<3219:21:15, 11.64s/it, lr=1e-5, step_loss=0.0595][RANK-0]: Step: [3908], local_loss=0.02425490878522396, train_loss=0.04413807392120361, time_cost=6.270791053771973
+
Steps: 0%| | 3908/1000000 [9:54:02<3219:21:15, 11.64s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 3909/1000000 [9:54:07<2692:12:41, 9.73s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [3909], local_loss=170.08372497558594, train_loss=21.374479293823242, time_cost=1.286097764968872
+
Steps: 0%| | 3909/1000000 [9:54:07<2692:12:41, 9.73s/it, lr=1e-5, step_loss=170]
Steps: 0%| | 3910/1000000 [9:54:12<2307:10:33, 8.34s/it, lr=1e-5, step_loss=170][RANK-0]: Step: [3910], local_loss=0.08350696414709091, train_loss=0.06716243922710419, time_cost=1.6035714149475098
+
Steps: 0%| | 3910/1000000 [9:54:12<2307:10:33, 8.34s/it, lr=1e-5, step_loss=0.0835]
Steps: 0%| | 3911/1000000 [9:54:22<2404:23:20, 8.69s/it, lr=1e-5, step_loss=0.0835][RANK-0]: Step: [3911], local_loss=0.014430802315473557, train_loss=27.480026245117188, time_cost=4.03513765335083
+
Steps: 0%| | 3911/1000000 [9:54:22<2404:23:20, 8.69s/it, lr=1e-5, step_loss=0.0144]
Steps: 0%| | 3912/1000000 [9:54:30<2359:37:59, 8.53s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [3912], local_loss=0.027472801506519318, train_loss=0.0749410092830658, time_cost=1.1936285495758057
+
Steps: 0%| | 3912/1000000 [9:54:30<2359:37:59, 8.53s/it, lr=1e-5, step_loss=0.0275]
Steps: 0%| | 3913/1000000 [9:54:35<2054:19:53, 7.42s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [3913], local_loss=0.04724874719977379, train_loss=0.03821180760860443, time_cost=3.928713321685791
+
Steps: 0%| | 3913/1000000 [9:54:35<2054:19:53, 7.42s/it, lr=1e-5, step_loss=0.0472]
Steps: 0%| | 3914/1000000 [9:54:44<2210:13:10, 7.99s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [3914], local_loss=0.03714238852262497, train_loss=0.0588027685880661, time_cost=4.044056177139282
+
Steps: 0%| | 3914/1000000 [9:54:44<2210:13:10, 7.99s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 3915/1000000 [9:54:51<2142:18:54, 7.74s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [3915], local_loss=0.03302124887704849, train_loss=0.029798805713653564, time_cost=1.5292420387268066
+
Steps: 0%| | 3915/1000000 [9:54:51<2142:18:54, 7.74s/it, lr=1e-5, step_loss=0.033]
Steps: 0%| | 3916/1000000 [9:55:04<2581:03:48, 9.33s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [3916], local_loss=0.03317385911941528, train_loss=0.026088882237672806, time_cost=3.3438048362731934
+
Steps: 0%| | 3916/1000000 [9:55:04<2581:03:48, 9.33s/it, lr=1e-5, step_loss=0.0332]
Steps: 0%| | 3917/1000000 [9:55:16<2749:27:52, 9.94s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [3917], local_loss=0.021444130688905716, train_loss=0.022970501333475113, time_cost=5.156508922576904
+
Steps: 0%| | 3917/1000000 [9:55:16<2749:27:52, 9.94s/it, lr=1e-5, step_loss=0.0214]
Steps: 0%| | 3918/1000000 [9:55:24<2642:46:46, 9.55s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [3918], local_loss=0.037078771740198135, train_loss=0.15758761763572693, time_cost=2.4812493324279785
+
Steps: 0%| | 3918/1000000 [9:55:24<2642:46:46, 9.55s/it, lr=1e-5, step_loss=0.0371]
Steps: 0%| | 3919/1000000 [9:55:30<2336:53:59, 8.45s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [3919], local_loss=0.1954055279493332, train_loss=0.12335138767957687, time_cost=2.3018100261688232
+
Steps: 0%| | 3919/1000000 [9:55:30<2336:53:59, 8.45s/it, lr=1e-5, step_loss=0.195]
Steps: 0%| | 3920/1000000 [9:55:44<2825:04:29, 10.21s/it, lr=1e-5, step_loss=0.195][RANK-0]: Step: [3920], local_loss=0.030623331665992737, train_loss=0.03176290914416313, time_cost=1.2244555950164795
+
Steps: 0%| | 3920/1000000 [9:55:44<2825:04:29, 10.21s/it, lr=1e-5, step_loss=0.0306]
Steps: 0%| | 3921/1000000 [9:55:49<2343:59:49, 8.47s/it, lr=1e-5, step_loss=0.0306][RANK-0]: Step: [3921], local_loss=0.019384562969207764, train_loss=0.031834423542022705, time_cost=1.429140567779541
+
Steps: 0%| | 3921/1000000 [9:55:49<2343:59:49, 8.47s/it, lr=1e-5, step_loss=0.0194]
Steps: 0%| | 3922/1000000 [9:56:02<2746:41:07, 9.93s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [3922], local_loss=0.04313618317246437, train_loss=0.03791755810379982, time_cost=7.091221809387207
+
Steps: 0%| | 3922/1000000 [9:56:02<2746:41:07, 9.93s/it, lr=1e-5, step_loss=0.0431]
Steps: 0%| | 3923/1000000 [9:56:07<2325:28:08, 8.40s/it, lr=1e-5, step_loss=0.0431][RANK-0]: Step: [3923], local_loss=0.019552361220121384, train_loss=0.02167864516377449, time_cost=1.9524040222167969
+
Steps: 0%| | 3923/1000000 [9:56:07<2325:28:08, 8.40s/it, lr=1e-5, step_loss=0.0196]
Steps: 0%| | 3924/1000000 [9:56:22<2850:17:54, 10.30s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [3924], local_loss=0.042729489505290985, train_loss=0.01937464252114296, time_cost=5.761716365814209
+
Steps: 0%| | 3924/1000000 [9:56:22<2850:17:54, 10.30s/it, lr=1e-5, step_loss=0.0427]
Steps: 0%| | 3925/1000000 [9:56:36<3156:22:59, 11.41s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [3925], local_loss=0.021877631545066833, train_loss=0.0330381840467453, time_cost=4.23815393447876
+
Steps: 0%| | 3925/1000000 [9:56:36<3156:22:59, 11.41s/it, lr=1e-5, step_loss=0.0219]
Steps: 0%| | 3926/1000000 [9:56:47<3173:40:28, 11.47s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [3926], local_loss=0.02063414640724659, train_loss=0.07628189772367477, time_cost=1.4828872680664062
+
Steps: 0%| | 3926/1000000 [9:56:47<3173:40:28, 11.47s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 3927/1000000 [9:57:00<3282:22:55, 11.86s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [3927], local_loss=0.060465872287750244, train_loss=0.047592684626579285, time_cost=6.034202814102173
+
Steps: 0%| | 3927/1000000 [9:57:00<3282:22:55, 11.86s/it, lr=1e-5, step_loss=0.0605]
Steps: 0%| | 3928/1000000 [9:57:15<3509:55:33, 12.69s/it, lr=1e-5, step_loss=0.0605][RANK-0]: Step: [3928], local_loss=0.02525300160050392, train_loss=0.12001249194145203, time_cost=1.1959586143493652
+
Steps: 0%| | 3928/1000000 [9:57:15<3509:55:33, 12.69s/it, lr=1e-5, step_loss=0.0253]
Steps: 0%| | 3929/1000000 [9:57:23<3118:36:52, 11.27s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [3929], local_loss=0.09753426164388657, train_loss=0.04497789964079857, time_cost=4.411903619766235
+
Steps: 0%| | 3929/1000000 [9:57:23<3118:36:52, 11.27s/it, lr=1e-5, step_loss=0.0975]
Steps: 0%| | 3930/1000000 [9:57:32<2947:00:53, 10.65s/it, lr=1e-5, step_loss=0.0975][RANK-0]: Step: [3930], local_loss=0.027133625000715256, train_loss=0.022613275796175003, time_cost=2.899202585220337
+
Steps: 0%| | 3930/1000000 [9:57:32<2947:00:53, 10.65s/it, lr=1e-5, step_loss=0.0271]
Steps: 0%| | 3931/1000000 [9:57:37<2460:20:23, 8.89s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [3931], local_loss=0.06442857533693314, train_loss=0.16649682819843292, time_cost=1.7175242900848389
+
Steps: 0%| | 3931/1000000 [9:57:37<2460:20:23, 8.89s/it, lr=1e-5, step_loss=0.0644]
Steps: 0%| | 3932/1000000 [9:57:49<2777:38:10, 10.04s/it, lr=1e-5, step_loss=0.0644][RANK-0]: Step: [3932], local_loss=0.056593187153339386, train_loss=0.03817182406783104, time_cost=3.0978708267211914
+
Steps: 0%| | 3932/1000000 [9:57:49<2777:38:10, 10.04s/it, lr=1e-5, step_loss=0.0566]
Steps: 0%| | 3933/1000000 [9:58:05<3200:50:34, 11.57s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [3933], local_loss=0.015574774704873562, train_loss=0.0705970898270607, time_cost=5.527877330780029
+
Steps: 0%| | 3933/1000000 [9:58:05<3200:50:34, 11.57s/it, lr=1e-5, step_loss=0.0156]
Steps: 0%| | 3934/1000000 [9:58:10<2725:14:51, 9.85s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [3934], local_loss=0.011747203767299652, train_loss=0.028132902458310127, time_cost=1.2861642837524414
+
Steps: 0%| | 3934/1000000 [9:58:10<2725:14:51, 9.85s/it, lr=1e-5, step_loss=0.0117]
Steps: 0%| | 3935/1000000 [9:58:18<2506:09:49, 9.06s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [3935], local_loss=0.3645175099372864, train_loss=0.07260752469301224, time_cost=2.004204750061035
+
Steps: 0%| | 3935/1000000 [9:58:18<2506:09:49, 9.06s/it, lr=1e-5, step_loss=0.365]
Steps: 0%| | 3936/1000000 [9:58:23<2232:40:49, 8.07s/it, lr=1e-5, step_loss=0.365][RANK-0]: Step: [3936], local_loss=0.00952690839767456, train_loss=0.05170666426420212, time_cost=2.0079691410064697
+
Steps: 0%| | 3936/1000000 [9:58:23<2232:40:49, 8.07s/it, lr=1e-5, step_loss=0.00953]
Steps: 0%| | 3937/1000000 [9:58:38<2764:42:43, 9.99s/it, lr=1e-5, step_loss=0.00953][RANK-0]: Step: [3937], local_loss=0.06739982962608337, train_loss=0.05945366993546486, time_cost=1.4944887161254883
+
Steps: 0%| | 3937/1000000 [9:58:38<2764:42:43, 9.99s/it, lr=1e-5, step_loss=0.0674]
Steps: 0%| | 3938/1000000 [9:58:47<2727:50:30, 9.86s/it, lr=1e-5, step_loss=0.0674][RANK-0]: Step: [3938], local_loss=0.03532915189862251, train_loss=0.02013571932911873, time_cost=4.018989324569702
+
Steps: 0%| | 3938/1000000 [9:58:47<2727:50:30, 9.86s/it, lr=1e-5, step_loss=0.0353]
Steps: 0%| | 3939/1000000 [9:58:58<2789:12:04, 10.08s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [3939], local_loss=0.030292989686131477, train_loss=0.06405377388000488, time_cost=1.383194923400879
+
Steps: 0%| | 3939/1000000 [9:58:58<2789:12:04, 10.08s/it, lr=1e-5, step_loss=0.0303]
Steps: 0%| | 3940/1000000 [9:59:07<2694:41:26, 9.74s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [3940], local_loss=0.01571953482925892, train_loss=0.14541323482990265, time_cost=1.225311040878296
+
Steps: 0%| | 3940/1000000 [9:59:07<2694:41:26, 9.74s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 3941/1000000 [9:59:20<2994:07:47, 10.82s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [3941], local_loss=0.023490862920880318, train_loss=0.05235546454787254, time_cost=7.167342185974121
+
Steps: 0%| | 3941/1000000 [9:59:20<2994:07:47, 10.82s/it, lr=1e-5, step_loss=0.0235]
Steps: 0%| | 3942/1000000 [9:59:33<3188:12:51, 11.52s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [3942], local_loss=0.019746016710996628, train_loss=0.03154093399643898, time_cost=2.3413596153259277
+
Steps: 0%| | 3942/1000000 [9:59:33<3188:12:51, 11.52s/it, lr=1e-5, step_loss=0.0197]
Steps: 0%| | 3943/1000000 [9:59:41<2882:08:34, 10.42s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [3943], local_loss=0.015312757343053818, train_loss=0.058724358677864075, time_cost=2.835796594619751
+
Steps: 0%| | 3943/1000000 [9:59:41<2882:08:34, 10.42s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 3944/1000000 [9:59:47<2451:49:28, 8.86s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [3944], local_loss=1.002902865409851, train_loss=0.17263288795948029, time_cost=2.0326194763183594
+
Steps: 0%| | 3944/1000000 [9:59:47<2451:49:28, 8.86s/it, lr=1e-5, step_loss=1]
Steps: 0%| | 3945/1000000 [9:59:55<2415:20:26, 8.73s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [3945], local_loss=0.02166610024869442, train_loss=0.03835923969745636, time_cost=1.530543565750122
+
Steps: 0%| | 3945/1000000 [9:59:55<2415:20:26, 8.73s/it, lr=1e-5, step_loss=0.0217]
Steps: 0%| | 3946/1000000 [10:00:12<3124:22:25, 11.29s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [3946], local_loss=0.013985587283968925, train_loss=0.03807312250137329, time_cost=7.776430368423462
+
Steps: 0%| | 3946/1000000 [10:00:12<3124:22:25, 11.29s/it, lr=1e-5, step_loss=0.014]
Steps: 0%| | 3947/1000000 [10:00:23<3078:24:25, 11.13s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [3947], local_loss=0.029394280165433884, train_loss=0.037681594491004944, time_cost=1.2826097011566162
+
Steps: 0%| | 3947/1000000 [10:00:23<3078:24:25, 11.13s/it, lr=1e-5, step_loss=0.0294]
Steps: 0%| | 3948/1000000 [10:00:36<3259:43:08, 11.78s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [3948], local_loss=0.2801055312156677, train_loss=0.06569678336381912, time_cost=1.8565385341644287
+
Steps: 0%| | 3948/1000000 [10:00:36<3259:43:08, 11.78s/it, lr=1e-5, step_loss=0.28]
Steps: 0%| | 3949/1000000 [10:00:45<2993:24:53, 10.82s/it, lr=1e-5, step_loss=0.28][RANK-0]: Step: [3949], local_loss=0.027344416826963425, train_loss=0.022187761962413788, time_cost=2.4875547885894775
+
Steps: 0%| | 3949/1000000 [10:00:45<2993:24:53, 10.82s/it, lr=1e-5, step_loss=0.0273]
Steps: 0%| | 3950/1000000 [10:00:56<3020:42:59, 10.92s/it, lr=1e-5, step_loss=0.0273][RANK-0]: Step: [3950], local_loss=0.10204983502626419, train_loss=18.209543228149414, time_cost=7.2414772510528564
+
Steps: 0%| | 3950/1000000 [10:00:56<3020:42:59, 10.92s/it, lr=1e-5, step_loss=0.102]
Steps: 0%| | 3951/1000000 [10:01:03<2656:15:41, 9.60s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [3951], local_loss=0.0574154406785965, train_loss=0.0667310506105423, time_cost=1.29459547996521
+
Steps: 0%| | 3951/1000000 [10:01:03<2656:15:41, 9.60s/it, lr=1e-5, step_loss=0.0574]
Steps: 0%| | 3952/1000000 [10:01:18<3152:53:54, 11.40s/it, lr=1e-5, step_loss=0.0574][RANK-0]: Step: [3952], local_loss=0.06816643476486206, train_loss=0.1251668483018875, time_cost=4.918278455734253
+
Steps: 0%| | 3952/1000000 [10:01:18<3152:53:54, 11.40s/it, lr=1e-5, step_loss=0.0682]
Steps: 0%| | 3953/1000000 [10:01:31<3248:42:57, 11.74s/it, lr=1e-5, step_loss=0.0682][RANK-0]: Step: [3953], local_loss=0.014275372959673405, train_loss=0.02539299614727497, time_cost=2.856123685836792
+
Steps: 0%| | 3953/1000000 [10:01:31<3248:42:57, 11.74s/it, lr=1e-5, step_loss=0.0143]
Steps: 0%| | 3954/1000000 [10:01:36<2712:06:47, 9.80s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [3954], local_loss=0.019458774477243423, train_loss=0.060534801334142685, time_cost=2.5451531410217285
+
Steps: 0%| | 3954/1000000 [10:01:36<2712:06:47, 9.80s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 3955/1000000 [10:01:41<2352:43:24, 8.50s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [3955], local_loss=0.03780599310994148, train_loss=0.03610420972108841, time_cost=2.128840446472168
+
Steps: 0%| | 3955/1000000 [10:01:41<2352:43:24, 8.50s/it, lr=1e-5, step_loss=0.0378]
Steps: 0%| | 3956/1000000 [10:01:49<2299:57:15, 8.31s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [3956], local_loss=0.07823318988084793, train_loss=0.062113672494888306, time_cost=2.176403522491455
+
Steps: 0%| | 3956/1000000 [10:01:49<2299:57:15, 8.31s/it, lr=1e-5, step_loss=0.0782]
Steps: 0%| | 3957/1000000 [10:02:00<2475:54:27, 8.95s/it, lr=1e-5, step_loss=0.0782][RANK-0]: Step: [3957], local_loss=0.15154927968978882, train_loss=0.13702940940856934, time_cost=1.327481985092163
+
Steps: 0%| | 3957/1000000 [10:02:00<2475:54:27, 8.95s/it, lr=1e-5, step_loss=0.152]
Steps: 0%| | 3958/1000000 [10:02:05<2141:51:49, 7.74s/it, lr=1e-5, step_loss=0.152][RANK-0]: Step: [3958], local_loss=0.055861905217170715, train_loss=0.03624168038368225, time_cost=3.4843251705169678
+
Steps: 0%| | 3958/1000000 [10:02:05<2141:51:49, 7.74s/it, lr=1e-5, step_loss=0.0559]
Steps: 0%| | 3959/1000000 [10:02:10<1907:46:10, 6.90s/it, lr=1e-5, step_loss=0.0559][RANK-0]: Step: [3959], local_loss=0.0395747609436512, train_loss=0.050268545746803284, time_cost=1.4140663146972656
+
Steps: 0%| | 3959/1000000 [10:02:10<1907:46:10, 6.90s/it, lr=1e-5, step_loss=0.0396]
Steps: 0%| | 3960/1000000 [10:02:24<2570:13:52, 9.29s/it, lr=1e-5, step_loss=0.0396][RANK-0]: Step: [3960], local_loss=0.997175931930542, train_loss=0.2085498869419098, time_cost=1.2902159690856934
+
Steps: 0%| | 3960/1000000 [10:02:24<2570:13:52, 9.29s/it, lr=1e-5, step_loss=0.997]
Steps: 0%| | 3961/1000000 [10:02:33<2476:15:29, 8.95s/it, lr=1e-5, step_loss=0.997][RANK-0]: Step: [3961], local_loss=0.011490664444863796, train_loss=0.09886062890291214, time_cost=4.027676582336426
+
Steps: 0%| | 3961/1000000 [10:02:33<2476:15:29, 8.95s/it, lr=1e-5, step_loss=0.0115]
Steps: 0%| | 3962/1000000 [10:02:40<2359:06:25, 8.53s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [3962], local_loss=0.15213823318481445, train_loss=0.0423005074262619, time_cost=2.6717000007629395
+
Steps: 0%| | 3962/1000000 [10:02:40<2359:06:25, 8.53s/it, lr=1e-5, step_loss=0.152]
Steps: 0%| | 3963/1000000 [10:02:45<2017:10:51, 7.29s/it, lr=1e-5, step_loss=0.152][RANK-0]: Step: [3963], local_loss=0.07932939380407333, train_loss=0.17918196320533752, time_cost=1.6585745811462402
+
Steps: 0%| | 3963/1000000 [10:02:45<2017:10:51, 7.29s/it, lr=1e-5, step_loss=0.0793]
Steps: 0%| | 3964/1000000 [10:02:58<2521:40:37, 9.11s/it, lr=1e-5, step_loss=0.0793][RANK-0]: Step: [3964], local_loss=0.038098812103271484, train_loss=0.02837667241692543, time_cost=4.054982423782349
+
Steps: 0%| | 3964/1000000 [10:02:58<2521:40:37, 9.11s/it, lr=1e-5, step_loss=0.0381]
Steps: 0%| | 3965/1000000 [10:03:04<2261:36:03, 8.17s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [3965], local_loss=0.023532571271061897, train_loss=0.03851241618394852, time_cost=1.8049366474151611
+
Steps: 0%| | 3965/1000000 [10:03:04<2261:36:03, 8.17s/it, lr=1e-5, step_loss=0.0235]
Steps: 0%| | 3966/1000000 [10:03:19<2822:18:50, 10.20s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [3966], local_loss=0.0865749940276146, train_loss=0.0458250492811203, time_cost=5.196375608444214
+
Steps: 0%| | 3966/1000000 [10:03:19<2822:18:50, 10.20s/it, lr=1e-5, step_loss=0.0866]
Steps: 0%| | 3967/1000000 [10:03:33<3116:09:05, 11.26s/it, lr=1e-5, step_loss=0.0866][RANK-0]: Step: [3967], local_loss=0.05047112703323364, train_loss=0.02666446566581726, time_cost=1.929450511932373
+
Steps: 0%| | 3967/1000000 [10:03:33<3116:09:05, 11.26s/it, lr=1e-5, step_loss=0.0505]
Steps: 0%| | 3968/1000000 [10:03:48<3428:02:43, 12.39s/it, lr=1e-5, step_loss=0.0505][RANK-0]: Step: [3968], local_loss=0.03003104217350483, train_loss=0.06245695799589157, time_cost=1.8169972896575928
+
Steps: 0%| | 3968/1000000 [10:03:48<3428:02:43, 12.39s/it, lr=1e-5, step_loss=0.03]
Steps: 0%| | 3969/1000000 [10:04:00<3421:27:18, 12.37s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [3969], local_loss=0.05156894773244858, train_loss=0.034615449607372284, time_cost=3.297579288482666
+
Steps: 0%| | 3969/1000000 [10:04:00<3421:27:18, 12.37s/it, lr=1e-5, step_loss=0.0516]
Steps: 0%| | 3970/1000000 [10:04:16<3769:22:18, 13.62s/it, lr=1e-5, step_loss=0.0516][RANK-0]: Step: [3970], local_loss=0.08856263011693954, train_loss=0.04008825868368149, time_cost=1.2333669662475586
+
Steps: 0%| | 3970/1000000 [10:04:16<3769:22:18, 13.62s/it, lr=1e-5, step_loss=0.0886]
Steps: 0%| | 3971/1000000 [10:04:25<3352:18:11, 12.12s/it, lr=1e-5, step_loss=0.0886][RANK-0]: Step: [3971], local_loss=0.0247611403465271, train_loss=0.03842349350452423, time_cost=1.233100175857544
+
Steps: 0%| | 3971/1000000 [10:04:25<3352:18:11, 12.12s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 3972/1000000 [10:04:37<3358:06:10, 12.14s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [3972], local_loss=0.04118683934211731, train_loss=0.052815936505794525, time_cost=1.2518055438995361
+
Steps: 0%| | 3972/1000000 [10:04:37<3358:06:10, 12.14s/it, lr=1e-5, step_loss=0.0412]
Steps: 0%| | 3973/1000000 [10:04:51<3517:23:39, 12.71s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [3973], local_loss=0.015866795554757118, train_loss=0.1040562316775322, time_cost=10.322067499160767
+
Steps: 0%| | 3973/1000000 [10:04:51<3517:23:39, 12.71s/it, lr=1e-5, step_loss=0.0159]
Steps: 0%| | 3974/1000000 [10:04:57<2907:09:41, 10.51s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [3974], local_loss=0.039282310754060745, train_loss=0.07027000188827515, time_cost=1.4605138301849365
+
Steps: 0%| | 3974/1000000 [10:04:57<2907:09:41, 10.51s/it, lr=1e-5, step_loss=0.0393]
Steps: 0%| | 3975/1000000 [10:05:12<3281:12:31, 11.86s/it, lr=1e-5, step_loss=0.0393][RANK-0]: Step: [3975], local_loss=0.05671808868646622, train_loss=0.039921656250953674, time_cost=5.784392833709717
+
Steps: 0%| | 3975/1000000 [10:05:12<3281:12:31, 11.86s/it, lr=1e-5, step_loss=0.0567]
Steps: 0%| | 3976/1000000 [10:05:17<2769:22:04, 10.01s/it, lr=1e-5, step_loss=0.0567][RANK-0]: Step: [3976], local_loss=0.02074233815073967, train_loss=0.02057129517197609, time_cost=4.062875270843506
+
Steps: 0%| | 3976/1000000 [10:05:17<2769:22:04, 10.01s/it, lr=1e-5, step_loss=0.0207]
Steps: 0%| | 3977/1000000 [10:05:23<2429:16:21, 8.78s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [3977], local_loss=0.047814905643463135, train_loss=0.08776413649320602, time_cost=2.383781909942627
+
Steps: 0%| | 3977/1000000 [10:05:23<2429:16:21, 8.78s/it, lr=1e-5, step_loss=0.0478]
Steps: 0%| | 3978/1000000 [10:05:29<2164:50:51, 7.82s/it, lr=1e-5, step_loss=0.0478][RANK-0]: Step: [3978], local_loss=0.14519080519676208, train_loss=0.09438836574554443, time_cost=2.312150239944458
+
Steps: 0%| | 3978/1000000 [10:05:29<2164:50:51, 7.82s/it, lr=1e-5, step_loss=0.145]
Steps: 0%| | 3979/1000000 [10:05:40<2447:19:17, 8.85s/it, lr=1e-5, step_loss=0.145][RANK-0]: Step: [3979], local_loss=0.027931515127420425, train_loss=0.030547678470611572, time_cost=8.072515726089478
+
Steps: 0%| | 3979/1000000 [10:05:40<2447:19:17, 8.85s/it, lr=1e-5, step_loss=0.0279]
Steps: 0%| | 3980/1000000 [10:05:47<2284:58:08, 8.26s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [3980], local_loss=0.02452521026134491, train_loss=0.028779122978448868, time_cost=1.6211471557617188
+
Steps: 0%| | 3980/1000000 [10:05:47<2284:58:08, 8.26s/it, lr=1e-5, step_loss=0.0245]
Steps: 0%| | 3981/1000000 [10:06:03<2905:00:51, 10.50s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [3981], local_loss=0.03654684126377106, train_loss=0.18722616136074066, time_cost=5.773394823074341
+
Steps: 0%| | 3981/1000000 [10:06:03<2905:00:51, 10.50s/it, lr=1e-5, step_loss=0.0365]
Steps: 0%| | 3982/1000000 [10:06:14<2999:08:05, 10.84s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [3982], local_loss=0.01488539855927229, train_loss=0.030781246721744537, time_cost=1.5824763774871826
+
Steps: 0%| | 3982/1000000 [10:06:14<2999:08:05, 10.84s/it, lr=1e-5, step_loss=0.0149]
Steps: 0%| | 3983/1000000 [10:06:30<3373:52:06, 12.19s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [3983], local_loss=0.09143222123384476, train_loss=0.1762618124485016, time_cost=5.38094687461853
+
Steps: 0%| | 3983/1000000 [10:06:30<3373:52:06, 12.19s/it, lr=1e-5, step_loss=0.0914]
Steps: 0%| | 3984/1000000 [10:06:37<2994:07:49, 10.82s/it, lr=1e-5, step_loss=0.0914][RANK-0]: Step: [3984], local_loss=0.008284782990813255, train_loss=0.03594940900802612, time_cost=6.074482440948486
+
Steps: 0%| | 3984/1000000 [10:06:37<2994:07:49, 10.82s/it, lr=1e-5, step_loss=0.00828]
Steps: 0%| | 3985/1000000 [10:06:48<2997:03:18, 10.83s/it, lr=1e-5, step_loss=0.00828][RANK-0]: Step: [3985], local_loss=0.021682772785425186, train_loss=0.0577915757894516, time_cost=3.750645399093628
+
Steps: 0%| | 3985/1000000 [10:06:48<2997:03:18, 10.83s/it, lr=1e-5, step_loss=0.0217]
Steps: 0%| | 3986/1000000 [10:07:00<3066:46:35, 11.08s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [3986], local_loss=0.019822722300887108, train_loss=0.039494045078754425, time_cost=1.8299403190612793
+
Steps: 0%| | 3986/1000000 [10:07:00<3066:46:35, 11.08s/it, lr=1e-5, step_loss=0.0198]
Steps: 0%| | 3987/1000000 [10:07:12<3152:09:37, 11.39s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [3987], local_loss=0.013391059823334217, train_loss=0.027677178382873535, time_cost=1.903247356414795
+
Steps: 0%| | 3987/1000000 [10:07:12<3152:09:37, 11.39s/it, lr=1e-5, step_loss=0.0134]
Steps: 0%| | 3988/1000000 [10:07:25<3314:00:21, 11.98s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [3988], local_loss=0.06472611427307129, train_loss=0.03434021398425102, time_cost=8.301687002182007
+
Steps: 0%| | 3988/1000000 [10:07:25<3314:00:21, 11.98s/it, lr=1e-5, step_loss=0.0647]
Steps: 0%| | 3989/1000000 [10:07:31<2822:55:59, 10.20s/it, lr=1e-5, step_loss=0.0647][RANK-0]: Step: [3989], local_loss=0.08879214525222778, train_loss=0.04182089492678642, time_cost=2.62624192237854
+
Steps: 0%| | 3989/1000000 [10:07:31<2822:55:59, 10.20s/it, lr=1e-5, step_loss=0.0888]
Steps: 0%| | 3990/1000000 [10:07:40<2701:25:42, 9.76s/it, lr=1e-5, step_loss=0.0888][RANK-0]: Step: [3990], local_loss=0.4136771559715271, train_loss=0.08269481360912323, time_cost=5.314979553222656
+
Steps: 0%| | 3990/1000000 [10:07:40<2701:25:42, 9.76s/it, lr=1e-5, step_loss=0.414]
Steps: 0%| | 3991/1000000 [10:07:55<3163:36:53, 11.43s/it, lr=1e-5, step_loss=0.414][RANK-0]: Step: [3991], local_loss=0.017237050458788872, train_loss=0.045218925923109055, time_cost=5.446666717529297
+
Steps: 0%| | 3991/1000000 [10:07:55<3163:36:53, 11.43s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 3992/1000000 [10:08:05<2972:14:01, 10.74s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [3992], local_loss=0.020432911813259125, train_loss=0.03575994074344635, time_cost=3.406848192214966
+
Steps: 0%| | 3992/1000000 [10:08:05<2972:14:01, 10.74s/it, lr=1e-5, step_loss=0.0204]
Steps: 0%| | 3993/1000000 [10:08:11<2585:24:26, 9.34s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [3993], local_loss=0.0584520548582077, train_loss=0.08757735788822174, time_cost=1.3257007598876953
+
Steps: 0%| | 3993/1000000 [10:08:11<2585:24:26, 9.34s/it, lr=1e-5, step_loss=0.0585]
Steps: 0%| | 3994/1000000 [10:08:25<3027:57:44, 10.94s/it, lr=1e-5, step_loss=0.0585][RANK-0]: Step: [3994], local_loss=0.012110617943108082, train_loss=1.608410120010376, time_cost=5.7374677658081055
+
Steps: 0%| | 3994/1000000 [10:08:25<3027:57:44, 10.94s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 3995/1000000 [10:08:34<2826:03:43, 10.21s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [3995], local_loss=0.03408148139715195, train_loss=0.029561514034867287, time_cost=1.9850172996520996
+
Steps: 0%| | 3995/1000000 [10:08:34<2826:03:43, 10.21s/it, lr=1e-5, step_loss=0.0341]
Steps: 0%| | 3996/1000000 [10:08:39<2411:19:47, 8.72s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [3996], local_loss=0.06349892169237137, train_loss=0.10080477595329285, time_cost=1.491039752960205
+
Steps: 0%| | 3996/1000000 [10:08:39<2411:19:47, 8.72s/it, lr=1e-5, step_loss=0.0635]
Steps: 0%| | 3997/1000000 [10:08:45<2190:19:01, 7.92s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [3997], local_loss=0.12750521302223206, train_loss=0.05495183914899826, time_cost=2.959040403366089
+
Steps: 0%| | 3997/1000000 [10:08:45<2190:19:01, 7.92s/it, lr=1e-5, step_loss=0.128]
Steps: 0%| | 3998/1000000 [10:08:56<2417:46:14, 8.74s/it, lr=1e-5, step_loss=0.128][RANK-0]: Step: [3998], local_loss=0.029510006308555603, train_loss=0.07199268788099289, time_cost=1.6510062217712402
+
Steps: 0%| | 3998/1000000 [10:08:56<2417:46:14, 8.74s/it, lr=1e-5, step_loss=0.0295]
Steps: 0%| | 3999/1000000 [10:09:12<3019:53:03, 10.92s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [3999], local_loss=0.0404476672410965, train_loss=0.042545825242996216, time_cost=1.2047510147094727
+
Steps: 0%| | 3999/1000000 [10:09:12<3019:53:03, 10.92s/it, lr=1e-5, step_loss=0.0404]
Steps: 0%| | 4000/1000000 [10:09:17<2554:59:50, 9.23s/it, lr=1e-5, step_loss=0.0404][RANK-0]: Step: [4000], local_loss=0.017095739021897316, train_loss=0.022040044888854027, time_cost=1.219771385192871
+09/19/2024 09:19:10 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000
+09/19/2024 09:19:10 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 09:19:10,207] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-19 09:19:10,238] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 09:19:10,238] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 09:19:33,535] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 09:19:33,546] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 09:20:05,162] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:05,163] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:05,163] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:07,704] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:07,704] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:07,704] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:08,859] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:08,859] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:08,859] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:09,031] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:09,031] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:09,031] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:09,658] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:09,658] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:09,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:09,712] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:09,757] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:09,757] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:09,866] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:09,866] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:09,866] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 09:20:09,950] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 09:20:09,950] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 09:20:09,950] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 09:20:09 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/model/diffusion_pytorch_model.safetensors
+09/19/2024 09:21:34 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/scheduler.bin
+09/19/2024 09:21:34 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/sampler.bin
+09/19/2024 09:21:34 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000/random_states_0.pkl
+09/19/2024 09:21:34 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-4000
+
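
The `Saving current state to .../checkpoint-4000` block above is the signature of `Accelerator.save_state`: one call that writes the DeepSpeed-wrapped model, the per-rank ZeRO optimizer shards (`bf16_zero_pp_rank_*_optim_states.pt`), the scheduler, the dataloader sampler, and the RNG states listed in the log. A minimal sketch of how such a periodic save is typically wired into the loop; `args.checkpointing_steps` and `args.output_dir` are illustrative names, not taken from this repository:

```python
# Sketch only, assuming Hugging Face Accelerate with a DeepSpeed plugin.
import os
from accelerate import Accelerator

accelerator = Accelerator()

def maybe_save_state(global_step: int, args) -> None:
    # Every `checkpointing_steps` optimizer steps, checkpoint the full
    # training state under output_dir/checkpoint-<step>.
    if global_step % args.checkpointing_steps == 0:
        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
        # Saves model, per-rank ZeRO optimizer shards, LR scheduler,
        # sampler, and RNG states, matching the files in the log above.
        accelerator.save_state(save_path)
        accelerator.print(f"Saved state to {save_path}")
```

The `model/` and `model_ema/` directories holding `diffusion_pytorch_model.safetensors` are most likely produced by a save-state pre-hook registered via `accelerator.register_save_state_pre_hook`, which `save_state` invokes alongside the DeepSpeed checkpoint; that hook is an assumption based on common diffusers training scripts.
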
Steps: 0%| | 4000/1000000 [10:11:42<2554:59:50, 9.23s/it, lr=1e-5, step_loss=0.0171]
Steps: 0%| | 4001/1000000 [10:11:48<14344:15:13, 51.85s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [4001], local_loss=0.04916660487651825, train_loss=0.07064324617385864, time_cost=1.2419893741607666
+
Steps: 0%| | 4001/1000000 [10:11:48<14344:15:13, 51.85s/it, lr=1e-5, step_loss=0.0492]
Steps: 0%| | 4002/1000000 [10:11:54<10502:28:55, 37.96s/it, lr=1e-5, step_loss=0.0492][RANK-0]: Step: [4002], local_loss=0.0173378586769104, train_loss=0.0532962828874588, time_cost=1.5719661712646484
+
Steps: 0%| | 4002/1000000 [10:11:54<10502:28:55, 37.96s/it, lr=1e-5, step_loss=0.0173]
Steps: 0%| | 4003/1000000 [10:12:07<8440:11:30, 30.51s/it, lr=1e-5, step_loss=0.0173] [RANK-0]: Step: [4003], local_loss=0.0224454328417778, train_loss=0.05386589094996452, time_cost=4.071990728378296
+
Steps: 0%| | 4003/1000000 [10:12:07<8440:11:30, 30.51s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 4004/1000000 [10:12:21<7078:13:10, 25.58s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [4004], local_loss=0.024345330893993378, train_loss=0.0355398952960968, time_cost=10.809370517730713
+
Steps: 0%| | 4004/1000000 [10:12:21<7078:13:10, 25.58s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 4005/1000000 [10:12:34<6021:07:34, 21.76s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [4005], local_loss=0.01705807074904442, train_loss=0.030989987775683403, time_cost=5.3130457401275635
+
Steps: 0%| | 4005/1000000 [10:12:34<6021:07:34, 21.76s/it, lr=1e-5, step_loss=0.0171]
Steps: 0%| | 4006/1000000 [10:12:49<5502:51:51, 19.89s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [4006], local_loss=0.04466567188501358, train_loss=0.06069561839103699, time_cost=7.509719133377075
+
Steps: 0%| | 4006/1000000 [10:12:49<5502:51:51, 19.89s/it, lr=1e-5, step_loss=0.0447]
Steps: 0%| | 4007/1000000 [10:13:06<5246:28:43, 18.96s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [4007], local_loss=0.019553061574697495, train_loss=0.05096573382616043, time_cost=2.5959293842315674
+
Steps: 0%| | 4007/1000000 [10:13:06<5246:28:43, 18.96s/it, lr=1e-5, step_loss=0.0196]
Steps: 0%| | 4008/1000000 [10:13:12<4182:00:43, 15.12s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [4008], local_loss=0.015343185514211655, train_loss=0.1512937694787979, time_cost=1.4915196895599365
+
Steps: 0%| | 4008/1000000 [10:13:12<4182:00:43, 15.12s/it, lr=1e-5, step_loss=0.0153]
Steps: 0%| | 4009/1000000 [10:13:27<4153:41:35, 15.01s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [4009], local_loss=0.8370193839073181, train_loss=0.12909841537475586, time_cost=5.261274337768555
+
Steps: 0%| | 4009/1000000 [10:13:27<4153:41:35, 15.01s/it, lr=1e-5, step_loss=0.837]
Steps: 0%| | 4010/1000000 [10:13:32<3337:41:13, 12.06s/it, lr=1e-5, step_loss=0.837][RANK-0]: Step: [4010], local_loss=0.03917820751667023, train_loss=0.03215816989541054, time_cost=4.252053260803223
+
Steps: 0%| | 4010/1000000 [10:13:32<3337:41:13, 12.06s/it, lr=1e-5, step_loss=0.0392]
Steps: 0%| | 4011/1000000 [10:13:41<3057:15:35, 11.05s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [4011], local_loss=0.12868382036685944, train_loss=0.14300240576267242, time_cost=1.4164819717407227
+
Steps: 0%| | 4011/1000000 [10:13:41<3057:15:35, 11.05s/it, lr=1e-5, step_loss=0.129]
Steps: 0%| | 4012/1000000 [10:13:48<2745:58:13, 9.93s/it, lr=1e-5, step_loss=0.129][RANK-0]: Step: [4012], local_loss=0.10085806250572205, train_loss=0.07144127786159515, time_cost=2.7139034271240234
+
Steps: 0%| | 4012/1000000 [10:13:48<2745:58:13, 9.93s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 4013/1000000 [10:14:00<2920:49:31, 10.56s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [4013], local_loss=0.03513080254197121, train_loss=0.03821183741092682, time_cost=1.2483937740325928
+
Steps: 0%| | 4013/1000000 [10:14:00<2920:49:31, 10.56s/it, lr=1e-5, step_loss=0.0351]
Steps: 0%| | 4014/1000000 [10:14:07<2581:05:03, 9.33s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [4014], local_loss=0.013963881880044937, train_loss=0.027783920988440514, time_cost=2.9259419441223145
+
Steps: 0%| | 4014/1000000 [10:14:07<2581:05:03, 9.33s/it, lr=1e-5, step_loss=0.014]
Steps: 0%| | 4015/1000000 [10:14:11<2172:49:52, 7.85s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [4015], local_loss=0.07585553079843521, train_loss=0.04905453696846962, time_cost=1.2085754871368408
+
Steps: 0%| | 4015/1000000 [10:14:11<2172:49:52, 7.85s/it, lr=1e-5, step_loss=0.0759]
Steps: 0%| | 4016/1000000 [10:14:16<1901:29:11, 6.87s/it, lr=1e-5, step_loss=0.0759][RANK-0]: Step: [4016], local_loss=0.042227961122989655, train_loss=0.03299683332443237, time_cost=3.3671422004699707
+
Steps: 0%| | 4016/1000000 [10:14:16<1901:29:11, 6.87s/it, lr=1e-5, step_loss=0.0422]
Steps: 0%| | 4017/1000000 [10:14:29<2386:04:49, 8.62s/it, lr=1e-5, step_loss=0.0422][RANK-0]: Step: [4017], local_loss=0.021525545045733452, train_loss=0.05497080832719803, time_cost=5.527381181716919
+
Steps: 0%| | 4017/1000000 [10:14:29<2386:04:49, 8.62s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 4018/1000000 [10:14:38<2481:08:23, 8.97s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [4018], local_loss=0.021804623305797577, train_loss=0.028485499322414398, time_cost=8.252992391586304
+
Steps: 0%| | 4018/1000000 [10:14:38<2481:08:23, 8.97s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 4019/1000000 [10:14:52<2839:31:19, 10.26s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [4019], local_loss=0.018942853435873985, train_loss=0.028674472123384476, time_cost=1.2524945735931396
+
Steps: 0%| | 4019/1000000 [10:14:52<2839:31:19, 10.26s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 4020/1000000 [10:14:58<2510:44:12, 9.08s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [4020], local_loss=0.027152828872203827, train_loss=0.028925031423568726, time_cost=1.6062443256378174
+
Steps: 0%| | 4020/1000000 [10:14:58<2510:44:12, 9.08s/it, lr=1e-5, step_loss=0.0272]
Steps: 0%| | 4021/1000000 [10:15:06<2394:46:17, 8.66s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [4021], local_loss=0.02558397501707077, train_loss=0.17056183516979218, time_cost=1.2137105464935303
+
Steps: 0%| | 4021/1000000 [10:15:06<2394:46:17, 8.66s/it, lr=1e-5, step_loss=0.0256]
Steps: 0%| | 4022/1000000 [10:15:11<2122:27:25, 7.67s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [4022], local_loss=0.037156615406274796, train_loss=0.030663074925541878, time_cost=2.0029232501983643
+
Steps: 0%| | 4022/1000000 [10:15:11<2122:27:25, 7.67s/it, lr=1e-5, step_loss=0.0372]
Steps: 0%| | 4023/1000000 [10:15:21<2284:57:32, 8.26s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [4023], local_loss=0.03188891336321831, train_loss=0.03925947844982147, time_cost=2.5781784057617188
+
Steps: 0%| | 4023/1000000 [10:15:21<2284:57:32, 8.26s/it, lr=1e-5, step_loss=0.0319]
Steps: 0%| | 4024/1000000 [10:15:26<2047:52:44, 7.40s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [4024], local_loss=0.02992095984518528, train_loss=0.03206915408372879, time_cost=1.997175931930542
+
Steps: 0%| | 4024/1000000 [10:15:26<2047:52:44, 7.40s/it, lr=1e-5, step_loss=0.0299]
Steps: 0%| | 4025/1000000 [10:15:40<2613:26:04, 9.45s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [4025], local_loss=0.053453460335731506, train_loss=0.049259789288043976, time_cost=2.954007625579834
+
Steps: 0%| | 4025/1000000 [10:15:40<2613:26:04, 9.45s/it, lr=1e-5, step_loss=0.0535]
Steps: 0%| | 4026/1000000 [10:15:46<2347:36:30, 8.49s/it, lr=1e-5, step_loss=0.0535][RANK-0]: Step: [4026], local_loss=0.03952311724424362, train_loss=0.02912573330104351, time_cost=1.9449820518493652
+
Steps: 0%| | 4026/1000000 [10:15:46<2347:36:30, 8.49s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 4027/1000000 [10:15:56<2468:02:19, 8.92s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [4027], local_loss=0.014137925580143929, train_loss=0.04583975672721863, time_cost=5.121779203414917
+
Steps: 0%| | 4027/1000000 [10:15:56<2468:02:19, 8.92s/it, lr=1e-5, step_loss=0.0141]
Steps: 0%| | 4028/1000000 [10:16:01<2148:36:44, 7.77s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [4028], local_loss=0.9859596490859985, train_loss=0.2105315774679184, time_cost=2.09224796295166
+
Steps: 0%| | 4028/1000000 [10:16:01<2148:36:44, 7.77s/it, lr=1e-5, step_loss=0.986]
Steps: 0%| | 4029/1000000 [10:16:08<2058:19:07, 7.44s/it, lr=1e-5, step_loss=0.986][RANK-0]: Step: [4029], local_loss=0.04803231358528137, train_loss=0.08356820046901703, time_cost=2.1517298221588135
+
Steps: 0%| | 4029/1000000 [10:16:08<2058:19:07, 7.44s/it, lr=1e-5, step_loss=0.048]
Steps: 0%| | 4030/1000000 [10:16:13<1856:58:57, 6.71s/it, lr=1e-5, step_loss=0.048][RANK-0]: Step: [4030], local_loss=0.04633290320634842, train_loss=0.0748080387711525, time_cost=3.7890918254852295
+
Steps: 0%| | 4030/1000000 [10:16:13<1856:58:57, 6.71s/it, lr=1e-5, step_loss=0.0463]
Steps: 0%| | 4031/1000000 [10:16:19<1779:43:20, 6.43s/it, lr=1e-5, step_loss=0.0463][RANK-0]: Step: [4031], local_loss=0.021401336416602135, train_loss=0.09467014670372009, time_cost=1.2358014583587646
+
Steps: 0%| | 4031/1000000 [10:16:19<1779:43:20, 6.43s/it, lr=1e-5, step_loss=0.0214]
Steps: 0%| | 4032/1000000 [10:16:33<2418:39:24, 8.74s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [4032], local_loss=0.024816062301397324, train_loss=0.035052813589572906, time_cost=4.848874092102051
+
Steps: 0%| | 4032/1000000 [10:16:33<2418:39:24, 8.74s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 4033/1000000 [10:16:39<2173:40:36, 7.86s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [4033], local_loss=0.015504192560911179, train_loss=0.03358301520347595, time_cost=1.5661876201629639
+
Steps: 0%| | 4033/1000000 [10:16:39<2173:40:36, 7.86s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 4034/1000000 [10:16:53<2722:35:49, 9.84s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4034], local_loss=0.039946477860212326, train_loss=0.05049348995089531, time_cost=5.558278322219849
+
Steps: 0%| | 4034/1000000 [10:16:53<2722:35:49, 9.84s/it, lr=1e-5, step_loss=0.0399]
Steps: 0%| | 4035/1000000 [10:17:05<2861:45:58, 10.34s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [4035], local_loss=0.048649612814188004, train_loss=0.031188765540719032, time_cost=3.6130173206329346
+
Steps: 0%| | 4035/1000000 [10:17:05<2861:45:58, 10.34s/it, lr=1e-5, step_loss=0.0486]
Steps: 0%| | 4036/1000000 [10:17:16<2954:57:05, 10.68s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [4036], local_loss=0.017213376238942146, train_loss=0.04030374437570572, time_cost=1.2303216457366943
+
Steps: 0%| | 4036/1000000 [10:17:16<2954:57:05, 10.68s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 4037/1000000 [10:17:24<2685:35:07, 9.71s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [4037], local_loss=0.04378073289990425, train_loss=0.04927263408899307, time_cost=5.247173547744751
+
Steps: 0%| | 4037/1000000 [10:17:24<2685:35:07, 9.71s/it, lr=1e-5, step_loss=0.0438]
Steps: 0%| | 4038/1000000 [10:17:29<2303:47:50, 8.33s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [4038], local_loss=0.043248098343610764, train_loss=0.028589602559804916, time_cost=1.4027514457702637
+
Steps: 0%| | 4038/1000000 [10:17:29<2303:47:50, 8.33s/it, lr=1e-5, step_loss=0.0432]
Steps: 0%| | 4039/1000000 [10:17:42<2718:29:15, 9.83s/it, lr=1e-5, step_loss=0.0432][RANK-0]: Step: [4039], local_loss=0.025350669398903847, train_loss=0.03952958062291145, time_cost=10.193384647369385
+
Steps: 0%| | 4039/1000000 [10:17:42<2718:29:15, 9.83s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 4040/1000000 [10:17:47<2292:09:22, 8.29s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [4040], local_loss=0.02844209223985672, train_loss=0.026380520313978195, time_cost=1.5512349605560303
+
Steps: 0%| | 4040/1000000 [10:17:47<2292:09:22, 8.29s/it, lr=1e-5, step_loss=0.0284]
Steps: 0%| | 4041/1000000 [10:17:52<2045:28:00, 7.39s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [4041], local_loss=0.051393575966358185, train_loss=0.06935122609138489, time_cost=2.1701290607452393
+
Steps: 0%| | 4041/1000000 [10:17:52<2045:28:00, 7.39s/it, lr=1e-5, step_loss=0.0514]
Steps: 0%| | 4042/1000000 [10:18:03<2317:17:24, 8.38s/it, lr=1e-5, step_loss=0.0514][RANK-0]: Step: [4042], local_loss=0.03187419846653938, train_loss=0.04669000580906868, time_cost=1.212599515914917
+
Steps: 0%| | 4042/1000000 [10:18:03<2317:17:24, 8.38s/it, lr=1e-5, step_loss=0.0319]
Steps: 0%| | 4043/1000000 [10:18:09<2143:32:55, 7.75s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [4043], local_loss=0.017669208347797394, train_loss=0.048106178641319275, time_cost=2.743647813796997
+
Steps: 0%| | 4043/1000000 [10:18:09<2143:32:55, 7.75s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 4044/1000000 [10:18:17<2135:59:57, 7.72s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [4044], local_loss=0.023098204284906387, train_loss=0.02215241640806198, time_cost=1.1983153820037842
+
Steps: 0%| | 4044/1000000 [10:18:17<2135:59:57, 7.72s/it, lr=1e-5, step_loss=0.0231]
Steps: 0%| | 4045/1000000 [10:18:27<2360:47:39, 8.53s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [4045], local_loss=0.010526875965297222, train_loss=0.03651728481054306, time_cost=3.031651496887207
+
Steps: 0%| | 4045/1000000 [10:18:27<2360:47:39, 8.53s/it, lr=1e-5, step_loss=0.0105]
Steps: 0%| | 4046/1000000 [10:18:38<2560:54:40, 9.26s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [4046], local_loss=0.4546360969543457, train_loss=0.08322115242481232, time_cost=4.93266224861145
+
Steps: 0%| | 4046/1000000 [10:18:38<2560:54:40, 9.26s/it, lr=1e-5, step_loss=0.455]
Steps: 0%| | 4047/1000000 [10:18:44<2311:05:27, 8.35s/it, lr=1e-5, step_loss=0.455][RANK-0]: Step: [4047], local_loss=0.013886775821447372, train_loss=0.020077109336853027, time_cost=2.1737494468688965
+
Steps: 0%| | 4047/1000000 [10:18:44<2311:05:27, 8.35s/it, lr=1e-5, step_loss=0.0139]
Steps: 0%| | 4048/1000000 [10:19:01<2968:15:18, 10.73s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [4048], local_loss=0.07634991407394409, train_loss=0.0497024767100811, time_cost=10.480566263198853
+
Steps: 0%| | 4048/1000000 [10:19:01<2968:15:18, 10.73s/it, lr=1e-5, step_loss=0.0763]
Steps: 0%| | 4049/1000000 [10:19:12<3046:27:21, 11.01s/it, lr=1e-5, step_loss=0.0763][RANK-0]: Step: [4049], local_loss=0.04364019259810448, train_loss=0.049834463745355606, time_cost=3.4976003170013428
+
Steps: 0%| | 4049/1000000 [10:19:12<3046:27:21, 11.01s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 4050/1000000 [10:19:19<2689:53:46, 9.72s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [4050], local_loss=0.01843412034213543, train_loss=0.044542647898197174, time_cost=2.5202555656433105
+
Steps: 0%| | 4050/1000000 [10:19:19<2689:53:46, 9.72s/it, lr=1e-5, step_loss=0.0184]
Steps: 0%| | 4051/1000000 [10:19:29<2693:28:45, 9.74s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [4051], local_loss=0.013172746635973454, train_loss=0.174554705619812, time_cost=1.9014811515808105
+
Steps: 0%| | 4051/1000000 [10:19:29<2693:28:45, 9.74s/it, lr=1e-5, step_loss=0.0132]
Steps: 0%| | 4052/1000000 [10:19:38<2611:35:59, 9.44s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [4052], local_loss=0.03168613836169243, train_loss=0.0707075446844101, time_cost=1.7126829624176025
+
Steps: 0%| | 4052/1000000 [10:19:38<2611:35:59, 9.44s/it, lr=1e-5, step_loss=0.0317]
Steps: 0%| | 4053/1000000 [10:19:43<2307:10:40, 8.34s/it, lr=1e-5, step_loss=0.0317][RANK-0]: Step: [4053], local_loss=0.016135504469275475, train_loss=0.17824921011924744, time_cost=1.1960251331329346
+
Steps: 0%| | 4053/1000000 [10:19:43<2307:10:40, 8.34s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 4054/1000000 [10:19:50<2128:44:08, 7.69s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [4054], local_loss=0.0766012892127037, train_loss=0.17528259754180908, time_cost=1.1980116367340088
+
Steps: 0%| | 4054/1000000 [10:19:50<2128:44:08, 7.69s/it, lr=1e-5, step_loss=0.0766]
Steps: 0%| | 4055/1000000 [10:19:54<1870:56:18, 6.76s/it, lr=1e-5, step_loss=0.0766][RANK-0]: Step: [4055], local_loss=0.02216009795665741, train_loss=0.04153623804450035, time_cost=1.2324879169464111
+
Steps: 0%| | 4055/1000000 [10:19:54<1870:56:18, 6.76s/it, lr=1e-5, step_loss=0.0222]
Steps: 0%| | 4056/1000000 [10:20:01<1845:17:33, 6.67s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [4056], local_loss=0.013540051877498627, train_loss=0.03659212216734886, time_cost=1.805891990661621
+
Steps: 0%| | 4056/1000000 [10:20:01<1845:17:33, 6.67s/it, lr=1e-5, step_loss=0.0135]
Steps: 0%| | 4057/1000000 [10:20:09<1950:42:06, 7.05s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [4057], local_loss=0.056119404733181, train_loss=0.030629288405179977, time_cost=1.625115156173706
+
Steps: 0%| | 4057/1000000 [10:20:09<1950:42:06, 7.05s/it, lr=1e-5, step_loss=0.0561]
Steps: 0%| | 4058/1000000 [10:20:24<2672:16:06, 9.66s/it, lr=1e-5, step_loss=0.0561][RANK-0]: Step: [4058], local_loss=0.46029359102249146, train_loss=0.23743510246276855, time_cost=1.203643560409546
+
Steps: 0%| | 4058/1000000 [10:20:24<2672:16:06, 9.66s/it, lr=1e-5, step_loss=0.46]
Steps: 0%| | 4059/1000000 [10:20:35<2728:16:16, 9.86s/it, lr=1e-5, step_loss=0.46][RANK-0]: Step: [4059], local_loss=0.19094915688037872, train_loss=0.07810451090335846, time_cost=1.2069098949432373
+
Steps: 0%| | 4059/1000000 [10:20:35<2728:16:16, 9.86s/it, lr=1e-5, step_loss=0.191]
Steps: 0%| | 4060/1000000 [10:20:49<3074:10:20, 11.11s/it, lr=1e-5, step_loss=0.191][RANK-0]: Step: [4060], local_loss=0.032871562987565994, train_loss=0.10596780478954315, time_cost=4.336248397827148
+
Steps: 0%| | 4060/1000000 [10:20:49<3074:10:20, 11.11s/it, lr=1e-5, step_loss=0.0329]
Steps: 0%| | 4061/1000000 [10:20:58<2926:56:31, 10.58s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [4061], local_loss=0.01535602193325758, train_loss=0.04029180854558945, time_cost=3.3280282020568848
+
Steps: 0%| | 4061/1000000 [10:20:58<2926:56:31, 10.58s/it, lr=1e-5, step_loss=0.0154]
Steps: 0%| | 4062/1000000 [10:21:03<2491:34:51, 9.01s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [4062], local_loss=0.019264565780758858, train_loss=0.050318047404289246, time_cost=1.8952357769012451
+
Steps: 0%| | 4062/1000000 [10:21:03<2491:34:51, 9.01s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 4063/1000000 [10:21:14<2661:14:51, 9.62s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [4063], local_loss=0.02086183987557888, train_loss=0.09135599434375763, time_cost=1.228588342666626
+
Steps: 0%| | 4063/1000000 [10:21:14<2661:14:51, 9.62s/it, lr=1e-5, step_loss=0.0209]
Steps: 0%| | 4064/1000000 [10:21:20<2297:13:17, 8.30s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [4064], local_loss=0.03817611560225487, train_loss=0.06019044667482376, time_cost=2.6405704021453857
+
Steps: 0%| | 4064/1000000 [10:21:20<2297:13:17, 8.30s/it, lr=1e-5, step_loss=0.0382]
Steps: 0%| | 4065/1000000 [10:21:33<2725:10:19, 9.85s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [4065], local_loss=0.030542436987161636, train_loss=0.08941517770290375, time_cost=4.401435852050781
+
Steps: 0%| | 4065/1000000 [10:21:33<2725:10:19, 9.85s/it, lr=1e-5, step_loss=0.0305]
Steps: 0%| | 4066/1000000 [10:21:38<2340:14:11, 8.46s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [4066], local_loss=0.018590597435832024, train_loss=0.02942647971212864, time_cost=1.3767144680023193
+
Steps: 0%| | 4066/1000000 [10:21:38<2340:14:11, 8.46s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 4067/1000000 [10:21:52<2807:18:14, 10.15s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [4067], local_loss=0.08462385088205338, train_loss=0.040971316397190094, time_cost=4.085034370422363
+
Steps: 0%| | 4067/1000000 [10:21:52<2807:18:14, 10.15s/it, lr=1e-5, step_loss=0.0846]
Steps: 0%| | 4068/1000000 [10:22:00<2566:24:21, 9.28s/it, lr=1e-5, step_loss=0.0846][RANK-0]: Step: [4068], local_loss=0.03422411158680916, train_loss=0.03585007041692734, time_cost=2.888988494873047
+
Steps: 0%| | 4068/1000000 [10:22:00<2566:24:21, 9.28s/it, lr=1e-5, step_loss=0.0342]
Steps: 0%| | 4069/1000000 [10:22:14<3008:33:42, 10.88s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [4069], local_loss=0.012995163910090923, train_loss=0.09266415238380432, time_cost=10.954285621643066
+
Steps: 0%| | 4069/1000000 [10:22:14<3008:33:42, 10.88s/it, lr=1e-5, step_loss=0.013]
Steps: 0%| | 4070/1000000 [10:22:22<2753:59:15, 9.95s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [4070], local_loss=0.018835172057151794, train_loss=0.022906426340341568, time_cost=2.708352565765381
+
Steps: 0%| | 4070/1000000 [10:22:22<2753:59:15, 9.95s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 4071/1000000 [10:22:34<2933:34:20, 10.60s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [4071], local_loss=0.028560278937220573, train_loss=0.11918248236179352, time_cost=1.212702989578247
+
Steps: 0%| | 4071/1000000 [10:22:34<2933:34:20, 10.60s/it, lr=1e-5, step_loss=0.0286]
Steps: 0%| | 4072/1000000 [10:22:39<2494:29:53, 9.02s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [4072], local_loss=0.23672683537006378, train_loss=0.09899908304214478, time_cost=1.2163467407226562
+
Steps: 0%| | 4072/1000000 [10:22:39<2494:29:53, 9.02s/it, lr=1e-5, step_loss=0.237]
Steps: 0%| | 4073/1000000 [10:22:49<2503:09:31, 9.05s/it, lr=1e-5, step_loss=0.237][RANK-0]: Step: [4073], local_loss=0.02342372015118599, train_loss=0.028315577656030655, time_cost=2.310504913330078
+
Steps: 0%| | 4073/1000000 [10:22:49<2503:09:31, 9.05s/it, lr=1e-5, step_loss=0.0234]
Steps: 0%| | 4074/1000000 [10:22:55<2259:13:42, 8.17s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [4074], local_loss=0.06370560824871063, train_loss=0.029804885387420654, time_cost=2.226611852645874
+
Steps: 0%| | 4074/1000000 [10:22:55<2259:13:42, 8.17s/it, lr=1e-5, step_loss=0.0637]
Steps: 0%| | 4075/1000000 [10:23:10<2824:26:01, 10.21s/it, lr=1e-5, step_loss=0.0637][RANK-0]: Step: [4075], local_loss=0.12520422041416168, train_loss=0.08319050073623657, time_cost=5.5885865688323975
+
Steps: 0%| | 4075/1000000 [10:23:10<2824:26:01, 10.21s/it, lr=1e-5, step_loss=0.125]
Steps: 0%| | 4076/1000000 [10:23:17<2559:33:24, 9.25s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [4076], local_loss=0.016604114323854446, train_loss=0.02091585285961628, time_cost=2.8199427127838135
+
Steps: 0%| | 4076/1000000 [10:23:17<2559:33:24, 9.25s/it, lr=1e-5, step_loss=0.0166]
Steps: 0%| | 4077/1000000 [10:23:27<2657:27:56, 9.61s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [4077], local_loss=0.0633433386683464, train_loss=0.0726102739572525, time_cost=2.0945236682891846
+
Steps: 0%| | 4077/1000000 [10:23:27<2657:27:56, 9.61s/it, lr=1e-5, step_loss=0.0633]
Steps: 0%| | 4078/1000000 [10:23:41<3017:13:18, 10.91s/it, lr=1e-5, step_loss=0.0633][RANK-0]: Step: [4078], local_loss=0.02215063013136387, train_loss=0.08187027275562286, time_cost=3.7864537239074707
+
Steps: 0%| | 4078/1000000 [10:23:41<3017:13:18, 10.91s/it, lr=1e-5, step_loss=0.0222]
Steps: 0%| | 4079/1000000 [10:23:54<3224:35:59, 11.66s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [4079], local_loss=0.030931223183870316, train_loss=0.027051616460084915, time_cost=11.235817909240723
+
Steps: 0%| | 4079/1000000 [10:23:54<3224:35:59, 11.66s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 4080/1000000 [10:24:02<2865:51:14, 10.36s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [4080], local_loss=0.055808525532484055, train_loss=0.034718483686447144, time_cost=1.4870104789733887
+
Steps: 0%| | 4080/1000000 [10:24:02<2865:51:14, 10.36s/it, lr=1e-5, step_loss=0.0558]
Steps: 0%| | 4081/1000000 [10:24:13<2904:54:31, 10.50s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [4081], local_loss=0.01783253625035286, train_loss=0.0739864930510521, time_cost=2.9582951068878174
+
Steps: 0%| | 4081/1000000 [10:24:13<2904:54:31, 10.50s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 4082/1000000 [10:24:17<2393:56:45, 8.65s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [4082], local_loss=0.032291918992996216, train_loss=0.022839687764644623, time_cost=1.906294822692871
+
Steps: 0%| | 4082/1000000 [10:24:17<2393:56:45, 8.65s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 4083/1000000 [10:24:29<2683:52:17, 9.70s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [4083], local_loss=0.01578741893172264, train_loss=0.03350714594125748, time_cost=1.66255784034729
+
Steps: 0%| | 4083/1000000 [10:24:29<2683:52:17, 9.70s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 4084/1000000 [10:24:35<2392:44:22, 8.65s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [4084], local_loss=0.31525757908821106, train_loss=0.06176941469311714, time_cost=3.1381287574768066
+
Steps: 0%| | 4084/1000000 [10:24:35<2392:44:22, 8.65s/it, lr=1e-5, step_loss=0.315]
Steps: 0%| | 4085/1000000 [10:24:48<2746:14:00, 9.93s/it, lr=1e-5, step_loss=0.315][RANK-0]: Step: [4085], local_loss=0.08249638229608536, train_loss=0.032348621636629105, time_cost=3.972782611846924
+
Steps: 0%| | 4085/1000000 [10:24:48<2746:14:00, 9.93s/it, lr=1e-5, step_loss=0.0825]
Steps: 0%| | 4086/1000000 [10:24:56<2578:54:35, 9.32s/it, lr=1e-5, step_loss=0.0825][RANK-0]: Step: [4086], local_loss=0.02198771946132183, train_loss=0.02778228372335434, time_cost=1.7168443202972412
+
Steps: 0%| | 4086/1000000 [10:24:56<2578:54:35, 9.32s/it, lr=1e-5, step_loss=0.022]
Steps: 0%| | 4087/1000000 [10:25:02<2262:01:47, 8.18s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [4087], local_loss=0.015230529941618443, train_loss=4.995988368988037, time_cost=1.2260942459106445
+
Steps: 0%| | 4087/1000000 [10:25:02<2262:01:47, 8.18s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 4088/1000000 [10:25:12<2482:22:28, 8.97s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [4088], local_loss=0.07372482120990753, train_loss=0.05604731664061546, time_cost=1.575713872909546
+
Steps: 0%| | 4088/1000000 [10:25:12<2482:22:28, 8.97s/it, lr=1e-5, step_loss=0.0737]
Steps: 0%| | 4089/1000000 [10:25:18<2173:02:01, 7.86s/it, lr=1e-5, step_loss=0.0737][RANK-0]: Step: [4089], local_loss=0.026555664837360382, train_loss=0.05007726699113846, time_cost=1.2350366115570068
+
Steps: 0%| | 4089/1000000 [10:25:18<2173:02:01, 7.86s/it, lr=1e-5, step_loss=0.0266]
Steps: 0%| | 4090/1000000 [10:25:25<2154:13:21, 7.79s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [4090], local_loss=0.1701642870903015, train_loss=0.04228820279240608, time_cost=1.8338134288787842
+
Steps: 0%| | 4090/1000000 [10:25:25<2154:13:21, 7.79s/it, lr=1e-5, step_loss=0.17]
Steps: 0%| | 4091/1000000 [10:25:31<1984:04:02, 7.17s/it, lr=1e-5, step_loss=0.17][RANK-0]: Step: [4091], local_loss=0.015521974302828312, train_loss=0.12670999765396118, time_cost=2.5567831993103027
+
Steps: 0%| | 4091/1000000 [10:25:31<1984:04:02, 7.17s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 4092/1000000 [10:25:42<2326:41:46, 8.41s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4092], local_loss=0.6709703207015991, train_loss=0.14625480771064758, time_cost=8.110400199890137
+
Steps: 0%| | 4092/1000000 [10:25:42<2326:41:46, 8.41s/it, lr=1e-5, step_loss=0.671]
Steps: 0%| | 4093/1000000 [10:25:53<2480:07:42, 8.97s/it, lr=1e-5, step_loss=0.671][RANK-0]: Step: [4093], local_loss=0.01811208575963974, train_loss=0.1633988618850708, time_cost=1.505544900894165
+
Steps: 0%| | 4093/1000000 [10:25:53<2480:07:42, 8.97s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 4094/1000000 [10:26:01<2397:35:48, 8.67s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [4094], local_loss=0.040070708841085434, train_loss=0.03799976781010628, time_cost=1.4274086952209473
+
Steps: 0%| | 4094/1000000 [10:26:01<2397:35:48, 8.67s/it, lr=1e-5, step_loss=0.0401]
Steps: 0%| | 4095/1000000 [10:26:05<2053:47:51, 7.42s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [4095], local_loss=0.022361233830451965, train_loss=0.07322216033935547, time_cost=1.4124984741210938
+
Steps: 0%| | 4095/1000000 [10:26:05<2053:47:51, 7.42s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 4096/1000000 [10:26:19<2551:54:44, 9.22s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [4096], local_loss=0.06964799761772156, train_loss=0.0652848333120346, time_cost=3.4066739082336426
+
Steps: 0%| | 4096/1000000 [10:26:19<2551:54:44, 9.22s/it, lr=1e-5, step_loss=0.0696]
Steps: 0%| | 4097/1000000 [10:26:27<2524:49:53, 9.13s/it, lr=1e-5, step_loss=0.0696][RANK-0]: Step: [4097], local_loss=130.82113647460938, train_loss=16.370868682861328, time_cost=1.234715461730957
+
Steps: 0%| | 4097/1000000 [10:26:27<2524:49:53, 9.13s/it, lr=1e-5, step_loss=131]
Steps: 0%| | 4098/1000000 [10:26:43<3028:12:26, 10.95s/it, lr=1e-5, step_loss=131][RANK-0]: Step: [4098], local_loss=0.01802736520767212, train_loss=0.21380524337291718, time_cost=1.2213854789733887
+
Steps: 0%| | 4098/1000000 [10:26:43<3028:12:26, 10.95s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 4099/1000000 [10:26:49<2610:28:08, 9.44s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [4099], local_loss=0.018154550343751907, train_loss=0.02103842794895172, time_cost=3.0373337268829346
+
Steps: 0%| | 4099/1000000 [10:26:49<2610:28:08, 9.44s/it, lr=1e-5, step_loss=0.0182]
Steps: 0%| | 4100/1000000 [10:27:00<2735:27:59, 9.89s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [4100], local_loss=0.037465646862983704, train_loss=0.03784464672207832, time_cost=1.206507921218872
+
Steps: 0%| | 4100/1000000 [10:27:00<2735:27:59, 9.89s/it, lr=1e-5, step_loss=0.0375]
Steps: 0%| | 4101/1000000 [10:27:05<2376:08:00, 8.59s/it, lr=1e-5, step_loss=0.0375][RANK-0]: Step: [4101], local_loss=0.46289509534835815, train_loss=0.08474108576774597, time_cost=1.949408769607544
+
Steps: 0%| | 4101/1000000 [10:27:05<2376:08:00, 8.59s/it, lr=1e-5, step_loss=0.463]
Steps: 0%| | 4102/1000000 [10:27:14<2444:33:39, 8.84s/it, lr=1e-5, step_loss=0.463][RANK-0]: Step: [4102], local_loss=0.014058683067560196, train_loss=0.05966845899820328, time_cost=2.7698681354522705
+
Steps: 0%| | 4102/1000000 [10:27:14<2444:33:39, 8.84s/it, lr=1e-5, step_loss=0.0141]
Steps: 0%| | 4103/1000000 [10:27:20<2174:09:51, 7.86s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [4103], local_loss=0.006767827086150646, train_loss=0.03830892592668533, time_cost=1.3147411346435547
+
Steps: 0%| | 4103/1000000 [10:27:20<2174:09:51, 7.86s/it, lr=1e-5, step_loss=0.00677]
Steps: 0%| | 4104/1000000 [10:27:30<2334:00:07, 8.44s/it, lr=1e-5, step_loss=0.00677][RANK-0]: Step: [4104], local_loss=0.021379444748163223, train_loss=0.050135474652051926, time_cost=2.065486192703247
+
Steps: 0%| | 4104/1000000 [10:27:30<2334:00:07, 8.44s/it, lr=1e-5, step_loss=0.0214]
Steps: 0%| | 4105/1000000 [10:27:38<2351:35:04, 8.50s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [4105], local_loss=0.045409463346004486, train_loss=0.027149386703968048, time_cost=7.081944227218628
+
Steps: 0%| | 4105/1000000 [10:27:38<2351:35:04, 8.50s/it, lr=1e-5, step_loss=0.0454]
Steps: 0%| | 4106/1000000 [10:27:46<2239:01:42, 8.09s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [4106], local_loss=0.06014968454837799, train_loss=0.07608763128519058, time_cost=2.385568857192993
+
Steps: 0%| | 4106/1000000 [10:27:46<2239:01:42, 8.09s/it, lr=1e-5, step_loss=0.0601]
Steps: 0%| | 4107/1000000 [10:27:54<2257:09:21, 8.16s/it, lr=1e-5, step_loss=0.0601][RANK-0]: Step: [4107], local_loss=0.05394681170582771, train_loss=0.03761106729507446, time_cost=2.023073673248291
+
Steps: 0%| | 4107/1000000 [10:27:54<2257:09:21, 8.16s/it, lr=1e-5, step_loss=0.0539]
Steps: 0%| | 4108/1000000 [10:28:03<2345:10:24, 8.48s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [4108], local_loss=0.04877733066678047, train_loss=0.1319674700498581, time_cost=1.4540677070617676
+
Steps: 0%| | 4108/1000000 [10:28:03<2345:10:24, 8.48s/it, lr=1e-5, step_loss=0.0488]
Steps: 0%| | 4109/1000000 [10:28:10<2167:31:48, 7.84s/it, lr=1e-5, step_loss=0.0488][RANK-0]: Step: [4109], local_loss=0.013254053890705109, train_loss=0.06734958291053772, time_cost=1.2278296947479248
+
Steps: 0%| | 4109/1000000 [10:28:10<2167:31:48, 7.84s/it, lr=1e-5, step_loss=0.0133]
Steps: 0%| | 4110/1000000 [10:28:16<2023:57:56, 7.32s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [4110], local_loss=0.10552668571472168, train_loss=0.05960741639137268, time_cost=1.9034438133239746
+
Steps: 0%| | 4110/1000000 [10:28:16<2023:57:56, 7.32s/it, lr=1e-5, step_loss=0.106]
Steps: 0%| | 4111/1000000 [10:28:24<2078:08:28, 7.51s/it, lr=1e-5, step_loss=0.106][RANK-0]: Step: [4111], local_loss=0.2754529118537903, train_loss=0.12002154439687729, time_cost=5.783147811889648
+
Steps: 0%| | 4111/1000000 [10:28:24<2078:08:28, 7.51s/it, lr=1e-5, step_loss=0.275]
Steps: 0%| | 4112/1000000 [10:28:33<2259:17:51, 8.17s/it, lr=1e-5, step_loss=0.275][RANK-0]: Step: [4112], local_loss=0.02398613840341568, train_loss=0.06486330926418304, time_cost=2.194772243499756
+
Steps: 0%| | 4112/1000000 [10:28:33<2259:17:51, 8.17s/it, lr=1e-5, step_loss=0.024]
Steps: 0%| | 4113/1000000 [10:28:48<2811:07:47, 10.16s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [4113], local_loss=0.007655290886759758, train_loss=0.0759066492319107, time_cost=6.142132997512817
+
Steps: 0%| | 4113/1000000 [10:28:48<2811:07:47, 10.16s/it, lr=1e-5, step_loss=0.00766]
Steps: 0%| | 4114/1000000 [10:28:57<2738:35:07, 9.90s/it, lr=1e-5, step_loss=0.00766][RANK-0]: Step: [4114], local_loss=0.5138713717460632, train_loss=0.08790615200996399, time_cost=1.2432141304016113
+
Steps: 0%| | 4114/1000000 [10:28:57<2738:35:07, 9.90s/it, lr=1e-5, step_loss=0.514]
Steps: 0%| | 4115/1000000 [10:29:03<2419:52:39, 8.75s/it, lr=1e-5, step_loss=0.514][RANK-0]: Step: [4115], local_loss=0.015653086826205254, train_loss=0.07513128966093063, time_cost=2.6448962688446045
+
Steps: 0%| | 4115/1000000 [10:29:03<2419:52:39, 8.75s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 4116/1000000 [10:29:17<2822:16:28, 10.20s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [4116], local_loss=0.07984130829572678, train_loss=0.05384241044521332, time_cost=1.1960859298706055
+
Steps: 0%| | 4116/1000000 [10:29:17<2822:16:28, 10.20s/it, lr=1e-5, step_loss=0.0798]
Steps: 0%| | 4117/1000000 [10:29:27<2831:19:27, 10.23s/it, lr=1e-5, step_loss=0.0798][RANK-0]: Step: [4117], local_loss=0.048576273024082184, train_loss=0.09311385452747345, time_cost=4.627519369125366
+
Steps: 0%| | 4117/1000000 [10:29:27<2831:19:27, 10.23s/it, lr=1e-5, step_loss=0.0486]
Steps: 0%| | 4118/1000000 [10:29:32<2393:11:07, 8.65s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [4118], local_loss=0.017983779311180115, train_loss=0.06582444161176682, time_cost=2.447434902191162
+
Steps: 0%| | 4118/1000000 [10:29:32<2393:11:07, 8.65s/it, lr=1e-5, step_loss=0.018]
Steps: 0%| | 4119/1000000 [10:29:37<2077:41:25, 7.51s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [4119], local_loss=0.01650402508676052, train_loss=0.07723762094974518, time_cost=2.144848108291626
+
Steps: 0%| | 4119/1000000 [10:29:37<2077:41:25, 7.51s/it, lr=1e-5, step_loss=0.0165]
Steps: 0%| | 4120/1000000 [10:29:44<2025:08:07, 7.32s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [4120], local_loss=0.019878484308719635, train_loss=0.059566430747509, time_cost=2.3162343502044678
+
Steps: 0%| | 4120/1000000 [10:29:44<2025:08:07, 7.32s/it, lr=1e-5, step_loss=0.0199]
Steps: 0%| | 4121/1000000 [10:29:56<2409:22:58, 8.71s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [4121], local_loss=0.01579042710363865, train_loss=0.034040093421936035, time_cost=9.798445463180542
+
Steps: 0%| | 4121/1000000 [10:29:56<2409:22:58, 8.71s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 4122/1000000 [10:30:14<3210:39:48, 11.61s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [4122], local_loss=0.02084827423095703, train_loss=0.066309854388237, time_cost=8.486838340759277
+
Steps: 0%| | 4122/1000000 [10:30:14<3210:39:48, 11.61s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 4123/1000000 [10:30:22<2876:02:26, 10.40s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [4123], local_loss=0.032078150659799576, train_loss=0.03320109099149704, time_cost=2.492953300476074
+
Steps: 0%| | 4123/1000000 [10:30:22<2876:02:26, 10.40s/it, lr=1e-5, step_loss=0.0321]
Steps: 0%| | 4124/1000000 [10:30:28<2483:53:53, 8.98s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [4124], local_loss=0.016168303787708282, train_loss=0.04729101061820984, time_cost=1.818692684173584
+
Steps: 0%| | 4124/1000000 [10:30:28<2483:53:53, 8.98s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 4125/1000000 [10:30:34<2303:59:05, 8.33s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [4125], local_loss=0.03359417989850044, train_loss=0.05738289654254913, time_cost=1.2187511920928955
+
Steps: 0%| | 4125/1000000 [10:30:34<2303:59:05, 8.33s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 4126/1000000 [10:30:45<2527:20:23, 9.14s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [4126], local_loss=0.5022830367088318, train_loss=0.3946130871772766, time_cost=1.2642271518707275
+
Steps: 0%| | 4126/1000000 [10:30:45<2527:20:23, 9.14s/it, lr=1e-5, step_loss=0.502]
Steps: 0%| | 4127/1000000 [10:30:51<2253:51:18, 8.15s/it, lr=1e-5, step_loss=0.502][RANK-0]: Step: [4127], local_loss=0.023763112723827362, train_loss=0.05605185776948929, time_cost=1.50559663772583
+
Steps: 0%| | 4127/1000000 [10:30:51<2253:51:18, 8.15s/it, lr=1e-5, step_loss=0.0238]
Steps: 0%| | 4128/1000000 [10:31:02<2498:03:55, 9.03s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [4128], local_loss=0.04103957116603851, train_loss=0.05138903856277466, time_cost=1.8009130954742432
+
Steps: 0%| | 4128/1000000 [10:31:02<2498:03:55, 9.03s/it, lr=1e-5, step_loss=0.041]
Steps: 0%| | 4129/1000000 [10:31:14<2679:54:00, 9.69s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [4129], local_loss=0.08409234136343002, train_loss=0.034779228270053864, time_cost=3.1569244861602783
+
Steps: 0%| | 4129/1000000 [10:31:14<2679:54:00, 9.69s/it, lr=1e-5, step_loss=0.0841]
[RANK-0] training log, steps 4130–4353 of 1000000 (lr=1e-5, ~0.4% complete).
Per-step local_loss ranged from ~0.0084 to ~1.00; the running train_loss stayed mostly within 0.02–0.23, with isolated spikes (5.42 at step 4136, 19.17 at 4180, 13.17 at 4247, 23.13 at 4275, 13.94 at 4314, 8.05 at 4343). Step time fluctuated between roughly 6.2 and 13.9 s/it, for an estimated time remaining of about 1700–3850 hours at this stage of the run.
+
Steps: 0%| | 4353/1000000 [11:07:02<2229:56:16, 8.06s/it, lr=1e-5, step_loss=0.0576]
Steps: 0%| | 4354/1000000 [11:07:12<2439:34:28, 8.82s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [4354], local_loss=0.01824365369975567, train_loss=0.0680730938911438, time_cost=1.2278430461883545
+
Steps: 0%| | 4354/1000000 [11:07:12<2439:34:28, 8.82s/it, lr=1e-5, step_loss=0.0182]
Steps: 0%| | 4355/1000000 [11:07:24<2650:06:32, 9.58s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [4355], local_loss=0.049884382635354996, train_loss=0.15119445323944092, time_cost=1.2386016845703125
+
Steps: 0%| | 4355/1000000 [11:07:24<2650:06:32, 9.58s/it, lr=1e-5, step_loss=0.0499]
Steps: 0%| | 4356/1000000 [11:07:30<2423:41:00, 8.76s/it, lr=1e-5, step_loss=0.0499][RANK-0]: Step: [4356], local_loss=0.016246087849140167, train_loss=0.045246947556734085, time_cost=1.7379090785980225
+
Steps: 0%| | 4356/1000000 [11:07:30<2423:41:00, 8.76s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 4357/1000000 [11:07:46<2993:22:24, 10.82s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [4357], local_loss=0.023070385679602623, train_loss=0.05456313118338585, time_cost=6.912100553512573
+
Steps: 0%| | 4357/1000000 [11:07:46<2993:22:24, 10.82s/it, lr=1e-5, step_loss=0.0231]
Steps: 0%| | 4358/1000000 [11:07:54<2738:18:02, 9.90s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [4358], local_loss=0.02931739203631878, train_loss=0.05090715363621712, time_cost=1.2070825099945068
+
Steps: 0%| | 4358/1000000 [11:07:54<2738:18:02, 9.90s/it, lr=1e-5, step_loss=0.0293]
Steps: 0%| | 4359/1000000 [11:08:07<3018:41:34, 10.91s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [4359], local_loss=0.20793995261192322, train_loss=0.06897582113742828, time_cost=3.7133567333221436
+
Steps: 0%| | 4359/1000000 [11:08:07<3018:41:34, 10.91s/it, lr=1e-5, step_loss=0.208]
Steps: 0%| | 4360/1000000 [11:08:17<2901:02:08, 10.49s/it, lr=1e-5, step_loss=0.208][RANK-0]: Step: [4360], local_loss=0.05841880291700363, train_loss=0.17463847994804382, time_cost=2.803694248199463
+
Steps: 0%| | 4360/1000000 [11:08:17<2901:02:08, 10.49s/it, lr=1e-5, step_loss=0.0584]
Steps: 0%| | 4361/1000000 [11:08:22<2449:10:05, 8.86s/it, lr=1e-5, step_loss=0.0584][RANK-0]: Step: [4361], local_loss=0.016236256808042526, train_loss=0.11278647184371948, time_cost=2.571601152420044
+
Steps: 0%| | 4361/1000000 [11:08:22<2449:10:05, 8.86s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 4362/1000000 [11:08:35<2788:32:41, 10.08s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [4362], local_loss=0.017542697489261627, train_loss=0.13174717128276825, time_cost=4.3161022663116455
+
Steps: 0%| | 4362/1000000 [11:08:35<2788:32:41, 10.08s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 4363/1000000 [11:08:46<2892:03:55, 10.46s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [4363], local_loss=0.09067557752132416, train_loss=0.0526033453643322, time_cost=3.3529295921325684
+
Steps: 0%| | 4363/1000000 [11:08:46<2892:03:55, 10.46s/it, lr=1e-5, step_loss=0.0907]
Steps: 0%| | 4364/1000000 [11:08:56<2873:47:28, 10.39s/it, lr=1e-5, step_loss=0.0907][RANK-0]: Step: [4364], local_loss=0.0241360105574131, train_loss=0.08796659111976624, time_cost=4.0754923820495605
+
Steps: 0%| | 4364/1000000 [11:08:56<2873:47:28, 10.39s/it, lr=1e-5, step_loss=0.0241]
Steps: 0%| | 4365/1000000 [11:09:01<2410:24:58, 8.72s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [4365], local_loss=0.3935316205024719, train_loss=0.0870983898639679, time_cost=1.6694531440734863
+
Steps: 0%| | 4365/1000000 [11:09:01<2410:24:58, 8.72s/it, lr=1e-5, step_loss=0.394] /home/image_data/hxy/Open-Sora-Plan/opensora/utils/utils.py:369: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
+ caption = BeautifulSoup(caption, features='html.parser').text
+
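The warning comes from the caption-cleaning step, which pushes every caption string through BeautifulSoup even when it is plain text, so bs4 flags strings that resemble a "locator" (a filename or URL). A minimal sketch of how the warning could be silenced, assuming bs4's public MarkupResemblesLocatorWarning class; the wrapper name clean_caption is hypothetical, and only the BeautifulSoup call mirrors the logged line:

```python
import warnings

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

# Plain captions routinely look like filenames/URLs to bs4, which then emits
# MarkupResemblesLocatorWarning on every call; filter the category once.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)


def clean_caption(caption: str) -> str:  # hypothetical wrapper name
    # Mirrors the logged line: strip any HTML tags, pass plain text through.
    return BeautifulSoup(caption, features="html.parser").text
```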
Steps: 0%| | 4366/1000000 [11:09:16<2971:53:54, 10.75s/it, lr=1e-5, step_loss=0.394][RANK-0]: Step: [4366], local_loss=0.05824385583400726, train_loss=0.11233744770288467, time_cost=5.159188270568848
+
Steps: 0%| | 4366/1000000 [11:09:16<2971:53:54, 10.75s/it, lr=1e-5, step_loss=0.0582]
Steps: 0%| | 4367/1000000 [11:09:24<2700:28:49, 9.76s/it, lr=1e-5, step_loss=0.0582][RANK-0]: Step: [4367], local_loss=0.033985886722803116, train_loss=0.15073031187057495, time_cost=2.463859796524048
+
Steps: 0%| | 4367/1000000 [11:09:24<2700:28:49, 9.76s/it, lr=1e-5, step_loss=0.034]
Steps: 0%| | 4368/1000000 [11:09:36<2869:21:29, 10.38s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [4368], local_loss=0.054136984050273895, train_loss=0.0848989263176918, time_cost=1.2166779041290283
+
Steps: 0%| | 4368/1000000 [11:09:36<2869:21:29, 10.38s/it, lr=1e-5, step_loss=0.0541]
Steps: 0%| | 4369/1000000 [11:09:41<2454:45:00, 8.88s/it, lr=1e-5, step_loss=0.0541][RANK-0]: Step: [4369], local_loss=0.021104738116264343, train_loss=0.024367043748497963, time_cost=3.8608157634735107
+
Steps: 0%| | 4369/1000000 [11:09:41<2454:45:00, 8.88s/it, lr=1e-5, step_loss=0.0211]
Steps: 0%| | 4370/1000000 [11:09:54<2819:03:07, 10.19s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [4370], local_loss=0.021055402234196663, train_loss=0.045538388192653656, time_cost=9.052120208740234
+
Steps: 0%| | 4370/1000000 [11:09:54<2819:03:07, 10.19s/it, lr=1e-5, step_loss=0.0211]
Steps: 0%| | 4371/1000000 [11:10:12<3436:10:52, 12.42s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [4371], local_loss=0.12736904621124268, train_loss=0.043864671140909195, time_cost=1.2295656204223633
+
Steps: 0%| | 4371/1000000 [11:10:12<3436:10:52, 12.42s/it, lr=1e-5, step_loss=0.127]
Steps: 0%| | 4372/1000000 [11:10:22<3236:36:03, 11.70s/it, lr=1e-5, step_loss=0.127][RANK-0]: Step: [4372], local_loss=0.01922772452235222, train_loss=0.024760890752077103, time_cost=1.47501540184021
+
Steps: 0%| | 4372/1000000 [11:10:22<3236:36:03, 11.70s/it, lr=1e-5, step_loss=0.0192]
Steps: 0%| | 4373/1000000 [11:10:35<3325:55:47, 12.03s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [4373], local_loss=0.013594991527497768, train_loss=0.024400709196925163, time_cost=1.202821969985962
+
Steps: 0%| | 4373/1000000 [11:10:35<3325:55:47, 12.03s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 4374/1000000 [11:10:40<2774:45:25, 10.03s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [4374], local_loss=0.13444936275482178, train_loss=0.035690538585186005, time_cost=2.3245041370391846
+
Steps: 0%| | 4374/1000000 [11:10:40<2774:45:25, 10.03s/it, lr=1e-5, step_loss=0.134]
Steps: 0%| | 4375/1000000 [11:10:46<2424:18:49, 8.77s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [4375], local_loss=0.024256527423858643, train_loss=0.02831112965941429, time_cost=1.5400118827819824
+
Steps: 0%| | 4375/1000000 [11:10:46<2424:18:49, 8.77s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 4376/1000000 [11:11:02<3055:02:20, 11.05s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [4376], local_loss=0.03194313496351242, train_loss=0.038153037428855896, time_cost=8.985477447509766
+
Steps: 0%| | 4376/1000000 [11:11:02<3055:02:20, 11.05s/it, lr=1e-5, step_loss=0.0319]
Steps: 0%| | 4377/1000000 [11:11:11<2839:31:24, 10.27s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [4377], local_loss=0.01221058052033186, train_loss=0.03588094934821129, time_cost=1.2434494495391846
+
Steps: 0%| | 4377/1000000 [11:11:11<2839:31:24, 10.27s/it, lr=1e-5, step_loss=0.0122]
Steps: 0%| | 4378/1000000 [11:11:18<2595:10:49, 9.38s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [4378], local_loss=0.03975076600909233, train_loss=0.0829218178987503, time_cost=1.200012445449829
+
Steps: 0%| | 4378/1000000 [11:11:18<2595:10:49, 9.38s/it, lr=1e-5, step_loss=0.0398]
Steps: 0%| | 4379/1000000 [11:11:26<2459:25:04, 8.89s/it, lr=1e-5, step_loss=0.0398][RANK-0]: Step: [4379], local_loss=0.04240936413407326, train_loss=0.0442960187792778, time_cost=2.6628615856170654
+
Steps: 0%| | 4379/1000000 [11:11:26<2459:25:04, 8.89s/it, lr=1e-5, step_loss=0.0424]
Steps: 0%| | 4380/1000000 [11:11:31<2115:51:20, 7.65s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [4380], local_loss=0.01937958225607872, train_loss=0.14790458977222443, time_cost=1.8521299362182617
+
Steps: 0%| | 4380/1000000 [11:11:31<2115:51:20, 7.65s/it, lr=1e-5, step_loss=0.0194]
Steps: 0%| | 4381/1000000 [11:11:38<2092:49:21, 7.57s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [4381], local_loss=0.06741844862699509, train_loss=0.048111096024513245, time_cost=2.446894884109497
+
Steps: 0%| | 4381/1000000 [11:11:38<2092:49:21, 7.57s/it, lr=1e-5, step_loss=0.0674]
Steps: 0%| | 4382/1000000 [11:11:49<2384:32:22, 8.62s/it, lr=1e-5, step_loss=0.0674][RANK-0]: Step: [4382], local_loss=0.035589635372161865, train_loss=0.03932804614305496, time_cost=3.433314323425293
+
Steps: 0%| | 4382/1000000 [11:11:49<2384:32:22, 8.62s/it, lr=1e-5, step_loss=0.0356]
Steps: 0%| | 4383/1000000 [11:11:55<2153:03:22, 7.79s/it, lr=1e-5, step_loss=0.0356][RANK-0]: Step: [4383], local_loss=0.011138007044792175, train_loss=0.03243841975927353, time_cost=1.9208214282989502
+
Steps: 0%| | 4383/1000000 [11:11:55<2153:03:22, 7.79s/it, lr=1e-5, step_loss=0.0111]
Steps: 0%| | 4384/1000000 [11:12:05<2378:29:23, 8.60s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [4384], local_loss=0.0551135390996933, train_loss=0.06146809086203575, time_cost=3.7077267169952393
+
Steps: 0%| | 4384/1000000 [11:12:05<2378:29:23, 8.60s/it, lr=1e-5, step_loss=0.0551]
Steps: 0%| | 4385/1000000 [11:12:16<2571:25:12, 9.30s/it, lr=1e-5, step_loss=0.0551][RANK-0]: Step: [4385], local_loss=0.010857970453798771, train_loss=0.09071987867355347, time_cost=7.589498519897461
+
Steps: 0%| | 4385/1000000 [11:12:16<2571:25:12, 9.30s/it, lr=1e-5, step_loss=0.0109]
Steps: 0%| | 4386/1000000 [11:12:22<2293:51:27, 8.29s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [4386], local_loss=0.022070230916142464, train_loss=0.022075491026043892, time_cost=1.2765586376190186
+
Steps: 0%| | 4386/1000000 [11:12:22<2293:51:27, 8.29s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 4387/1000000 [11:12:30<2216:34:56, 8.01s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [4387], local_loss=0.06990905106067657, train_loss=0.04847995564341545, time_cost=1.1995978355407715
+
Steps: 0%| | 4387/1000000 [11:12:30<2216:34:56, 8.01s/it, lr=1e-5, step_loss=0.0699]
Steps: 0%| | 4388/1000000 [11:12:37<2156:16:29, 7.80s/it, lr=1e-5, step_loss=0.0699][RANK-0]: Step: [4388], local_loss=0.014968606643378735, train_loss=0.08980660885572433, time_cost=4.220609664916992
+
Steps: 0%| | 4388/1000000 [11:12:37<2156:16:29, 7.80s/it, lr=1e-5, step_loss=0.015]
Steps: 0%| | 4389/1000000 [11:12:53<2862:10:43, 10.35s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [4389], local_loss=0.02060128003358841, train_loss=0.029941851273179054, time_cost=8.699724435806274
+
Steps: 0%| | 4389/1000000 [11:12:53<2862:10:43, 10.35s/it, lr=1e-5, step_loss=0.0206]
Steps: 0%| | 4390/1000000 [11:12:58<2365:39:33, 8.55s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [4390], local_loss=0.02696882374584675, train_loss=0.07977922260761261, time_cost=1.500293254852295
+
Steps: 0%| | 4390/1000000 [11:12:58<2365:39:33, 8.55s/it, lr=1e-5, step_loss=0.027]
Steps: 0%| | 4391/1000000 [11:13:09<2584:39:01, 9.35s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [4391], local_loss=0.017687181010842323, train_loss=0.06307217478752136, time_cost=4.033728122711182
+
Steps: 0%| | 4391/1000000 [11:13:09<2584:39:01, 9.35s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 4392/1000000 [11:13:13<2168:14:36, 7.84s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [4392], local_loss=0.04645582661032677, train_loss=0.044373348355293274, time_cost=3.40718150138855
+
Steps: 0%| | 4392/1000000 [11:13:13<2168:14:36, 7.84s/it, lr=1e-5, step_loss=0.0465]
Steps: 0%| | 4393/1000000 [11:13:23<2355:43:40, 8.52s/it, lr=1e-5, step_loss=0.0465][RANK-0]: Step: [4393], local_loss=0.05469739809632301, train_loss=0.05959320068359375, time_cost=1.2680819034576416
+
Steps: 0%| | 4393/1000000 [11:13:23<2355:43:40, 8.52s/it, lr=1e-5, step_loss=0.0547]
Steps: 0%| | 4394/1000000 [11:13:37<2778:49:28, 10.05s/it, lr=1e-5, step_loss=0.0547][RANK-0]: Step: [4394], local_loss=0.03757398575544357, train_loss=0.03612112998962402, time_cost=3.6207449436187744
+
Steps: 0%| | 4394/1000000 [11:13:37<2778:49:28, 10.05s/it, lr=1e-5, step_loss=0.0376]
Steps: 0%| | 4395/1000000 [11:13:51<3102:36:24, 11.22s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [4395], local_loss=0.04248354211449623, train_loss=0.12573841214179993, time_cost=5.3424811363220215
+
Steps: 0%| | 4395/1000000 [11:13:51<3102:36:24, 11.22s/it, lr=1e-5, step_loss=0.0425]
Steps: 0%| | 4396/1000000 [11:14:06<3394:54:56, 12.28s/it, lr=1e-5, step_loss=0.0425][RANK-0]: Step: [4396], local_loss=0.014694868586957455, train_loss=0.030958378687500954, time_cost=4.795273065567017
+
Steps: 0%| | 4396/1000000 [11:14:06<3394:54:56, 12.28s/it, lr=1e-5, step_loss=0.0147]
Steps: 0%| | 4397/1000000 [11:14:13<2974:08:15, 10.75s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [4397], local_loss=0.01659592241048813, train_loss=0.11769206076860428, time_cost=1.2376418113708496
+
Steps: 0%| | 4397/1000000 [11:14:13<2974:08:15, 10.75s/it, lr=1e-5, step_loss=0.0166]
Steps: 0%| | 4398/1000000 [11:14:20<2652:16:43, 9.59s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [4398], local_loss=0.05418483912944794, train_loss=0.03140611946582794, time_cost=2.454662561416626
+
Steps: 0%| | 4398/1000000 [11:14:20<2652:16:43, 9.59s/it, lr=1e-5, step_loss=0.0542]
Steps: 0%| | 4399/1000000 [11:14:33<2935:25:12, 10.61s/it, lr=1e-5, step_loss=0.0542][RANK-0]: Step: [4399], local_loss=0.08331651240587234, train_loss=0.09981663525104523, time_cost=4.36470627784729
+
Steps: 0%| | 4399/1000000 [11:14:33<2935:25:12, 10.61s/it, lr=1e-5, step_loss=0.0833]
Steps: 0%| | 4400/1000000 [11:14:46<3148:45:44, 11.39s/it, lr=1e-5, step_loss=0.0833][RANK-0]: Step: [4400], local_loss=0.15053299069404602, train_loss=0.04704166203737259, time_cost=10.192960500717163
+
Steps: 0%| | 4400/1000000 [11:14:46<3148:45:44, 11.39s/it, lr=1e-5, step_loss=0.151]
Steps: 0%| | 4401/1000000 [11:14:58<3232:34:19, 11.69s/it, lr=1e-5, step_loss=0.151][RANK-0]: Step: [4401], local_loss=0.04406818374991417, train_loss=0.04902440682053566, time_cost=8.770163536071777
+
Steps: 0%| | 4401/1000000 [11:14:58<3232:34:19, 11.69s/it, lr=1e-5, step_loss=0.0441]
Steps: 0%| | 4402/1000000 [11:15:06<2870:23:05, 10.38s/it, lr=1e-5, step_loss=0.0441][RANK-0]: Step: [4402], local_loss=0.017424622550606728, train_loss=0.026219811290502548, time_cost=1.2290809154510498
+
Steps: 0%| | 4402/1000000 [11:15:06<2870:23:05, 10.38s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 4403/1000000 [11:15:22<3391:10:42, 12.26s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [4403], local_loss=0.06490974873304367, train_loss=0.08451057970523834, time_cost=8.00304913520813
+
Steps: 0%| | 4403/1000000 [11:15:22<3391:10:42, 12.26s/it, lr=1e-5, step_loss=0.0649]
Steps: 0%| | 4404/1000000 [11:15:38<3650:09:48, 13.20s/it, lr=1e-5, step_loss=0.0649][RANK-0]: Step: [4404], local_loss=0.021031370386481285, train_loss=0.023611251264810562, time_cost=3.1272826194763184
+
Steps: 0%| | 4404/1000000 [11:15:38<3650:09:48, 13.20s/it, lr=1e-5, step_loss=0.021]
Steps: 0%| | 4405/1000000 [11:15:48<3436:51:19, 12.43s/it, lr=1e-5, step_loss=0.021][RANK-0]: Step: [4405], local_loss=0.0208415724337101, train_loss=0.02293587103486061, time_cost=4.526559114456177
+
Steps: 0%| | 4405/1000000 [11:15:48<3436:51:19, 12.43s/it, lr=1e-5, step_loss=0.0208]
Steps: 0%| | 4406/1000000 [11:15:54<2904:48:52, 10.50s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [4406], local_loss=0.01073056273162365, train_loss=0.043350353837013245, time_cost=1.2443058490753174
+
Steps: 0%| | 4406/1000000 [11:15:54<2904:48:52, 10.50s/it, lr=1e-5, step_loss=0.0107]
Steps: 0%| | 4407/1000000 [11:16:12<3516:56:34, 12.72s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [4407], local_loss=0.06898601353168488, train_loss=0.05008173733949661, time_cost=15.649166345596313
+
Steps: 0%| | 4407/1000000 [11:16:12<3516:56:34, 12.72s/it, lr=1e-5, step_loss=0.069]
Steps: 0%| | 4408/1000000 [11:16:20<3132:03:21, 11.33s/it, lr=1e-5, step_loss=0.069][RANK-0]: Step: [4408], local_loss=0.050088442862033844, train_loss=0.06192971020936966, time_cost=4.449138641357422
+
Steps: 0%| | 4408/1000000 [11:16:20<3132:03:21, 11.33s/it, lr=1e-5, step_loss=0.0501]
Steps: 0%| | 4409/1000000 [11:16:32<3163:06:39, 11.44s/it, lr=1e-5, step_loss=0.0501][RANK-0]: Step: [4409], local_loss=0.01952219009399414, train_loss=0.03707262501120567, time_cost=2.525797128677368
+
Steps: 0%| | 4409/1000000 [11:16:32<3163:06:39, 11.44s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 4410/1000000 [11:16:36<2562:53:34, 9.27s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [4410], local_loss=0.014768103137612343, train_loss=0.02986595407128334, time_cost=1.233316421508789
+
Steps: 0%| | 4410/1000000 [11:16:36<2562:53:34, 9.27s/it, lr=1e-5, step_loss=0.0148]
Steps: 0%| | 4411/1000000 [11:16:41<2200:17:50, 7.96s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [4411], local_loss=0.010819233022630215, train_loss=0.03773513808846474, time_cost=1.885805368423462
+
Steps: 0%| | 4411/1000000 [11:16:41<2200:17:50, 7.96s/it, lr=1e-5, step_loss=0.0108]
Steps: 0%| | 4412/1000000 [11:16:50<2286:56:19, 8.27s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [4412], local_loss=0.018108857795596123, train_loss=0.02532222867012024, time_cost=1.2709527015686035
+
Steps: 0%| | 4412/1000000 [11:16:50<2286:56:19, 8.27s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 4413/1000000 [11:17:04<2733:40:03, 9.88s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [4413], local_loss=0.014271056279540062, train_loss=0.048286501318216324, time_cost=4.234731912612915
+
Steps: 0%| | 4413/1000000 [11:17:04<2733:40:03, 9.88s/it, lr=1e-5, step_loss=0.0143]
Steps: 0%| | 4414/1000000 [11:17:15<2840:42:40, 10.27s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [4414], local_loss=0.017117546871304512, train_loss=0.027978844940662384, time_cost=1.3921318054199219
+
Steps: 0%| | 4414/1000000 [11:17:15<2840:42:40, 10.27s/it, lr=1e-5, step_loss=0.0171]
Steps: 0%| | 4415/1000000 [11:17:19<2351:18:03, 8.50s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [4415], local_loss=0.8213872313499451, train_loss=0.14310702681541443, time_cost=1.5582196712493896
+
Steps: 0%| | 4415/1000000 [11:17:19<2351:18:03, 8.50s/it, lr=1e-5, step_loss=0.821]
Steps: 0%| | 4416/1000000 [11:17:26<2242:49:26, 8.11s/it, lr=1e-5, step_loss=0.821][RANK-0]: Step: [4416], local_loss=0.10392878204584122, train_loss=0.04984080046415329, time_cost=2.623892307281494
+
Steps: 0%| | 4416/1000000 [11:17:26<2242:49:26, 8.11s/it, lr=1e-5, step_loss=0.104]
Steps: 0%| | 4417/1000000 [11:17:31<1969:57:50, 7.12s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [4417], local_loss=0.03445756807923317, train_loss=0.03809034079313278, time_cost=1.9056706428527832
+
Steps: 0%| | 4417/1000000 [11:17:31<1969:57:50, 7.12s/it, lr=1e-5, step_loss=0.0345]
Steps: 0%| | 4418/1000000 [11:17:37<1864:28:29, 6.74s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [4418], local_loss=0.016162773594260216, train_loss=0.03415735438466072, time_cost=1.214461088180542
+
Steps: 0%| | 4418/1000000 [11:17:37<1864:28:29, 6.74s/it, lr=1e-5, step_loss=0.0162]
Steps: 0%| | 4419/1000000 [11:17:44<1898:32:10, 6.87s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [4419], local_loss=0.025084152817726135, train_loss=0.10954681038856506, time_cost=1.280404806137085
+
Steps: 0%| | 4419/1000000 [11:17:44<1898:32:10, 6.87s/it, lr=1e-5, step_loss=0.0251]
Steps: 0%| | 4420/1000000 [11:17:54<2174:20:25, 7.86s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [4420], local_loss=0.04347328841686249, train_loss=0.049203481525182724, time_cost=1.2206599712371826
+
Steps: 0%| | 4420/1000000 [11:17:54<2174:20:25, 7.86s/it, lr=1e-5, step_loss=0.0435]
Steps: 0%| | 4421/1000000 [11:18:00<1989:09:32, 7.19s/it, lr=1e-5, step_loss=0.0435][RANK-0]: Step: [4421], local_loss=0.021594839170575142, train_loss=0.05290985852479935, time_cost=1.2097697257995605
+
Steps: 0%| | 4421/1000000 [11:18:00<1989:09:32, 7.19s/it, lr=1e-5, step_loss=0.0216]
Steps: 0%| | 4422/1000000 [11:18:05<1806:36:31, 6.53s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [4422], local_loss=0.28270360827445984, train_loss=0.07515071332454681, time_cost=2.132528781890869
+
Steps: 0%| | 4422/1000000 [11:18:05<1806:36:31, 6.53s/it, lr=1e-5, step_loss=0.283]
Steps: 0%| | 4423/1000000 [11:18:14<2011:01:22, 7.27s/it, lr=1e-5, step_loss=0.283][RANK-0]: Step: [4423], local_loss=0.01167682558298111, train_loss=19.391820907592773, time_cost=1.845777988433838
+
Steps: 0%| | 4423/1000000 [11:18:14<2011:01:22, 7.27s/it, lr=1e-5, step_loss=0.0117]
Steps: 0%| | 4424/1000000 [11:18:21<1998:05:53, 7.23s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [4424], local_loss=0.06696832925081253, train_loss=0.057912006974220276, time_cost=1.4507098197937012
+
Steps: 0%| | 4424/1000000 [11:18:21<1998:05:53, 7.23s/it, lr=1e-5, step_loss=0.067]
Steps: 0%| | 4425/1000000 [11:18:27<1905:56:12, 6.89s/it, lr=1e-5, step_loss=0.067][RANK-0]: Step: [4425], local_loss=0.027023641392588615, train_loss=0.03993426263332367, time_cost=1.7517023086547852
+
Steps: 0%| | 4425/1000000 [11:18:27<1905:56:12, 6.89s/it, lr=1e-5, step_loss=0.027]
Steps: 0%| | 4426/1000000 [11:18:39<2281:52:44, 8.25s/it, lr=1e-5, step_loss=0.027][RANK-0]: Step: [4426], local_loss=0.011203753761947155, train_loss=0.05851581320166588, time_cost=8.01558232307434
+
Steps: 0%| | 4426/1000000 [11:18:39<2281:52:44, 8.25s/it, lr=1e-5, step_loss=0.0112]
Steps: 0%| | 4427/1000000 [11:18:52<2728:46:52, 9.87s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [4427], local_loss=0.01487966813147068, train_loss=0.02470884472131729, time_cost=5.025815725326538
+
Steps: 0%| | 4427/1000000 [11:18:52<2728:46:52, 9.87s/it, lr=1e-5, step_loss=0.0149]
Steps: 0%| | 4428/1000000 [11:19:05<3001:44:01, 10.85s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [4428], local_loss=0.01340504176914692, train_loss=0.022531021386384964, time_cost=3.9448630809783936
+
Steps: 0%| | 4428/1000000 [11:19:05<3001:44:01, 10.85s/it, lr=1e-5, step_loss=0.0134]
Steps: 0%| | 4429/1000000 [11:19:22<3478:01:58, 12.58s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [4429], local_loss=0.01342160813510418, train_loss=0.018414437770843506, time_cost=7.978387355804443
+
Steps: 0%| | 4429/1000000 [11:19:22<3478:01:58, 12.58s/it, lr=1e-5, step_loss=0.0134]
Steps: 0%| | 4430/1000000 [11:19:29<3022:44:41, 10.93s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [4430], local_loss=0.02740875817835331, train_loss=0.030582036823034286, time_cost=1.2749428749084473
+
Steps: 0%| | 4430/1000000 [11:19:29<3022:44:41, 10.93s/it, lr=1e-5, step_loss=0.0274]
Steps: 0%| | 4431/1000000 [11:19:36<2680:52:45, 9.69s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [4431], local_loss=0.026249807327985764, train_loss=0.034749530255794525, time_cost=2.4441943168640137
+
Steps: 0%| | 4431/1000000 [11:19:36<2680:52:45, 9.69s/it, lr=1e-5, step_loss=0.0262]
Steps: 0%| | 4432/1000000 [11:19:49<2917:29:08, 10.55s/it, lr=1e-5, step_loss=0.0262][RANK-0]: Step: [4432], local_loss=0.008969016373157501, train_loss=0.032078661024570465, time_cost=3.137948513031006
+
Steps: 0%| | 4432/1000000 [11:19:49<2917:29:08, 10.55s/it, lr=1e-5, step_loss=0.00897]
Steps: 0%| | 4433/1000000 [11:19:56<2646:17:31, 9.57s/it, lr=1e-5, step_loss=0.00897][RANK-0]: Step: [4433], local_loss=0.03344424068927765, train_loss=0.048459991812705994, time_cost=2.114039897918701
+
Steps: 0%| | 4433/1000000 [11:19:56<2646:17:31, 9.57s/it, lr=1e-5, step_loss=0.0334]
Steps: 0%| | 4434/1000000 [11:20:11<3121:22:41, 11.29s/it, lr=1e-5, step_loss=0.0334][RANK-0]: Step: [4434], local_loss=0.013070957735180855, train_loss=0.037223465740680695, time_cost=6.245129346847534
+
Steps: 0%| | 4434/1000000 [11:20:11<3121:22:41, 11.29s/it, lr=1e-5, step_loss=0.0131]
Steps: 0%| | 4435/1000000 [11:20:16<2565:57:52, 9.28s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [4435], local_loss=0.01949566975235939, train_loss=12.352444648742676, time_cost=2.203117847442627
+
Steps: 0%| | 4435/1000000 [11:20:16<2565:57:52, 9.28s/it, lr=1e-5, step_loss=0.0195]
Steps: 0%| | 4436/1000000 [11:20:29<2867:51:03, 10.37s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [4436], local_loss=0.024636996909976006, train_loss=0.03125893697142601, time_cost=4.444169521331787
+
Steps: 0%| | 4436/1000000 [11:20:29<2867:51:03, 10.37s/it, lr=1e-5, step_loss=0.0246]
Steps: 0%| | 4437/1000000 [11:20:34<2418:15:36, 8.74s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [4437], local_loss=0.017522767186164856, train_loss=0.05214563384652138, time_cost=1.8371782302856445
+
Steps: 0%| | 4437/1000000 [11:20:34<2418:15:36, 8.74s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 4438/1000000 [11:20:38<2045:24:46, 7.40s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [4438], local_loss=0.012402337975800037, train_loss=0.041452325880527496, time_cost=1.2331211566925049
+
Steps: 0%| | 4438/1000000 [11:20:38<2045:24:46, 7.40s/it, lr=1e-5, step_loss=0.0124]
Steps: 0%| | 4439/1000000 [11:20:46<2123:26:54, 7.68s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [4439], local_loss=0.011812176555395126, train_loss=0.1149199828505516, time_cost=4.348477125167847
+
Steps: 0%| | 4439/1000000 [11:20:46<2123:26:54, 7.68s/it, lr=1e-5, step_loss=0.0118]
Steps: 0%| | 4440/1000000 [11:20:51<1930:31:08, 6.98s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [4440], local_loss=0.022858180105686188, train_loss=0.05213271826505661, time_cost=1.9280850887298584
+
Steps: 0%| | 4440/1000000 [11:20:51<1930:31:08, 6.98s/it, lr=1e-5, step_loss=0.0229]
Steps: 0%| | 4441/1000000 [11:21:02<2210:54:23, 7.99s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [4441], local_loss=0.13636580109596252, train_loss=0.052952107042074203, time_cost=3.9429829120635986
+
Steps: 0%| | 4441/1000000 [11:21:02<2210:54:23, 7.99s/it, lr=1e-5, step_loss=0.136]
Steps: 0%| | 4442/1000000 [11:21:14<2532:09:54, 9.16s/it, lr=1e-5, step_loss=0.136][RANK-0]: Step: [4442], local_loss=0.0148614551872015, train_loss=18.562210083007812, time_cost=1.2156288623809814
+
Steps: 0%| | 4442/1000000 [11:21:14<2532:09:54, 9.16s/it, lr=1e-5, step_loss=0.0149]
Steps: 0%| | 4443/1000000 [11:21:25<2713:47:10, 9.81s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [4443], local_loss=0.016777224838733673, train_loss=0.04497310891747475, time_cost=2.4867799282073975
+
Steps: 0%| | 4443/1000000 [11:21:25<2713:47:10, 9.81s/it, lr=1e-5, step_loss=0.0168]
Steps: 0%| | 4444/1000000 [11:21:37<2914:41:18, 10.54s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [4444], local_loss=0.01885215751826763, train_loss=0.03941739350557327, time_cost=1.2197105884552002
+
Steps: 0%| | 4444/1000000 [11:21:37<2914:41:18, 10.54s/it, lr=1e-5, step_loss=0.0189]
Steps: 0%| | 4445/1000000 [11:21:42<2463:15:39, 8.91s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [4445], local_loss=0.023029878735542297, train_loss=0.02943853661417961, time_cost=1.1964459419250488
+
Steps: 0%| | 4445/1000000 [11:21:42<2463:15:39, 8.91s/it, lr=1e-5, step_loss=0.023]
Steps: 0%| | 4446/1000000 [11:21:49<2256:24:25, 8.16s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [4446], local_loss=0.13842833042144775, train_loss=0.05073682963848114, time_cost=1.9714484214782715
+
Steps: 0%| | 4446/1000000 [11:21:49<2256:24:25, 8.16s/it, lr=1e-5, step_loss=0.138]
Steps: 0%| | 4447/1000000 [11:21:54<1988:30:34, 7.19s/it, lr=1e-5, step_loss=0.138][RANK-0]: Step: [4447], local_loss=0.057622943073511124, train_loss=0.07193534821271896, time_cost=1.8050951957702637
+
Steps: 0%| | 4447/1000000 [11:21:54<1988:30:34, 7.19s/it, lr=1e-5, step_loss=0.0576]
Steps: 0%| | 4448/1000000 [11:21:59<1817:53:06, 6.57s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [4448], local_loss=0.05274457111954689, train_loss=0.03469587862491608, time_cost=2.0553622245788574
+
Steps: 0%| | 4448/1000000 [11:21:59<1817:53:06, 6.57s/it, lr=1e-5, step_loss=0.0527]
Steps: 0%| | 4449/1000000 [11:22:05<1808:16:30, 6.54s/it, lr=1e-5, step_loss=0.0527][RANK-0]: Step: [4449], local_loss=0.017294205725193024, train_loss=0.05582103133201599, time_cost=1.3099238872528076
+
Steps: 0%| | 4449/1000000 [11:22:05<1808:16:30, 6.54s/it, lr=1e-5, step_loss=0.0173]
Steps: 0%| | 4450/1000000 [11:22:11<1747:27:25, 6.32s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [4450], local_loss=0.038599878549575806, train_loss=0.02510758675634861, time_cost=1.9102797508239746
+
Steps: 0%| | 4450/1000000 [11:22:11<1747:27:25, 6.32s/it, lr=1e-5, step_loss=0.0386]
Steps: 0%| | 4451/1000000 [11:22:19<1880:07:43, 6.80s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [4451], local_loss=0.01501007005572319, train_loss=0.03461836650967598, time_cost=3.4536221027374268
+
Steps: 0%| | 4451/1000000 [11:22:19<1880:07:43, 6.80s/it, lr=1e-5, step_loss=0.015]
Steps: 0%| | 4452/1000000 [11:22:29<2164:03:46, 7.83s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [4452], local_loss=0.015519071370363235, train_loss=0.043858759105205536, time_cost=1.2577338218688965
+
Steps: 0%| | 4452/1000000 [11:22:29<2164:03:46, 7.83s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 4453/1000000 [11:22:47<2964:15:12, 10.72s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4453], local_loss=0.09682652354240417, train_loss=0.06323011219501495, time_cost=8.47670865058899
+
Steps: 0%| | 4453/1000000 [11:22:47<2964:15:12, 10.72s/it, lr=1e-5, step_loss=0.0968]
Steps: 0%| | 4454/1000000 [11:22:56<2866:16:43, 10.36s/it, lr=1e-5, step_loss=0.0968][RANK-0]: Step: [4454], local_loss=0.014210926368832588, train_loss=0.04120120406150818, time_cost=3.4711999893188477
+
Steps: 0%| | 4454/1000000 [11:22:56<2866:16:43, 10.36s/it, lr=1e-5, step_loss=0.0142]
Steps: 0%| | 4455/1000000 [11:23:10<3129:43:26, 11.32s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [4455], local_loss=0.01879148930311203, train_loss=0.031283266842365265, time_cost=3.744037628173828
+
Steps: 0%| | 4455/1000000 [11:23:10<3129:43:26, 11.32s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 4456/1000000 [11:23:23<3299:51:27, 11.93s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [4456], local_loss=0.01288523804396391, train_loss=0.020057447254657745, time_cost=5.122589588165283
+
Steps: 0%| | 4456/1000000 [11:23:23<3299:51:27, 11.93s/it, lr=1e-5, step_loss=0.0129]
Steps: 0%| | 4457/1000000 [11:23:29<2771:38:45, 10.02s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [4457], local_loss=0.021562108770012856, train_loss=0.018592894077301025, time_cost=1.2289037704467773
+
Steps: 0%| | 4457/1000000 [11:23:29<2771:38:45, 10.02s/it, lr=1e-5, step_loss=0.0216]
Steps: 0%| | 4458/1000000 [11:23:33<2304:03:48, 8.33s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [4458], local_loss=0.043393876403570175, train_loss=0.38893425464630127, time_cost=1.8038291931152344
+
Steps: 0%| | 4458/1000000 [11:23:33<2304:03:48, 8.33s/it, lr=1e-5, step_loss=0.0434]
Steps: 0%| | 4459/1000000 [11:23:46<2674:29:54, 9.67s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [4459], local_loss=0.08404842019081116, train_loss=0.09822804480791092, time_cost=3.2244293689727783
+
Steps: 0%| | 4459/1000000 [11:23:46<2674:29:54, 9.67s/it, lr=1e-5, step_loss=0.084]
Steps: 0%| | 4460/1000000 [11:24:01<3102:27:56, 11.22s/it, lr=1e-5, step_loss=0.084][RANK-0]: Step: [4460], local_loss=0.049529194831848145, train_loss=0.031426846981048584, time_cost=5.303257703781128
+
Steps: 0%| | 4460/1000000 [11:24:01<3102:27:56, 11.22s/it, lr=1e-5, step_loss=0.0495]
Steps: 0%| | 4461/1000000 [11:24:06<2646:03:08, 9.57s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [4461], local_loss=0.012576015666127205, train_loss=0.05246672406792641, time_cost=3.312380313873291
+
Steps: 0%| | 4461/1000000 [11:24:06<2646:03:08, 9.57s/it, lr=1e-5, step_loss=0.0126]
Steps: 0%| | 4462/1000000 [11:24:18<2806:28:09, 10.15s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [4462], local_loss=0.033619578927755356, train_loss=0.03989540785551071, time_cost=1.5396151542663574
+
Steps: 0%| | 4462/1000000 [11:24:18<2806:28:09, 10.15s/it, lr=1e-5, step_loss=0.0336]
Steps: 0%| | 4463/1000000 [11:24:24<2424:57:19, 8.77s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [4463], local_loss=0.039083581417798996, train_loss=50.222877502441406, time_cost=1.4443018436431885
+
Steps: 0%| | 4463/1000000 [11:24:24<2424:57:19, 8.77s/it, lr=1e-5, step_loss=0.0391]
Steps: 0%| | 4464/1000000 [11:24:29<2142:30:22, 7.75s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [4464], local_loss=0.015378524549305439, train_loss=0.04593870788812637, time_cost=1.23573637008667
+
Steps: 0%| | 4464/1000000 [11:24:29<2142:30:22, 7.75s/it, lr=1e-5, step_loss=0.0154]
Steps: 0%| | 4465/1000000 [11:24:39<2315:19:09, 8.37s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [4465], local_loss=0.4635290503501892, train_loss=0.10876856744289398, time_cost=3.2340707778930664
+
Steps: 0%| | 4465/1000000 [11:24:39<2315:19:09, 8.37s/it, lr=1e-5, step_loss=0.464]
Steps: 0%| | 4466/1000000 [11:24:44<2068:56:57, 7.48s/it, lr=1e-5, step_loss=0.464][RANK-0]: Step: [4466], local_loss=0.07066905498504639, train_loss=0.04451794549822807, time_cost=4.350700855255127
+
Steps: 0%| | 4466/1000000 [11:24:44<2068:56:57, 7.48s/it, lr=1e-5, step_loss=0.0707]
Steps: 0%| | 4467/1000000 [11:24:53<2146:37:54, 7.76s/it, lr=1e-5, step_loss=0.0707][RANK-0]: Step: [4467], local_loss=0.016658784821629524, train_loss=0.03813420608639717, time_cost=1.2145171165466309
+
Steps: 0%| | 4467/1000000 [11:24:53<2146:37:54, 7.76s/it, lr=1e-5, step_loss=0.0167]
Steps: 0%| | 4468/1000000 [11:24:58<1933:07:54, 6.99s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [4468], local_loss=0.21177023649215698, train_loss=0.05493420362472534, time_cost=1.1998322010040283
+
Steps: 0%| | 4468/1000000 [11:24:58<1933:07:54, 6.99s/it, lr=1e-5, step_loss=0.212]
Steps: 0%| | 4469/1000000 [11:25:05<1971:01:31, 7.13s/it, lr=1e-5, step_loss=0.212][RANK-0]: Step: [4469], local_loss=0.015554197132587433, train_loss=0.07379563897848129, time_cost=2.024817705154419
+
Steps: 0%| | 4469/1000000 [11:25:05<1971:01:31, 7.13s/it, lr=1e-5, step_loss=0.0156]
Steps: 0%| | 4470/1000000 [11:25:13<2017:35:16, 7.30s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [4470], local_loss=0.06411220133304596, train_loss=0.08238772302865982, time_cost=1.5458481311798096
+
Steps: 0%| | 4470/1000000 [11:25:13<2017:35:16, 7.30s/it, lr=1e-5, step_loss=0.0641]
Steps: 0%| | 4471/1000000 [11:25:31<2950:19:13, 10.67s/it, lr=1e-5, step_loss=0.0641][RANK-0]: Step: [4471], local_loss=0.04287201538681984, train_loss=0.059918805956840515, time_cost=10.31512975692749
+
Steps: 0%| | 4471/1000000 [11:25:31<2950:19:13, 10.67s/it, lr=1e-5, step_loss=0.0429]
Steps: 0%| | 4472/1000000 [11:25:37<2503:17:58, 9.05s/it, lr=1e-5, step_loss=0.0429][RANK-0]: Step: [4472], local_loss=0.01236039586365223, train_loss=0.05882918834686279, time_cost=1.1980323791503906
+
Steps: 0%| | 4472/1000000 [11:25:37<2503:17:58, 9.05s/it, lr=1e-5, step_loss=0.0124]
Steps: 0%| | 4473/1000000 [11:25:42<2216:49:48, 8.02s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [4473], local_loss=0.04535113647580147, train_loss=0.027945391833782196, time_cost=2.9461588859558105
+
Steps: 0%| | 4473/1000000 [11:25:42<2216:49:48, 8.02s/it, lr=1e-5, step_loss=0.0454]
Steps: 0%| | 4474/1000000 [11:25:50<2217:47:21, 8.02s/it, lr=1e-5, step_loss=0.0454][RANK-0]: Step: [4474], local_loss=0.03163183853030205, train_loss=0.03655872121453285, time_cost=1.9456703662872314
+
Steps: 0%| | 4474/1000000 [11:25:50<2217:47:21, 8.02s/it, lr=1e-5, step_loss=0.0316]
Steps: 0%| | 4475/1000000 [11:25:56<2047:31:19, 7.40s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [4475], local_loss=0.024802276864647865, train_loss=0.10283639281988144, time_cost=1.2433397769927979
+
Steps: 0%| | 4475/1000000 [11:25:56<2047:31:19, 7.40s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 4476/1000000 [11:26:04<2089:37:37, 7.56s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [4476], local_loss=0.04810710996389389, train_loss=0.033894043415784836, time_cost=2.886232376098633
+
Steps: 0%| | 4476/1000000 [11:26:04<2089:37:37, 7.56s/it, lr=1e-5, step_loss=0.0481]
Steps: 0%| | 4477/1000000 [11:26:15<2355:01:02, 8.52s/it, lr=1e-5, step_loss=0.0481][RANK-0]: Step: [4477], local_loss=0.07410655915737152, train_loss=0.055962175130844116, time_cost=1.9720849990844727
+
Steps: 0%| | 4477/1000000 [11:26:15<2355:01:02, 8.52s/it, lr=1e-5, step_loss=0.0741]
Steps: 0%| | 4478/1000000 [11:26:26<2568:34:28, 9.29s/it, lr=1e-5, step_loss=0.0741][RANK-0]: Step: [4478], local_loss=0.013684280216693878, train_loss=0.022412722930312157, time_cost=1.904195785522461
+
Steps: 0%| | 4478/1000000 [11:26:26<2568:34:28, 9.29s/it, lr=1e-5, step_loss=0.0137]
Steps: 0%| | 4479/1000000 [11:26:38<2790:27:09, 10.09s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [4479], local_loss=0.0119347358122468, train_loss=0.026951340958476067, time_cost=3.8470096588134766
+
Steps: 0%| | 4479/1000000 [11:26:38<2790:27:09, 10.09s/it, lr=1e-5, step_loss=0.0119]
Steps: 0%| | 4480/1000000 [11:26:52<3122:29:25, 11.29s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [4480], local_loss=0.017463479191064835, train_loss=0.043282490223646164, time_cost=5.4507317543029785
+
Steps: 0%| | 4480/1000000 [11:26:52<3122:29:25, 11.29s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 4481/1000000 [11:26:59<2776:41:40, 10.04s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [4481], local_loss=0.012268966063857079, train_loss=0.040639180690050125, time_cost=2.5053412914276123
+
Steps: 0%| | 4481/1000000 [11:26:59<2776:41:40, 10.04s/it, lr=1e-5, step_loss=0.0123]
Steps: 0%| | 4482/1000000 [11:27:11<2940:38:50, 10.63s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [4482], local_loss=0.015091242268681526, train_loss=0.030935006216168404, time_cost=5.438178300857544
+
Steps: 0%| | 4482/1000000 [11:27:11<2940:38:50, 10.63s/it, lr=1e-5, step_loss=0.0151]
Steps: 0%| | 4483/1000000 [11:27:16<2445:54:22, 8.84s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [4483], local_loss=0.011947638355195522, train_loss=0.043835967779159546, time_cost=3.318871021270752
+
Steps: 0%| | 4483/1000000 [11:27:16<2445:54:22, 8.84s/it, lr=1e-5, step_loss=0.0119]
Steps: 0%| | 4484/1000000 [11:27:22<2194:05:35, 7.93s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [4484], local_loss=0.04609118402004242, train_loss=26.3174991607666, time_cost=2.112067937850952
+
Steps: 0%| | 4484/1000000 [11:27:22<2194:05:35, 7.93s/it, lr=1e-5, step_loss=0.0461]
Steps: 0%| | 4485/1000000 [11:27:37<2834:35:53, 10.25s/it, lr=1e-5, step_loss=0.0461][RANK-0]: Step: [4485], local_loss=0.04732026904821396, train_loss=0.02613060548901558, time_cost=6.021559238433838
+
Steps: 0%| | 4485/1000000 [11:27:37<2834:35:53, 10.25s/it, lr=1e-5, step_loss=0.0473]
Steps: 0%| | 4486/1000000 [11:27:52<3162:25:53, 11.44s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [4486], local_loss=0.026661299169063568, train_loss=0.15072110295295715, time_cost=4.43587327003479
+
Steps: 0%| | 4486/1000000 [11:27:52<3162:25:53, 11.44s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 4487/1000000 [11:27:59<2825:44:34, 10.22s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [4487], local_loss=0.02001194655895233, train_loss=0.062372274696826935, time_cost=1.3639705181121826
+
Steps: 0%| | 4487/1000000 [11:27:59<2825:44:34, 10.22s/it, lr=1e-5, step_loss=0.02]
Steps: 0%| | 4488/1000000 [11:28:04<2424:22:03, 8.77s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [4488], local_loss=0.03291313722729683, train_loss=0.14683428406715393, time_cost=1.4275929927825928
+
Steps: 0%| | 4488/1000000 [11:28:04<2424:22:03, 8.77s/it, lr=1e-5, step_loss=0.0329]
Steps: 0%| | 4489/1000000 [11:28:23<3276:19:34, 11.85s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [4489], local_loss=0.027577608823776245, train_loss=0.07436657696962357, time_cost=1.2125296592712402
+
Steps: 0%| | 4489/1000000 [11:28:23<3276:19:34, 11.85s/it, lr=1e-5, step_loss=0.0276]
Steps: 0%| | 4490/1000000 [11:28:32<2968:53:18, 10.74s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [4490], local_loss=0.012739663943648338, train_loss=0.06193657964468002, time_cost=4.012319803237915
+
Steps: 0%| | 4490/1000000 [11:28:32<2968:53:18, 10.74s/it, lr=1e-5, step_loss=0.0127]
Steps: 0%| | 4491/1000000 [11:28:39<2709:35:52, 9.80s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [4491], local_loss=0.03225281462073326, train_loss=0.05837714299559593, time_cost=1.1995899677276611
+
Steps: 0%| | 4491/1000000 [11:28:39<2709:35:52, 9.80s/it, lr=1e-5, step_loss=0.0323]
Steps: 0%| | 4492/1000000 [11:28:48<2606:48:20, 9.43s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [4492], local_loss=0.017727434635162354, train_loss=0.048988305032253265, time_cost=4.412319898605347
+
Steps: 0%| | 4492/1000000 [11:28:48<2606:48:20, 9.43s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 4493/1000000 [11:29:01<2964:55:21, 10.72s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [4493], local_loss=0.10021623224020004, train_loss=0.14454065263271332, time_cost=4.251271724700928
+
Steps: 0%| | 4493/1000000 [11:29:01<2964:55:21, 10.72s/it, lr=1e-5, step_loss=0.1]
Steps: 0%| | 4494/1000000 [11:29:16<3292:36:46, 11.91s/it, lr=1e-5, step_loss=0.1][RANK-0]: Step: [4494], local_loss=0.006843421142548323, train_loss=0.021697744727134705, time_cost=4.478944540023804
+
Steps: 0%| | 4494/1000000 [11:29:16<3292:36:46, 11.91s/it, lr=1e-5, step_loss=0.00684]
Steps: 0%| | 4495/1000000 [11:29:22<2753:51:35, 9.96s/it, lr=1e-5, step_loss=0.00684][RANK-0]: Step: [4495], local_loss=0.01297004148364067, train_loss=0.03056340292096138, time_cost=1.25111722946167
+
Steps: 0%| | 4495/1000000 [11:29:22<2753:51:35, 9.96s/it, lr=1e-5, step_loss=0.013]
Steps: 0%| | 4496/1000000 [11:29:31<2727:04:18, 9.86s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [4496], local_loss=0.023022493347525597, train_loss=0.056142035871744156, time_cost=2.99615216255188
+
Steps: 0%| | 4496/1000000 [11:29:31<2727:04:18, 9.86s/it, lr=1e-5, step_loss=0.023]
Steps: 0%| | 4497/1000000 [11:29:44<3015:51:04, 10.91s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [4497], local_loss=0.017699897289276123, train_loss=0.04617428407073021, time_cost=5.602066993713379
+
Steps: 0%| | 4497/1000000 [11:29:44<3015:51:04, 10.91s/it, lr=1e-5, step_loss=0.0177]
Steps: 0%| | 4498/1000000 [11:29:52<2704:28:00, 9.78s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [4498], local_loss=0.01845237798988819, train_loss=0.0551881417632103, time_cost=1.2072479724884033
+
Steps: 0%| | 4498/1000000 [11:29:52<2704:28:00, 9.78s/it, lr=1e-5, step_loss=0.0185]
Steps: 0%| | 4499/1000000 [11:29:58<2403:31:54, 8.69s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [4499], local_loss=0.01596047729253769, train_loss=0.0319456085562706, time_cost=1.9170472621917725
+
Steps: 0%| | 4499/1000000 [11:29:58<2403:31:54, 8.69s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 4500/1000000 [11:30:07<2472:14:40, 8.94s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4500], local_loss=0.18159866333007812, train_loss=0.1878957599401474, time_cost=7.185954809188843
+
Steps: 0%| | 4500/1000000 [11:30:07<2472:14:40, 8.94s/it, lr=1e-5, step_loss=0.182]
Steps: 0%| | 4501/1000000 [11:30:15<2405:05:23, 8.70s/it, lr=1e-5, step_loss=0.182][RANK-0]: Step: [4501], local_loss=0.04510236158967018, train_loss=5.478147983551025, time_cost=3.9732766151428223
+
Steps: 0%| | 4501/1000000 [11:30:15<2405:05:23, 8.70s/it, lr=1e-5, step_loss=0.0451]
Steps: 0%| | 4502/1000000 [11:30:23<2305:47:34, 8.34s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [4502], local_loss=0.0901828333735466, train_loss=0.04340188577771187, time_cost=2.8606367111206055
+
Steps: 0%| | 4502/1000000 [11:30:23<2305:47:34, 8.34s/it, lr=1e-5, step_loss=0.0902]
Steps: 0%| | 4503/1000000 [11:30:31<2272:49:07, 8.22s/it, lr=1e-5, step_loss=0.0902][RANK-0]: Step: [4503], local_loss=0.02804127335548401, train_loss=0.1395629644393921, time_cost=3.2162208557128906
+
Steps: 0%| | 4503/1000000 [11:30:31<2272:49:07, 8.22s/it, lr=1e-5, step_loss=0.028]
Steps: 0%| | 4504/1000000 [11:30:46<2837:11:47, 10.26s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [4504], local_loss=0.01809382066130638, train_loss=0.03799714148044586, time_cost=7.1654746532440186
+
Steps: 0%| | 4504/1000000 [11:30:46<2837:11:47, 10.26s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 4505/1000000 [11:30:51<2401:18:05, 8.68s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [4505], local_loss=0.03235666826367378, train_loss=0.033240482211112976, time_cost=1.9178080558776855
+
Steps: 0%| | 4505/1000000 [11:30:51<2401:18:05, 8.68s/it, lr=1e-5, step_loss=0.0324]
Steps: 0%| | 4506/1000000 [11:31:04<2773:58:47, 10.03s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [4506], local_loss=0.03088044747710228, train_loss=0.060096725821495056, time_cost=2.307440996170044
+
Steps: 0%| | 4506/1000000 [11:31:04<2773:58:47, 10.03s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 4507/1000000 [11:31:16<2910:59:11, 10.53s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [4507], local_loss=0.056293681263923645, train_loss=0.19829408824443817, time_cost=3.6959176063537598
+
Steps: 0%| | 4507/1000000 [11:31:16<2910:59:11, 10.53s/it, lr=1e-5, step_loss=0.0563]
Steps: 0%| | 4508/1000000 [11:31:24<2709:49:05, 9.80s/it, lr=1e-5, step_loss=0.0563][RANK-0]: Step: [4508], local_loss=0.031223341822624207, train_loss=0.03421904891729355, time_cost=2.2767696380615234
+
Steps: 0%| | 4508/1000000 [11:31:24<2709:49:05, 9.80s/it, lr=1e-5, step_loss=0.0312]
Steps: 0%| | 4509/1000000 [11:31:36<2861:38:50, 10.35s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [4509], local_loss=0.02281316928565502, train_loss=0.07041209936141968, time_cost=1.6602225303649902
+
Steps: 0%| | 4509/1000000 [11:31:36<2861:38:50, 10.35s/it, lr=1e-5, step_loss=0.0228]
Steps: 0%| | 4510/1000000 [11:31:47<2941:25:38, 10.64s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [4510], local_loss=0.036878444254398346, train_loss=0.04068863391876221, time_cost=2.0231399536132812
+
Steps: 0%| | 4510/1000000 [11:31:47<2941:25:38, 10.64s/it, lr=1e-5, step_loss=0.0369]
Steps: 0%| | 4511/1000000 [11:32:01<3244:10:10, 11.73s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [4511], local_loss=0.016032438725233078, train_loss=0.05286647751927376, time_cost=9.050570964813232
+
Steps: 0%| | 4511/1000000 [11:32:01<3244:10:10, 11.73s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 4512/1000000 [11:32:08<2858:00:09, 10.34s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4512], local_loss=0.019875556230545044, train_loss=0.0333632193505764, time_cost=1.2682445049285889
+
Steps: 0%| | 4512/1000000 [11:32:08<2858:00:09, 10.34s/it, lr=1e-5, step_loss=0.0199]
Steps: 0%| | 4513/1000000 [11:32:19<2897:56:45, 10.48s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [4513], local_loss=0.043590378016233444, train_loss=0.04982662945985794, time_cost=4.931743383407593
+
Steps: 0%| | 4513/1000000 [11:32:19<2897:56:45, 10.48s/it, lr=1e-5, step_loss=0.0436]
Steps: 0%| | 4514/1000000 [11:32:29<2820:44:12, 10.20s/it, lr=1e-5, step_loss=0.0436][RANK-0]: Step: [4514], local_loss=0.010651389136910439, train_loss=0.04735235869884491, time_cost=5.063402891159058
+
Steps: 0%| | 4514/1000000 [11:32:29<2820:44:12, 10.20s/it, lr=1e-5, step_loss=0.0107]
Steps: 0%| | 4515/1000000 [11:32:39<2864:42:49, 10.36s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [4515], local_loss=0.03161736577749252, train_loss=0.07273492217063904, time_cost=4.865620851516724
+
Steps: 0%| | 4515/1000000 [11:32:39<2864:42:49, 10.36s/it, lr=1e-5, step_loss=0.0316]
Steps: 0%| | 4516/1000000 [11:32:47<2613:38:33, 9.45s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [4516], local_loss=0.027414198964834213, train_loss=0.06817293912172318, time_cost=5.429347038269043
+
Steps: 0%| | 4516/1000000 [11:32:47<2613:38:33, 9.45s/it, lr=1e-5, step_loss=0.0274]
Steps: 0%| | 4517/1000000 [11:32:51<2205:46:18, 7.98s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [4517], local_loss=0.05706562474370003, train_loss=0.06217444688081741, time_cost=3.197394371032715
+
Steps: 0%| | 4517/1000000 [11:32:51<2205:46:18, 7.98s/it, lr=1e-5, step_loss=0.0571]
Steps: 0%| | 4518/1000000 [11:33:03<2510:55:28, 9.08s/it, lr=1e-5, step_loss=0.0571][RANK-0]: Step: [4518], local_loss=0.10467180609703064, train_loss=0.036379821598529816, time_cost=2.4863061904907227
+
Steps: 0%| | 4518/1000000 [11:33:03<2510:55:28, 9.08s/it, lr=1e-5, step_loss=0.105]
Steps: 0%| | 4519/1000000 [11:33:19<3087:02:48, 11.16s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [4519], local_loss=0.03842254355549812, train_loss=0.0816056951880455, time_cost=1.222834587097168
+
Steps: 0%| | 4519/1000000 [11:33:19<3087:02:48, 11.16s/it, lr=1e-5, step_loss=0.0384]
Steps: 0%| | 4520/1000000 [11:33:24<2615:28:18, 9.46s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [4520], local_loss=0.015241468325257301, train_loss=0.039501212537288666, time_cost=1.2259297370910645
+
Steps: 0%| | 4520/1000000 [11:33:24<2615:28:18, 9.46s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 4521/1000000 [11:33:42<3329:09:26, 12.04s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [4521], local_loss=0.17680536210536957, train_loss=0.056334566324949265, time_cost=8.942996740341187
+
Steps: 0%| | 4521/1000000 [11:33:42<3329:09:26, 12.04s/it, lr=1e-5, step_loss=0.177]
Steps: 0%| | 4522/1000000 [11:33:53<3218:50:24, 11.64s/it, lr=1e-5, step_loss=0.177][RANK-0]: Step: [4522], local_loss=0.04039401188492775, train_loss=0.051470428705215454, time_cost=3.3240671157836914
+
Steps: 0%| | 4522/1000000 [11:33:53<3218:50:24, 11.64s/it, lr=1e-5, step_loss=0.0404]
Steps: 0%| | 4523/1000000 [11:34:07<3426:20:13, 12.39s/it, lr=1e-5, step_loss=0.0404][RANK-0]: Step: [4523], local_loss=0.02630653604865074, train_loss=0.03291458636522293, time_cost=10.38583755493164
+
Steps: 0%| | 4523/1000000 [11:34:07<3426:20:13, 12.39s/it, lr=1e-5, step_loss=0.0263]
Steps: 0%| | 4524/1000000 [11:34:18<3282:12:17, 11.87s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [4524], local_loss=0.0369277149438858, train_loss=0.057682447135448456, time_cost=5.022982597351074
+
Steps: 0%| | 4524/1000000 [11:34:18<3282:12:17, 11.87s/it, lr=1e-5, step_loss=0.0369]
Steps: 0%| | 4525/1000000 [11:34:28<3172:50:58, 11.47s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [4525], local_loss=0.1837412416934967, train_loss=0.06791296601295471, time_cost=2.321315050125122
+
Steps: 0%| | 4525/1000000 [11:34:28<3172:50:58, 11.47s/it, lr=1e-5, step_loss=0.184]
Steps: 0%| | 4526/1000000 [11:34:39<3129:32:39, 11.32s/it, lr=1e-5, step_loss=0.184][RANK-0]: Step: [4526], local_loss=0.01391455065459013, train_loss=0.02969105914235115, time_cost=1.2228901386260986
+
Steps: 0%| | 4526/1000000 [11:34:39<3129:32:39, 11.32s/it, lr=1e-5, step_loss=0.0139]
Steps: 0%| | 4527/1000000 [11:34:50<3048:33:51, 11.02s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [4527], local_loss=0.07814125716686249, train_loss=0.05200590938329697, time_cost=2.5655834674835205
+
Steps: 0%| | 4527/1000000 [11:34:50<3048:33:51, 11.02s/it, lr=1e-5, step_loss=0.0781]
Steps: 0%| | 4528/1000000 [11:34:55<2558:53:03, 9.25s/it, lr=1e-5, step_loss=0.0781][RANK-0]: Step: [4528], local_loss=0.029035519808530807, train_loss=0.07144845277070999, time_cost=3.839292526245117
+
Steps: 0%| | 4528/1000000 [11:34:55<2558:53:03, 9.25s/it, lr=1e-5, step_loss=0.029]
Steps: 0%| | 4529/1000000 [11:35:09<2954:46:04, 10.69s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [4529], local_loss=0.08477774262428284, train_loss=0.03950496017932892, time_cost=1.2148151397705078
+
Steps: 0%| | 4529/1000000 [11:35:09<2954:46:04, 10.69s/it, lr=1e-5, step_loss=0.0848]
Steps: 0%| | 4530/1000000 [11:35:20<2975:48:54, 10.76s/it, lr=1e-5, step_loss=0.0848][RANK-0]: Step: [4530], local_loss=0.01757303811609745, train_loss=0.03093086928129196, time_cost=3.369852066040039
+
Steps: 0%| | 4530/1000000 [11:35:20<2975:48:54, 10.76s/it, lr=1e-5, step_loss=0.0176]
Steps: 0%| | 4531/1000000 [11:35:27<2688:51:02, 9.72s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [4531], local_loss=0.017388317734003067, train_loss=0.046592097729444504, time_cost=2.8283743858337402
+
Steps: 0%| | 4531/1000000 [11:35:27<2688:51:02, 9.72s/it, lr=1e-5, step_loss=0.0174]
Steps: 0%| | 4532/1000000 [11:35:40<2919:54:40, 10.56s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [4532], local_loss=0.042707182466983795, train_loss=0.06376034766435623, time_cost=4.421661138534546
+
Steps: 0%| | 4532/1000000 [11:35:40<2919:54:40, 10.56s/it, lr=1e-5, step_loss=0.0427]
Steps: 0%| | 4533/1000000 [11:35:47<2646:22:27, 9.57s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [4533], local_loss=0.031057005748152733, train_loss=0.09634970128536224, time_cost=3.069448232650757
+
Steps: 0%| | 4533/1000000 [11:35:47<2646:22:27, 9.57s/it, lr=1e-5, step_loss=0.0311]
Steps: 0%| | 4534/1000000 [11:35:52<2297:04:19, 8.31s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [4534], local_loss=0.03937292471528053, train_loss=0.049780260771512985, time_cost=2.18281888961792
+
Steps: 0%| | 4534/1000000 [11:35:52<2297:04:19, 8.31s/it, lr=1e-5, step_loss=0.0394]
Steps: 0%| | 4535/1000000 [11:36:08<2932:46:54, 10.61s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [4535], local_loss=0.04440804198384285, train_loss=6.660189628601074, time_cost=12.218936920166016
+
Steps: 0%| | 4535/1000000 [11:36:08<2932:46:54, 10.61s/it, lr=1e-5, step_loss=0.0444]
Steps: 0%| | 4536/1000000 [11:36:24<3338:07:13, 12.07s/it, lr=1e-5, step_loss=0.0444][RANK-0]: Step: [4536], local_loss=0.1470104455947876, train_loss=4.146362781524658, time_cost=4.761761665344238
+
Steps: 0%| | 4536/1000000 [11:36:24<3338:07:13, 12.07s/it, lr=1e-5, step_loss=0.147]
Steps: 0%| | 4537/1000000 [11:36:32<3019:17:08, 10.92s/it, lr=1e-5, step_loss=0.147][RANK-0]: Step: [4537], local_loss=0.01796475239098072, train_loss=0.03703344613313675, time_cost=3.7621145248413086
Steps: 0%| | 4538/1000000 [11:36:38<2624:04:01, 9.49s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [4538], local_loss=0.013316033408045769, train_loss=0.1764327734708786, time_cost=3.0217742919921875
Steps: 0%| | 4539/1000000 [11:36:44<2329:52:06, 8.43s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [4539], local_loss=1.0536607503890991, train_loss=0.17585839331150055, time_cost=1.4440836906433105
Steps: 0%| | 4540/1000000 [11:36:56<2654:17:40, 9.60s/it, lr=1e-5, step_loss=1.05][RANK-0]: Step: [4540], local_loss=0.01159810833632946, train_loss=7.160244464874268, time_cost=1.6692631244659424
Steps: 0%| | 4541/1000000 [11:37:04<2475:30:39, 8.95s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [4541], local_loss=0.03381955623626709, train_loss=0.02836095727980137, time_cost=2.2487411499023438
Steps: 0%| | 4542/1000000 [11:37:10<2220:28:01, 8.03s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [4542], local_loss=0.01198164839297533, train_loss=0.05030961334705353, time_cost=1.5589897632598877
Steps: 0%| | 4543/1000000 [11:37:25<2811:41:15, 10.17s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [4543], local_loss=0.01472267135977745, train_loss=0.04357556253671646, time_cost=2.0155797004699707
Steps: 0%| | 4544/1000000 [11:37:30<2412:03:29, 8.72s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [4544], local_loss=0.00946732796728611, train_loss=0.025197558104991913, time_cost=2.70432448387146
Steps: 0%| | 4545/1000000 [11:37:42<2663:26:29, 9.63s/it, lr=1e-5, step_loss=0.00947][RANK-0]: Step: [4545], local_loss=0.03013903833925724, train_loss=0.1127638891339302, time_cost=1.2133498191833496
Steps: 0%| | 4546/1000000 [11:37:48<2337:20:02, 8.45s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [4546], local_loss=0.026920702308416367, train_loss=0.04604153707623482, time_cost=2.4210822582244873
Steps: 0%| | 4547/1000000 [11:37:56<2296:37:33, 8.31s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [4547], local_loss=0.01754201389849186, train_loss=6.740316867828369, time_cost=2.550581932067871
Steps: 0%| | 4548/1000000 [11:38:00<1969:13:49, 7.12s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [4548], local_loss=0.03840372711420059, train_loss=0.0515286922454834, time_cost=1.6248042583465576
Steps: 0%| | 4549/1000000 [11:38:05<1783:44:18, 6.45s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [4549], local_loss=0.07813432067632675, train_loss=0.1186031624674797, time_cost=1.332219123840332
Steps: 0%| | 4550/1000000 [11:38:10<1660:10:11, 6.00s/it, lr=1e-5, step_loss=0.0781][RANK-0]: Step: [4550], local_loss=0.048854392021894455, train_loss=0.04374261945486069, time_cost=2.443427562713623
Steps: 0%| | 4551/1000000 [11:38:16<1641:09:22, 5.94s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [4551], local_loss=0.018117859959602356, train_loss=0.0401766262948513, time_cost=2.712942361831665
Steps: 0%| | 4552/1000000 [11:38:30<2311:41:12, 8.36s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [4552], local_loss=0.06488519161939621, train_loss=0.09835502505302429, time_cost=3.1313982009887695
Steps: 0%| | 4553/1000000 [11:38:38<2318:58:00, 8.39s/it, lr=1e-5, step_loss=0.0649][RANK-0]: Step: [4553], local_loss=0.4741740822792053, train_loss=0.201162189245224, time_cost=1.2402386665344238
Steps: 0%| | 4554/1000000 [11:38:42<1979:40:23, 7.16s/it, lr=1e-5, step_loss=0.474][RANK-0]: Step: [4554], local_loss=0.01678813248872757, train_loss=0.048979438841342926, time_cost=1.206221103668213
Steps: 0%| | 4555/1000000 [11:38:58<2678:13:32, 9.69s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [4555], local_loss=0.014773963019251823, train_loss=0.04382623732089996, time_cost=1.2205662727355957
Steps: 0%| | 4556/1000000 [11:39:02<2245:57:21, 8.12s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [4556], local_loss=0.030713539570569992, train_loss=0.028127729892730713, time_cost=1.2934627532958984
Steps: 0%| | 4557/1000000 [11:39:12<2386:50:04, 8.63s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [4557], local_loss=0.020765025168657303, train_loss=0.03189904987812042, time_cost=3.194136619567871
Steps: 0%| | 4558/1000000 [11:39:26<2809:48:46, 10.16s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [4558], local_loss=0.04894500970840454, train_loss=0.02258223295211792, time_cost=10.783837795257568
Steps: 0%| | 4559/1000000 [11:39:32<2427:02:44, 8.78s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [4559], local_loss=0.06406330317258835, train_loss=0.04725736379623413, time_cost=3.9091107845306396
Steps: 0%| | 4560/1000000 [11:39:49<3109:47:03, 11.25s/it, lr=1e-5, step_loss=0.0641][RANK-0]: Step: [4560], local_loss=0.06087911128997803, train_loss=0.04945184290409088, time_cost=14.578771114349365
Steps: 0%| | 4561/1000000 [11:40:02<3274:26:58, 11.84s/it, lr=1e-5, step_loss=0.0609][RANK-0]: Step: [4561], local_loss=0.06257668137550354, train_loss=0.062117502093315125, time_cost=10.457741260528564
Steps: 0%| | 4562/1000000 [11:40:07<2764:19:32, 10.00s/it, lr=1e-5, step_loss=0.0626][RANK-0]: Step: [4562], local_loss=0.04935605078935623, train_loss=0.028142374008893967, time_cost=2.6101176738739014
Steps: 0%| | 4563/1000000 [11:40:13<2356:44:56, 8.52s/it, lr=1e-5, step_loss=0.0494][RANK-0]: Step: [4563], local_loss=0.03966875374317169, train_loss=0.039633747190237045, time_cost=1.5515398979187012
Steps: 0%| | 4564/1000000 [11:40:18<2097:22:18, 7.59s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [4564], local_loss=0.0646621435880661, train_loss=0.05013417452573776, time_cost=2.3069279193878174
Steps: 0%| | 4565/1000000 [11:40:32<2663:06:05, 9.63s/it, lr=1e-5, step_loss=0.0647][RANK-0]: Step: [4565], local_loss=0.018424957990646362, train_loss=0.031118109822273254, time_cost=4.714038133621216
Steps: 0%| | 4566/1000000 [11:40:44<2862:34:41, 10.35s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [4566], local_loss=0.044600434601306915, train_loss=0.07682067155838013, time_cost=4.326558589935303
Steps: 0%| | 4567/1000000 [11:40:55<2925:17:33, 10.58s/it, lr=1e-5, step_loss=0.0446][RANK-0]: Step: [4567], local_loss=0.058520179241895676, train_loss=0.14931070804595947, time_cost=2.4102842807769775
Steps: 0%| | 4568/1000000 [11:41:03<2648:50:38, 9.58s/it, lr=1e-5, step_loss=0.0585][RANK-0]: Step: [4568], local_loss=0.02575766295194626, train_loss=0.06283067166805267, time_cost=1.2436885833740234
Steps: 0%| | 4569/1000000 [11:41:08<2311:32:07, 8.36s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [4569], local_loss=0.011834428645670414, train_loss=0.03960253298282623, time_cost=2.9082233905792236
Steps: 0%| | 4570/1000000 [11:41:14<2111:38:05, 7.64s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [4570], local_loss=0.040746014565229416, train_loss=0.2804327607154846, time_cost=1.2185449600219727
Steps: 0%| | 4571/1000000 [11:41:22<2099:34:21, 7.59s/it, lr=1e-5, step_loss=0.0407][RANK-0]: Step: [4571], local_loss=0.015551269054412842, train_loss=0.03721233084797859, time_cost=1.9844672679901123
Steps: 0%| | 4572/1000000 [11:41:36<2680:28:26, 9.69s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [4572], local_loss=0.017027614638209343, train_loss=1.1586588621139526, time_cost=3.7755208015441895
Steps: 0%| | 4573/1000000 [11:41:47<2753:38:39, 9.96s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [4573], local_loss=0.03382877632975578, train_loss=0.14902134239673615, time_cost=4.200953006744385
Steps: 0%| | 4574/1000000 [11:41:53<2427:57:50, 8.78s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [4574], local_loss=0.031561221927404404, train_loss=0.0359414741396904, time_cost=1.6351354122161865
Steps: 0%| | 4575/1000000 [11:42:05<2711:11:56, 9.81s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [4575], local_loss=0.029138315469026566, train_loss=0.032633863389492035, time_cost=3.9045329093933105
Steps: 0%| | 4576/1000000 [11:42:11<2390:32:18, 8.65s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [4576], local_loss=118.59683990478516, train_loss=14.899423599243164, time_cost=2.513681173324585
Steps: 0%| | 4577/1000000 [11:42:19<2345:49:20, 8.48s/it, lr=1e-5, step_loss=119][RANK-0]: Step: [4577], local_loss=0.1788996309041977, train_loss=0.045644037425518036, time_cost=4.287474632263184
Steps: 0%| | 4578/1000000 [11:42:24<2023:34:54, 7.32s/it, lr=1e-5, step_loss=0.179][RANK-0]: Step: [4578], local_loss=0.034496840089559555, train_loss=0.03381216153502464, time_cost=1.6495089530944824
Steps: 0%| | 4579/1000000 [11:42:28<1804:16:44, 6.53s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [4579], local_loss=0.013086538761854172, train_loss=0.05980147421360016, time_cost=1.519275426864624
Steps: 0%| | 4580/1000000 [11:42:38<2048:47:03, 7.41s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [4580], local_loss=0.02941446378827095, train_loss=0.03389059007167816, time_cost=3.7994534969329834
Steps: 0%| | 4581/1000000 [11:42:53<2714:42:30, 9.82s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [4581], local_loss=0.038697972893714905, train_loss=0.06826659291982651, time_cost=1.2196457386016846
Steps: 0%| | 4582/1000000 [11:42:59<2348:12:26, 8.49s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [4582], local_loss=0.022137142717838287, train_loss=0.29295462369918823, time_cost=2.573963165283203
Steps: 0%| | 4583/1000000 [11:43:05<2169:28:53, 7.85s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [4583], local_loss=0.11226333677768707, train_loss=0.05832291394472122, time_cost=2.068857431411743
Steps: 0%| | 4584/1000000 [11:43:19<2678:59:43, 9.69s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [4584], local_loss=0.07251043617725372, train_loss=0.05292392149567604, time_cost=5.235891103744507
Steps: 0%| | 4585/1000000 [11:43:30<2809:19:22, 10.16s/it, lr=1e-5, step_loss=0.0725][RANK-0]: Step: [4585], local_loss=0.015536355786025524, train_loss=0.032378654927015305, time_cost=1.2207181453704834
Steps: 0%| | 4586/1000000 [11:43:45<3173:22:44, 11.48s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4586], local_loss=0.018261244520545006, train_loss=0.04552846401929855, time_cost=9.623795509338379
Steps: 0%| | 4587/1000000 [11:43:52<2836:37:09, 10.26s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [4587], local_loss=0.07752477377653122, train_loss=0.04914268106222153, time_cost=2.040130138397217
Steps: 0%| | 4588/1000000 [11:43:58<2479:56:15, 8.97s/it, lr=1e-5, step_loss=0.0775][RANK-0]: Step: [4588], local_loss=0.08455916494131088, train_loss=0.043077852576971054, time_cost=4.14489221572876
Steps: 0%| | 4589/1000000 [11:44:06<2375:51:48, 8.59s/it, lr=1e-5, step_loss=0.0846][RANK-0]: Step: [4589], local_loss=0.03423745557665825, train_loss=0.08901078999042511, time_cost=3.6878268718719482
Steps: 0%| | 4590/1000000 [11:44:13<2240:10:13, 8.10s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [4590], local_loss=0.31303998827934265, train_loss=0.06620293855667114, time_cost=1.9668149948120117
Steps: 0%| | 4591/1000000 [11:44:18<2024:11:30, 7.32s/it, lr=1e-5, step_loss=0.313][RANK-0]: Step: [4591], local_loss=0.021166719496250153, train_loss=0.036888450384140015, time_cost=1.217625379562378
Steps: 0%| | 4592/1000000 [11:44:25<1992:44:56, 7.21s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [4592], local_loss=0.024584922939538956, train_loss=0.0316004641354084, time_cost=2.348151922225952
Steps: 0%| | 4593/1000000 [11:44:40<2604:41:23, 9.42s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [4593], local_loss=0.026621120050549507, train_loss=0.07152160257101059, time_cost=1.22489333152771
Steps: 0%| | 4594/1000000 [11:44:55<3091:07:50, 11.18s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [4594], local_loss=0.06907033175230026, train_loss=0.04908451810479164, time_cost=6.49489426612854
Steps: 0%| | 4595/1000000 [11:45:06<3099:15:58, 11.21s/it, lr=1e-5, step_loss=0.0691][RANK-0]: Step: [4595], local_loss=0.03017422929406166, train_loss=0.16716411709785461, time_cost=3.131681203842163
Steps: 0%| | 4596/1000000 [11:45:12<2652:36:07, 9.59s/it, lr=1e-5, step_loss=0.0302][RANK-0]: Step: [4596], local_loss=0.06539333611726761, train_loss=0.032348815351724625, time_cost=1.239696741104126
Steps: 0%| | 4597/1000000 [11:45:20<2485:09:06, 8.99s/it, lr=1e-5, step_loss=0.0654][RANK-0]: Step: [4597], local_loss=0.9917876124382019, train_loss=25.41222381591797, time_cost=4.493508577346802
Steps: 0%| | 4598/1000000 [11:45:27<2332:19:53, 8.44s/it, lr=1e-5, step_loss=0.992][RANK-0]: Step: [4598], local_loss=0.018727025017142296, train_loss=0.03760681301355362, time_cost=2.4105989933013916
Steps: 0%| | 4599/1000000 [11:45:38<2515:17:55, 9.10s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [4599], local_loss=0.014238393865525723, train_loss=0.03576318919658661, time_cost=1.226302146911621
Steps: 0%| | 4600/1000000 [11:45:45<2341:50:00, 8.47s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [4600], local_loss=0.01701119728386402, train_loss=0.04332945868372917, time_cost=2.1127450466156006
Steps: 0%| | 4601/1000000 [11:45:53<2351:39:19, 8.51s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [4601], local_loss=0.05356884375214577, train_loss=0.16066540777683258, time_cost=1.245081901550293
Steps: 0%| | 4602/1000000 [11:46:01<2278:01:49, 8.24s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [4602], local_loss=0.028239289298653603, train_loss=0.14498499035835266, time_cost=2.7656214237213135
Steps: 0%| | 4603/1000000 [11:46:07<2089:39:40, 7.56s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [4603], local_loss=0.014306141063570976, train_loss=0.05596708506345749, time_cost=1.7524456977844238
Steps: 0%| | 4604/1000000 [11:46:14<2028:13:05, 7.34s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [4604], local_loss=0.09296774864196777, train_loss=0.12907589972019196, time_cost=1.715017557144165
Steps: 0%| | 4605/1000000 [11:46:21<2046:44:31, 7.40s/it, lr=1e-5, step_loss=0.093][RANK-0]: Step: [4605], local_loss=0.03257619962096214, train_loss=0.16127431392669678, time_cost=2.625701427459717
Steps: 0%| | 4606/1000000 [11:46:32<2311:00:32, 8.36s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [4606], local_loss=0.04667085036635399, train_loss=0.03521237149834633, time_cost=1.73356294631958
Steps: 0%| | 4607/1000000 [11:46:43<2545:22:32, 9.21s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [4607], local_loss=0.024452365934848785, train_loss=0.027637425810098648, time_cost=3.495612382888794
Steps: 0%| | 4608/1000000 [11:46:52<2552:29:37, 9.23s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [4608], local_loss=0.02304461970925331, train_loss=0.025516245514154434, time_cost=3.4907031059265137
Steps: 0%| | 4609/1000000 [11:47:01<2475:04:54, 8.95s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [4609], local_loss=0.013070419430732727, train_loss=0.05215122178196907, time_cost=4.246145248413086
Steps: 0%| | 4610/1000000 [11:47:18<3145:42:39, 11.38s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [4610], local_loss=0.08301942050457001, train_loss=0.05499889329075813, time_cost=8.64298152923584
Steps: 0%| | 4611/1000000 [11:47:23<2640:54:11, 9.55s/it, lr=1e-5, step_loss=0.083][RANK-0]: Step: [4611], local_loss=0.014832811430096626, train_loss=0.024012723937630653, time_cost=1.2185962200164795
Steps: 0%| | 4612/1000000 [11:47:34<2748:27:54, 9.94s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [4612], local_loss=0.04594423621892929, train_loss=0.04000188037753105, time_cost=2.2196688652038574
Steps: 0%| | 4613/1000000 [11:47:41<2483:46:31, 8.98s/it, lr=1e-5, step_loss=0.0459][RANK-0]: Step: [4613], local_loss=0.023303553462028503, train_loss=0.03063707984983921, time_cost=1.205280065536499
Steps: 0%| | 4614/1000000 [11:47:53<2778:47:38, 10.05s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [4614], local_loss=0.026632938534021378, train_loss=0.1652156114578247, time_cost=9.15938663482666
Steps: 0%| | 4615/1000000 [11:48:01<2614:05:11, 9.45s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [4615], local_loss=0.08281595259904861, train_loss=0.12368281930685043, time_cost=4.51805853843689
Steps: 0%| | 4616/1000000 [11:48:10<2592:10:42, 9.38s/it, lr=1e-5, step_loss=0.0828][RANK-0]: Step: [4616], local_loss=0.08485819399356842, train_loss=0.03462629392743111, time_cost=2.899296283721924
Steps: 0%| | 4617/1000000 [11:48:22<2789:10:40, 10.09s/it, lr=1e-5, step_loss=0.0849][RANK-0]: Step: [4617], local_loss=0.06058172509074211, train_loss=0.036280132830142975, time_cost=2.7794790267944336
Steps: 0%| | 4618/1000000 [11:48:27<2371:13:31, 8.58s/it, lr=1e-5, step_loss=0.0606][RANK-0]: Step: [4618], local_loss=1.00067138671875, train_loss=0.16351903975009918, time_cost=2.388627290725708
Steps: 0%| | 4619/1000000 [11:48:32<2036:31:28, 7.37s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [4619], local_loss=0.01683116890490055, train_loss=0.023164082318544388, time_cost=1.6215083599090576
Steps: 0%| | 4620/1000000 [11:48:39<2044:46:51, 7.40s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [4620], local_loss=0.08948203921318054, train_loss=0.06092655658721924, time_cost=3.208014965057373
Steps: 0%| | 4621/1000000 [11:48:50<2302:29:28, 8.33s/it, lr=1e-5, step_loss=0.0895][RANK-0]: Step: [4621], local_loss=0.03875273838639259, train_loss=0.03742482140660286, time_cost=7.496461868286133
Steps: 0%| | 4622/1000000 [11:48:57<2192:17:24, 7.93s/it, lr=1e-5, step_loss=0.0388][RANK-0]: Step: [4622], local_loss=0.015425251796841621, train_loss=0.07200036197900772, time_cost=2.751678943634033
Steps: 0%| | 4623/1000000 [11:49:11<2740:14:41, 9.91s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [4623], local_loss=0.02209177054464817, train_loss=0.06688649207353592, time_cost=5.037403106689453
Steps: 0%| | 4624/1000000 [11:49:18<2495:40:23, 9.03s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [4624], local_loss=0.0181666761636734, train_loss=0.03129817172884941, time_cost=2.2523481845855713
Steps: 0%| | 4625/1000000 [11:49:27<2496:46:37, 9.03s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [4625], local_loss=0.4378955066204071, train_loss=0.08999557793140411, time_cost=2.0224785804748535
Steps: 0%| | 4626/1000000 [11:49:36<2460:00:18, 8.90s/it, lr=1e-5, step_loss=0.438][RANK-0]: Step: [4626], local_loss=0.014227432198822498, train_loss=0.03722530975937843, time_cost=6.022247076034546
Steps: 0%| | 4627/1000000 [11:49:53<3152:29:18, 11.40s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [4627], local_loss=0.04052034020423889, train_loss=0.26184412837028503, time_cost=7.617178916931152
Steps: 0%| | 4628/1000000 [11:50:03<3018:33:31, 10.92s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [4628], local_loss=0.5326737761497498, train_loss=0.08619595319032669, time_cost=2.6490871906280518
Steps: 0%| | 4629/1000000 [11:50:07<2482:09:23, 8.98s/it, lr=1e-5, step_loss=0.533][RANK-0]: Step: [4629], local_loss=0.043778710067272186, train_loss=0.04454593360424042, time_cost=1.2478246688842773
Steps: 0%| | 4630/1000000 [11:50:17<2578:37:10, 9.33s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [4630], local_loss=0.00930592231452465, train_loss=0.053416796028614044, time_cost=1.2171003818511963
Steps: 0%| | 4631/1000000 [11:50:23<2232:22:46, 8.07s/it, lr=1e-5, step_loss=0.00931][RANK-0]: Step: [4631], local_loss=0.02776731364428997, train_loss=0.14841993153095245, time_cost=2.25533390045166
Steps: 0%| | 4632/1000000 [11:50:36<2710:09:09, 9.80s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [4632], local_loss=1.0070362091064453, train_loss=0.15850967168807983, time_cost=11.695404291152954
Steps: 0%| | 4633/1000000 [11:50:45<2592:39:58, 9.38s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [4633], local_loss=0.033802635967731476, train_loss=0.31337013840675354, time_cost=1.328695297241211
Steps: 0%| | 4634/1000000 [11:50:50<2222:37:13, 8.04s/it, lr=1e-5, step_loss=0.0338][RANK-0]: Step: [4634], local_loss=0.03735083341598511, train_loss=0.07165739685297012, time_cost=2.10364031791687
Steps: 0%| | 4635/1000000 [11:51:01<2503:53:40, 9.06s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [4635], local_loss=0.09745362401008606, train_loss=0.04254959151148796, time_cost=1.2356822490692139
Steps: 0%| | 4636/1000000 [11:51:17<3058:29:14, 11.06s/it, lr=1e-5, step_loss=0.0975][RANK-0]: Step: [4636], local_loss=0.01604219153523445, train_loss=0.02259976789355278, time_cost=5.459927082061768
Steps: 0%| | 4637/1000000 [11:51:30<3268:38:18, 11.82s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4637], local_loss=0.05616658180952072, train_loss=0.03913766145706177, time_cost=5.681222438812256
Steps: 0%| | 4638/1000000 [11:51:35<2687:59:51, 9.72s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [4638], local_loss=0.04100291430950165, train_loss=0.033438682556152344, time_cost=1.5122454166412354
Steps: 0%| | 4639/1000000 [11:51:46<2792:45:30, 10.10s/it, lr=1e-5, step_loss=0.041][RANK-0]: Step: [4639], local_loss=0.013293752446770668, train_loss=0.05981054902076721, time_cost=2.8163068294525146
Steps: 0%| | 4640/1000000 [11:51:51<2387:30:40, 8.64s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [4640], local_loss=0.12440060824155807, train_loss=0.10555677860975266, time_cost=2.0580506324768066
Steps: 0%| | 4641/1000000 [11:51:58<2244:09:09, 8.12s/it, lr=1e-5, step_loss=0.124][RANK-0]: Step: [4641], local_loss=0.023035982623696327, train_loss=26.391130447387695, time_cost=2.6094934940338135
Steps: 0%| | 4642/1000000 [11:52:06<2216:32:07, 8.02s/it, lr=1e-5, step_loss=0.023][RANK-0]: Step: [4642], local_loss=0.045149702578783035, train_loss=0.14107216894626617, time_cost=1.848238229751587
Steps: 0%| | 4643/1000000 [11:52:12<2005:36:16, 7.25s/it, lr=1e-5, step_loss=0.0451][RANK-0]: Step: [4643], local_loss=0.02985101193189621, train_loss=0.03067702427506447, time_cost=4.468541145324707
Steps: 0%| | 4644/1000000 [11:52:18<1905:19:57, 6.89s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [4644], local_loss=0.014895182102918625, train_loss=0.07069031894207001, time_cost=3.2094857692718506
Steps: 0%| | 4645/1000000 [11:52:28<2211:00:13, 8.00s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [4645], local_loss=0.0197954922914505, train_loss=0.035993896424770355, time_cost=1.6345808506011963
Steps: 0%| | 4646/1000000 [11:52:37<2281:13:25, 8.25s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [4646], local_loss=0.03316620737314224, train_loss=0.029749421402812004, time_cost=2.433349847793579
Steps: 0%| | 4647/1000000 [11:52:45<2232:39:38, 8.08s/it, lr=1e-5, step_loss=0.0332][RANK-0]: Step: [4647], local_loss=0.034933168441057205, train_loss=0.04451692849397659, time_cost=3.5452821254730225
Steps: 0%| | 4648/1000000 [11:52:58<2701:44:18, 9.77s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [4648], local_loss=0.07485616207122803, train_loss=0.15414217114448547, time_cost=5.907505512237549
Steps: 0%| | 4649/1000000 [11:53:03<2259:59:40, 8.17s/it, lr=1e-5, step_loss=0.0749][RANK-0]: Step: [4649], local_loss=0.015521836467087269, train_loss=0.02294248342514038, time_cost=3.4686388969421387
Steps: 0%| | 4650/1000000 [11:53:13<2395:14:22, 8.66s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4650], local_loss=0.03569339960813522, train_loss=0.03706520050764084, time_cost=3.828700542449951
Steps: 0%| | 4651/1000000 [11:53:17<2035:43:38, 7.36s/it, lr=1e-5, step_loss=0.0357][RANK-0]: Step: [4651], local_loss=0.027189092710614204, train_loss=0.11971810460090637, time_cost=1.5649302005767822
Steps: 0%| | 4652/1000000 [11:53:24<1981:45:37, 7.17s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [4652], local_loss=0.05327906832098961, train_loss=0.03817657381296158, time_cost=2.3481929302215576
Steps: 0%| | 4653/1000000 [11:53:29<1814:58:29, 6.56s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [4653], local_loss=0.05073076859116554, train_loss=0.05805256590247154, time_cost=2.1591954231262207
Steps: 0%| | 4654/1000000 [11:53:37<1945:50:34, 7.04s/it, lr=1e-5, step_loss=0.0507][RANK-0]: Step: [4654], local_loss=0.017923589795827866, train_loss=0.037783462554216385, time_cost=3.007058620452881
Steps: 0%| | 4655/1000000 [11:53:42<1776:45:21, 6.43s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [4655], local_loss=0.028832565993070602, train_loss=0.04408068209886551, time_cost=1.7864835262298584
Steps: 0%| | 4656/1000000 [11:53:50<1891:39:46, 6.84s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [4656], local_loss=0.13872727751731873, train_loss=0.07118207961320877, time_cost=1.6886515617370605
Steps: 0%| | 4657/1000000 [11:53:54<1665:26:01, 6.02s/it, lr=1e-5, step_loss=0.139][RANK-0]: Step: [4657], local_loss=0.009575454518198967, train_loss=11.66260051727295, time_cost=1.342292070388794
Steps: 0%| | 4658/1000000 [11:54:01<1744:52:45, 6.31s/it, lr=1e-5, step_loss=0.00958][RANK-0]: Step: [4658], local_loss=0.11207254976034164, train_loss=0.056374866515398026, time_cost=3.2243785858154297
Steps: 0%| | 4659/1000000 [11:54:09<1861:16:59, 6.73s/it, lr=1e-5, step_loss=0.112][RANK-0]: Step: [4659], local_loss=0.028388231992721558, train_loss=0.03176363557577133, time_cost=1.2320992946624756
Steps: 0%| | 4660/1000000 [11:54:26<2730:05:01, 9.87s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [4660], local_loss=0.05304940789937973, train_loss=0.045031432062387466, time_cost=9.52192497253418
Steps: 0%| | 4661/1000000 [11:54:34<2561:35:07, 9.26s/it, lr=1e-5, step_loss=0.053][RANK-0]: Step: [4661], local_loss=0.03585367277264595, train_loss=0.05900293588638306, time_cost=2.440922975540161
Steps: 0%| | 4662/1000000 [11:54:41<2376:23:11, 8.60s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [4662], local_loss=0.03840736672282219, train_loss=0.025230035185813904, time_cost=1.2306129932403564
Steps: 0%| | 4663/1000000 [11:54:45<2025:29:09, 7.33s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [4663], local_loss=0.017441656440496445, train_loss=17.31610679626465, time_cost=1.626840353012085
Steps: 0%| | 4664/1000000 [11:54:52<2029:37:02, 7.34s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [4664], local_loss=0.013695829547941685, train_loss=0.162559375166893, time_cost=3.754486083984375
Steps: 0%| | 4665/1000000 [11:55:06<2528:43:19, 9.15s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [4665], local_loss=0.030899155884981155, train_loss=0.03475748375058174, time_cost=4.615221261978149
Steps: 0%| | 4666/1000000 [11:55:15<2545:30:34, 9.21s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [4666], local_loss=0.020247558131814003, train_loss=0.14978662133216858, time_cost=3.853536605834961
Steps: 0%| | 4667/1000000 [11:55:20<2154:38:54, 7.79s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [4667], local_loss=0.04143084958195686, train_loss=0.08485253900289536, time_cost=1.9041781425476074
Steps: 0%| | 4668/1000000 [11:55:28<2190:56:46, 7.92s/it, lr=1e-5, step_loss=0.0414][RANK-0]: Step: [4668], local_loss=0.01342479232698679, train_loss=0.045289888978004456, time_cost=1.8448660373687744
Steps: 0%| | 4669/1000000 [11:55:32<1898:13:59, 6.87s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [4669], local_loss=0.10344509035348892, train_loss=0.06528006494045258, time_cost=1.434241771697998
Steps: 0%| | 4670/1000000 [11:55:42<2146:59:57, 7.77s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [4670], local_loss=0.030589649453759193, train_loss=0.11409609764814377, time_cost=4.692716360092163
Steps: 0%| | 4671/1000000 [11:55:55<2568:31:17, 9.29s/it, lr=1e-5, step_loss=0.0306][RANK-0]: Step: [4671], local_loss=0.03126250207424164, train_loss=0.09351184219121933, time_cost=3.6485435962677
Steps: 0%| | 4672/1000000 [11:55:59<2146:48:41, 7.76s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [4672], local_loss=0.03740151226520538, train_loss=0.032464634627103806, time_cost=1.2210354804992676
Steps: 0%| | 4673/1000000 [11:56:06<2067:34:54, 7.48s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [4673], local_loss=0.021569501608610153, train_loss=0.03621349856257439, time_cost=2.9740426540374756
Steps: 0%| | 4674/1000000 [11:56:12<1913:31:07, 6.92s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [4674], local_loss=0.01527524646371603, train_loss=0.020433221012353897, time_cost=1.2225840091705322
Steps: 0%| | 4675/1000000 [11:56:19<1943:55:23, 7.03s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [4675], local_loss=0.0599759966135025, train_loss=0.03519871458411217, time_cost=1.5389361381530762
Steps: 0%| | 4676/1000000 [11:56:25<1847:27:55, 6.68s/it, lr=1e-5, step_loss=0.06][RANK-0]: Step: [4676], local_loss=0.014483665116131306, train_loss=0.026731230318546295, time_cost=2.8449056148529053
Steps: 0%| | 4677/1000000 [11:56:29<1653:50:53, 5.98s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [4677], local_loss=0.26786303520202637, train_loss=0.06548381596803665, time_cost=1.7531845569610596
Steps: 0%| | 4678/1000000 [11:56:36<1688:53:59, 6.11s/it, lr=1e-5, step_loss=0.268][RANK-0]: Step: [4678], local_loss=0.028648726642131805, train_loss=0.06065419316291809, time_cost=4.875505208969116
Steps: 0%| | 4679/1000000 [11:56:42<1742:58:41, 6.30s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [4679], local_loss=0.025901295244693756, train_loss=0.043373845517635345, time_cost=3.5704352855682373
Steps: 0%| | 4680/1000000 [11:56:47<1578:33:20, 5.71s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [4680], local_loss=0.04937612637877464, train_loss=0.05835416913032532, time_cost=1.2788867950439453
Steps: 0%| | 4681/1000000 [11:56:57<1960:59:16, 7.09s/it, lr=1e-5, step_loss=0.0494][RANK-0]: Step: [4681], local_loss=0.013882854022085667, train_loss=0.04343026131391525, time_cost=1.4837000370025635
Steps: 0%| | 4682/1000000 [11:57:08<2326:50:04, 8.42s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [4682], local_loss=0.050552237778902054, train_loss=0.08431711792945862, time_cost=2.3465893268585205
Steps: 0%| | 4683/1000000 [11:57:20<2558:35:09, 9.25s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [4683], local_loss=0.030144810676574707, train_loss=0.03597768396139145, time_cost=1.566734790802002
Steps: 0%| | 4684/1000000 [11:57:30<2664:51:19, 9.64s/it, lr=1e-5, step_loss=0.0301][RANK-0]: Step: [4684], local_loss=0.015072423964738846, train_loss=0.03351256251335144, time_cost=1.5743701457977295
Steps: 0%| | 4685/1000000 [11:57:47<3228:03:45, 11.68s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [4685], local_loss=0.03988446667790413, train_loss=8.421660423278809, time_cost=7.698824644088745
Steps: 0%| | 4686/1000000 [11:57:56<3064:15:03, 11.08s/it, lr=1e-5, step_loss=0.0399][RANK-0]: Step: [4686], local_loss=0.02587064355611801, train_loss=0.06670874357223511, time_cost=2.599810838699341
Steps: 0%| | 4687/1000000 [11:58:01<2526:57:04, 9.14s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [4687], local_loss=0.013971644453704357, train_loss=0.05872568115592003, time_cost=1.2052266597747803
Steps: 0%| | 4688/1000000 [11:58:06<2224:28:56, 8.05s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [4688], local_loss=0.021626481786370277, train_loss=0.05487148463726044, time_cost=1.217674732208252
Steps: 0%| | 4689/1000000 [11:58:21<2740:43:56, 9.91s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [4689], local_loss=0.028096366673707962, train_loss=0.15091826021671295, time_cost=1.2426366806030273
Steps: 0%| | 4690/1000000 [11:58:34<3034:37:41, 10.98s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [4690], local_loss=0.018665501847863197, train_loss=0.020759955048561096, time_cost=5.8139050006866455
Steps: 0%| | 4691/1000000 [11:58:41<2726:26:04, 9.86s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [4691], local_loss=0.010654572397470474, train_loss=3.69538950920105, time_cost=2.4317877292633057
Steps: 0%| | 4692/1000000 [11:58:54<2924:16:11, 10.58s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [4692], local_loss=0.009085843339562416, train_loss=0.03341846168041229, time_cost=4.503533363342285
Steps: 0%| | 4693/1000000 [11:59:00<2551:26:01, 9.23s/it, lr=1e-5, step_loss=0.00909][RANK-0]: Step: [4693], local_loss=0.014257030561566353, train_loss=18.48917579650879, time_cost=4.350454092025757
Steps: 0%| | 4694/1000000 [11:59:07<2405:02:36, 8.70s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [4694], local_loss=0.10985654592514038, train_loss=0.052329130470752716, time_cost=3.5107650756835938
Steps: 0%| | 4695/1000000 [11:59:17<2481:33:10, 8.98s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [4695], local_loss=0.015015043318271637, train_loss=0.03621498495340347, time_cost=1.4042561054229736
Steps: 0%| | 4696/1000000 [11:59:27<2596:50:06, 9.39s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [4696], local_loss=0.022272393107414246, train_loss=0.04144719988107681, time_cost=5.101873159408569
Steps: 0%| | 4697/1000000 [11:59:37<2646:24:43, 9.57s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [4697], local_loss=0.08602778613567352, train_loss=0.04309352487325668, time_cost=2.336261510848999
Steps: 0%| | 4698/1000000 [11:59:53<3143:17:24, 11.37s/it, lr=1e-5, step_loss=0.086][RANK-0]: Step: [4698], local_loss=0.012462162412703037, train_loss=0.023817136883735657, time_cost=7.117839574813843
Steps: 0%| | 4699/1000000 [12:00:02<2960:56:43, 10.71s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [4699], local_loss=0.014195041730999947, train_loss=0.041387300938367844, time_cost=2.504223585128784
Steps: 0%| | 4700/1000000 [12:00:09<2686:33:12, 9.72s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [4700], local_loss=0.012822126038372517, train_loss=0.0366814061999321, time_cost=1.2661263942718506
Steps: 0%| | 4701/1000000 [12:00:18<2590:49:29, 9.37s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [4701], local_loss=0.026061320677399635, train_loss=0.057844266295433044, time_cost=2.128866672515869
Steps: 0%| | 4702/1000000 [12:00:26<2463:41:42, 8.91s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [4702], local_loss=0.01392519660294056, train_loss=0.02819814532995224, time_cost=1.5484578609466553
Steps: 0%| | 4703/1000000 [12:00:38<2764:30:42, 10.00s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [4703], local_loss=0.05909893289208412, train_loss=0.10425832122564316, time_cost=4.64061713218689
Steps: 0%| | 4704/1000000 [12:00:44<2378:07:56, 8.60s/it, lr=1e-5, step_loss=0.0591][RANK-0]: Step: [4704], local_loss=0.011435896158218384, train_loss=0.028646375983953476, time_cost=2.1878104209899902
Steps: 0%| | 4705/1000000 [12:00:53<2470:10:27, 8.93s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [4705], local_loss=0.02085934765636921, train_loss=0.04605182260274887, time_cost=7.133800506591797
Steps: 0%| | 4706/1000000 [12:00:59<2208:18:33, 7.99s/it, lr=1e-5, step_loss=0.0209][RANK-0]: Step: [4706], local_loss=0.04806433990597725, train_loss=0.06130208820104599, time_cost=3.050588846206665
Steps: 0%| | 4707/1000000 [12:01:11<2533:25:57, 9.16s/it, lr=1e-5, step_loss=0.0481][RANK-0]: Step: [4707], local_loss=0.03149537369608879, train_loss=0.0382763035595417, time_cost=3.9660086631774902
Steps: 0%| | 4708/1000000 [12:01:21<2571:45:28, 9.30s/it, lr=1e-5, step_loss=0.0315][RANK-0]: Step: [4708], local_loss=0.022705726325511932, train_loss=0.07829426229000092, time_cost=3.4265711307525635
Steps: 0%| | 4709/1000000 [12:01:35<2961:15:21, 10.71s/it, lr=1e-5, step_loss=0.0227][RANK-0]: Step: [4709], local_loss=0.053561627864837646, train_loss=0.10418142378330231, time_cost=11.102979898452759
Steps: 0%| | 4710/1000000 [12:01:43<2744:33:09, 9.93s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [4710], local_loss=0.11787713319063187, train_loss=0.03689708188176155, time_cost=5.414465427398682
Steps: 0%| | 4711/1000000 [12:01:52<2686:35:39, 9.72s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [4711], local_loss=0.06734731793403625, train_loss=0.07055072486400604, time_cost=1.2243719100952148
Steps: 0%| | 4712/1000000 [12:02:06<3054:20:47, 11.05s/it, lr=1e-5, step_loss=0.0673][RANK-0]: Step: [4712], local_loss=0.022331509739160538, train_loss=0.049410708248615265, time_cost=3.9209272861480713
Steps: 0%| | 4713/1000000 [12:02:12<2593:19:50, 9.38s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [4713], local_loss=0.025179604068398476, train_loss=0.020284246653318405, time_cost=1.2199628353118896
Steps: 0%| | 4714/1000000 [12:02:19<2411:02:49, 8.72s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [4714], local_loss=0.03942558914422989, train_loss=0.03273635730147362, time_cost=2.387880802154541
Steps: 0%| | 4715/1000000 [12:02:27<2332:57:57, 8.44s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [4715], local_loss=0.07742953300476074, train_loss=0.08915801346302032, time_cost=5.416829824447632
Steps: 0%| | 4716/1000000 [12:02:31<2036:36:47, 7.37s/it, lr=1e-5, step_loss=0.0774][RANK-0]: Step: [4716], local_loss=0.037177830934524536, train_loss=0.06868612766265869, time_cost=2.046684741973877
Steps: 0%| | 4717/1000000 [12:02:42<2293:24:14, 8.30s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [4717], local_loss=0.011990658938884735, train_loss=0.0393555648624897, time_cost=1.6923155784606934
Steps: 0%| | 4718/1000000 [12:02:54<2621:53:35, 9.48s/it, lr=1e-5, step_loss=0.012][RANK-0]: Step: [4718], local_loss=0.009132388979196548, train_loss=0.058192215859889984, time_cost=3.446450710296631
Steps: 0%| | 4719/1000000 [12:03:02<2498:48:56, 9.04s/it, lr=1e-5, step_loss=0.00913][RANK-0]: Step: [4719], local_loss=0.07092788815498352, train_loss=0.040383461862802505, time_cost=3.068873643875122
Steps: 0%| | 4720/1000000 [12:03:09<2317:20:55, 8.38s/it, lr=1e-5, step_loss=0.0709][RANK-0]: Step: [4720], local_loss=0.036142993718385696, train_loss=0.05355343595147133, time_cost=5.313060283660889
Steps: 0%| | 4721/1000000 [12:03:26<3011:27:55, 10.89s/it, lr=1e-5, step_loss=0.0361][RANK-0]: Step: [4721], local_loss=0.02016327902674675, train_loss=0.018515028059482574, time_cost=5.482418537139893
Steps: 0%| | 4722/1000000 [12:03:42<3416:19:41, 12.36s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [4722], local_loss=0.015599694103002548, train_loss=0.06868164986371994, time_cost=2.7830777168273926
Steps: 0%| | 4723/1000000 [12:03:46<2782:04:18, 10.06s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [4723], local_loss=0.009383736178278923, train_loss=0.07828882336616516, time_cost=1.7950592041015625
Steps: 0%| | 4724/1000000 [12:03:56<2791:51:16, 10.10s/it, lr=1e-5, step_loss=0.00938][RANK-0]: Step: [4724], local_loss=0.01530819945037365, train_loss=0.04969821870326996, time_cost=1.2099730968475342
Steps: 0%| | 4725/1000000 [12:04:01<2352:54:12, 8.51s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [4725], local_loss=0.01603531278669834, train_loss=0.041781917214393616, time_cost=1.687981367111206
Steps: 0%| | 4726/1000000 [12:04:07<2121:54:40, 7.68s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4726], local_loss=0.08852583169937134, train_loss=0.10166280716657639, time_cost=1.5233142375946045
Steps: 0%| | 4727/1000000 [12:04:19<2443:43:18, 8.84s/it, lr=1e-5, step_loss=0.0885][RANK-0]: Step: [4727], local_loss=0.014986220747232437, train_loss=0.06879197806119919, time_cost=4.8366193771362305
Steps: 0%| | 4728/1000000 [12:04:31<2753:25:01, 9.96s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [4728], local_loss=0.12158986926078796, train_loss=0.04891034960746765, time_cost=4.129812955856323
Steps: 0%| | 4729/1000000 [12:04:45<3042:52:58, 11.01s/it, lr=1e-5, step_loss=0.122][RANK-0]: Step: [4729], local_loss=0.045980341732501984, train_loss=0.08950990438461304, time_cost=3.5466296672821045
Steps: 0%| | 4730/1000000 [12:04:52<2771:38:59, 10.03s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [4730], local_loss=0.10697857290506363, train_loss=0.03524331748485565, time_cost=2.8803303241729736
Steps: 0%| | 4731/1000000 [12:05:02<2750:18:14, 9.95s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [4731], local_loss=0.04932462424039841, train_loss=0.027388906106352806, time_cost=3.3730337619781494
Steps: 0%| | 4732/1000000 [12:05:17<3132:52:49, 11.33s/it, lr=1e-5, step_loss=0.0493][RANK-0]: Step: [4732], local_loss=0.04766098037362099, train_loss=0.04026012867689133, time_cost=5.821911334991455
Steps: 0%| | 4733/1000000 [12:05:33<3549:27:26, 12.84s/it, lr=1e-5, step_loss=0.0477][RANK-0]: Step: [4733], local_loss=0.05524669587612152, train_loss=0.03238411247730255, time_cost=6.01885199546814
Steps: 0%| | 4734/1000000 [12:05:42<3240:34:15, 11.72s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [4734], local_loss=0.011156676337122917, train_loss=0.02875717356801033, time_cost=6.531268358230591
+
Steps: 0%| | 4734/1000000 [12:05:42<3240:34:15, 11.72s/it, lr=1e-5, step_loss=0.0112]
Steps: 0%| | 4735/1000000 [12:05:50<2929:40:26, 10.60s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [4735], local_loss=0.03781871870160103, train_loss=0.01849975995719433, time_cost=3.094501495361328
+
Steps: 0%| | 4735/1000000 [12:05:50<2929:40:26, 10.60s/it, lr=1e-5, step_loss=0.0378]
Steps: 0%| | 4736/1000000 [12:05:55<2487:21:39, 9.00s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [4736], local_loss=0.01607166789472103, train_loss=0.08723515272140503, time_cost=2.3506970405578613
+
Steps: 0%| | 4736/1000000 [12:05:55<2487:21:39, 9.00s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 4737/1000000 [12:06:07<2691:59:38, 9.74s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [4737], local_loss=0.014454330317676067, train_loss=0.05847813934087753, time_cost=1.2278380393981934
+
Steps: 0%| | 4737/1000000 [12:06:07<2691:59:38, 9.74s/it, lr=1e-5, step_loss=0.0145]
Steps: 0%| | 4738/1000000 [12:06:20<2986:05:11, 10.80s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [4738], local_loss=0.012812170200049877, train_loss=0.022226078435778618, time_cost=5.966454267501831
+
Steps: 0%| | 4738/1000000 [12:06:20<2986:05:11, 10.80s/it, lr=1e-5, step_loss=0.0128]
Steps: 0%| | 4739/1000000 [12:06:29<2869:07:17, 10.38s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [4739], local_loss=0.11083763837814331, train_loss=0.040866248309612274, time_cost=3.1375131607055664
+
Steps: 0%| | 4739/1000000 [12:06:29<2869:07:17, 10.38s/it, lr=1e-5, step_loss=0.111]
Steps: 0%| | 4740/1000000 [12:06:37<2596:02:58, 9.39s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [4740], local_loss=0.04531469941139221, train_loss=0.024705100804567337, time_cost=2.09584379196167
+
Steps: 0%| | 4740/1000000 [12:06:37<2596:02:58, 9.39s/it, lr=1e-5, step_loss=0.0453]
Steps: 0%| | 4741/1000000 [12:06:44<2450:50:56, 8.87s/it, lr=1e-5, step_loss=0.0453][RANK-0]: Step: [4741], local_loss=40.57892990112305, train_loss=5.100511074066162, time_cost=2.8125128746032715
+
Steps: 0%| | 4741/1000000 [12:06:44<2450:50:56, 8.87s/it, lr=1e-5, step_loss=40.6]
Steps: 0%| | 4742/1000000 [12:06:56<2675:53:43, 9.68s/it, lr=1e-5, step_loss=40.6][RANK-0]: Step: [4742], local_loss=0.010480184108018875, train_loss=0.07498230040073395, time_cost=3.471405267715454
+
Steps: 0%| | 4742/1000000 [12:06:56<2675:53:43, 9.68s/it, lr=1e-5, step_loss=0.0105]
Steps: 0%| | 4743/1000000 [12:07:02<2375:28:44, 8.59s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [4743], local_loss=0.088670514523983, train_loss=0.052443571388721466, time_cost=1.5355408191680908
+
Steps: 0%| | 4743/1000000 [12:07:02<2375:28:44, 8.59s/it, lr=1e-5, step_loss=0.0887]
Steps: 0%| | 4744/1000000 [12:07:08<2167:43:10, 7.84s/it, lr=1e-5, step_loss=0.0887][RANK-0]: Step: [4744], local_loss=0.062347155064344406, train_loss=0.037908684462308884, time_cost=1.4610660076141357
+
Steps: 0%| | 4744/1000000 [12:07:08<2167:43:10, 7.84s/it, lr=1e-5, step_loss=0.0623]
Steps: 0%| | 4745/1000000 [12:07:12<1885:22:07, 6.82s/it, lr=1e-5, step_loss=0.0623][RANK-0]: Step: [4745], local_loss=0.012236164882779121, train_loss=0.03248891979455948, time_cost=1.285144567489624
+
Steps: 0%| | 4745/1000000 [12:07:12<1885:22:07, 6.82s/it, lr=1e-5, step_loss=0.0122]
Steps: 0%| | 4746/1000000 [12:07:24<2254:55:40, 8.16s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [4746], local_loss=0.01810411550104618, train_loss=0.06775019317865372, time_cost=1.9704298973083496
+
Steps: 0%| | 4746/1000000 [12:07:24<2254:55:40, 8.16s/it, lr=1e-5, step_loss=0.0181]
Steps: 0%| | 4747/1000000 [12:07:34<2473:57:15, 8.95s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [4747], local_loss=0.030873002484440804, train_loss=0.05721551179885864, time_cost=1.4002673625946045
+
Steps: 0%| | 4747/1000000 [12:07:34<2473:57:15, 8.95s/it, lr=1e-5, step_loss=0.0309]
Steps: 0%| | 4748/1000000 [12:07:49<2957:27:07, 10.70s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [4748], local_loss=0.004120408557355404, train_loss=0.027879927307367325, time_cost=7.016999959945679
+
Steps: 0%| | 4748/1000000 [12:07:49<2957:27:07, 10.70s/it, lr=1e-5, step_loss=0.00412]
Steps: 0%| | 4749/1000000 [12:07:59<2898:50:38, 10.49s/it, lr=1e-5, step_loss=0.00412][RANK-0]: Step: [4749], local_loss=0.011539317667484283, train_loss=0.027458220720291138, time_cost=1.2050611972808838
+
Steps: 0%| | 4749/1000000 [12:07:59<2898:50:38, 10.49s/it, lr=1e-5, step_loss=0.0115]
Steps: 0%| | 4750/1000000 [12:08:16<3457:46:20, 12.51s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [4750], local_loss=0.030257418751716614, train_loss=0.046657536178827286, time_cost=7.927842617034912
+
Steps: 0%| | 4750/1000000 [12:08:16<3457:46:20, 12.51s/it, lr=1e-5, step_loss=0.0303]
Steps: 0%| | 4751/1000000 [12:08:24<3077:04:34, 11.13s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [4751], local_loss=0.09112124890089035, train_loss=0.0740576833486557, time_cost=1.2259588241577148
+
Steps: 0%| | 4751/1000000 [12:08:24<3077:04:34, 11.13s/it, lr=1e-5, step_loss=0.0911]
Steps: 0%| | 4752/1000000 [12:08:39<3406:06:33, 12.32s/it, lr=1e-5, step_loss=0.0911][RANK-0]: Step: [4752], local_loss=0.03640439733862877, train_loss=0.02505653351545334, time_cost=1.3505964279174805
+
Steps: 0%| | 4752/1000000 [12:08:39<3406:06:33, 12.32s/it, lr=1e-5, step_loss=0.0364]
Steps: 0%| | 4753/1000000 [12:08:44<2784:55:25, 10.07s/it, lr=1e-5, step_loss=0.0364][RANK-0]: Step: [4753], local_loss=0.015165816061198711, train_loss=0.02889195829629898, time_cost=1.907519817352295
+
Steps: 0%| | 4753/1000000 [12:08:44<2784:55:25, 10.07s/it, lr=1e-5, step_loss=0.0152]
Steps: 0%| | 4754/1000000 [12:08:53<2711:41:23, 9.81s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [4754], local_loss=0.055693309754133224, train_loss=0.07587656378746033, time_cost=2.1647660732269287
+
Steps: 0%| | 4754/1000000 [12:08:53<2711:41:23, 9.81s/it, lr=1e-5, step_loss=0.0557]
Steps: 0%| | 4755/1000000 [12:09:02<2609:22:27, 9.44s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [4755], local_loss=0.08725538849830627, train_loss=0.03971056640148163, time_cost=3.484280586242676
+
Steps: 0%| | 4755/1000000 [12:09:02<2609:22:27, 9.44s/it, lr=1e-5, step_loss=0.0873]
Steps: 0%| | 4756/1000000 [12:09:08<2284:27:35, 8.26s/it, lr=1e-5, step_loss=0.0873][RANK-0]: Step: [4756], local_loss=0.016991904005408287, train_loss=0.02897174470126629, time_cost=2.398212194442749
+
Steps: 0%| | 4756/1000000 [12:09:08<2284:27:35, 8.26s/it, lr=1e-5, step_loss=0.017]
Steps: 0%| | 4757/1000000 [12:09:15<2218:03:32, 8.02s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [4757], local_loss=0.013957600109279156, train_loss=0.039640337228775024, time_cost=3.183696746826172
+
Steps: 0%| | 4757/1000000 [12:09:15<2218:03:32, 8.02s/it, lr=1e-5, step_loss=0.014]
Steps: 0%| | 4758/1000000 [12:09:20<1988:05:58, 7.19s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [4758], local_loss=0.020425084978342056, train_loss=0.03742627054452896, time_cost=2.1424121856689453
+
Steps: 0%| | 4758/1000000 [12:09:20<1988:05:58, 7.19s/it, lr=1e-5, step_loss=0.0204]
Steps: 0%| | 4759/1000000 [12:09:36<2663:42:22, 9.64s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [4759], local_loss=0.07318408787250519, train_loss=0.07935792207717896, time_cost=2.7371840476989746
+
Steps: 0%| | 4759/1000000 [12:09:36<2663:42:22, 9.64s/it, lr=1e-5, step_loss=0.0732]
Steps: 0%| | 4760/1000000 [12:09:43<2441:09:11, 8.83s/it, lr=1e-5, step_loss=0.0732][RANK-0]: Step: [4760], local_loss=0.012776781804859638, train_loss=0.019400732591748238, time_cost=2.915673017501831
+
Steps: 0%| | 4760/1000000 [12:09:43<2441:09:11, 8.83s/it, lr=1e-5, step_loss=0.0128]
Steps: 0%| | 4761/1000000 [12:09:49<2227:34:12, 8.06s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [4761], local_loss=0.02545216493308544, train_loss=0.09150640666484833, time_cost=2.152808904647827
+
Steps: 0%| | 4761/1000000 [12:09:49<2227:34:12, 8.06s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 4762/1000000 [12:09:55<2049:38:28, 7.41s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [4762], local_loss=0.05340453237295151, train_loss=0.16735868155956268, time_cost=1.477400302886963
+
Steps: 0%| | 4762/1000000 [12:09:55<2049:38:28, 7.41s/it, lr=1e-5, step_loss=0.0534]
Steps: 0%| | 4763/1000000 [12:10:04<2209:49:35, 7.99s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [4763], local_loss=0.04086541384458542, train_loss=0.051542289555072784, time_cost=3.2460618019104004
+
Steps: 0%| | 4763/1000000 [12:10:04<2209:49:35, 7.99s/it, lr=1e-5, step_loss=0.0409]
Steps: 0%| | 4764/1000000 [12:10:09<1950:44:31, 7.06s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [4764], local_loss=0.01194543857127428, train_loss=0.028576139360666275, time_cost=2.1363463401794434
+
Steps: 0%| | 4764/1000000 [12:10:09<1950:44:31, 7.06s/it, lr=1e-5, step_loss=0.0119]
Steps: 0%| | 4765/1000000 [12:10:19<2231:04:41, 8.07s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [4765], local_loss=0.04236490651965141, train_loss=0.054083842784166336, time_cost=4.991660833358765
+
Steps: 0%| | 4765/1000000 [12:10:19<2231:04:41, 8.07s/it, lr=1e-5, step_loss=0.0424]
Steps: 0%| | 4766/1000000 [12:10:24<1925:14:18, 6.96s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [4766], local_loss=0.011205252259969711, train_loss=0.03178733214735985, time_cost=3.105199098587036
+
Steps: 0%| | 4766/1000000 [12:10:24<1925:14:18, 6.96s/it, lr=1e-5, step_loss=0.0112]
Steps: 0%| | 4767/1000000 [12:10:29<1773:24:59, 6.41s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [4767], local_loss=0.03728639334440231, train_loss=0.0883622020483017, time_cost=1.2635626792907715
+
Steps: 0%| | 4767/1000000 [12:10:29<1773:24:59, 6.41s/it, lr=1e-5, step_loss=0.0373]
Steps: 0%| | 4768/1000000 [12:10:33<1608:37:41, 5.82s/it, lr=1e-5, step_loss=0.0373][RANK-0]: Step: [4768], local_loss=389.53594970703125, train_loss=48.72991943359375, time_cost=1.5905656814575195
+
Steps: 0%| | 4768/1000000 [12:10:33<1608:37:41, 5.82s/it, lr=1e-5, step_loss=390]
Steps: 0%| | 4769/1000000 [12:10:39<1594:11:56, 5.77s/it, lr=1e-5, step_loss=390][RANK-0]: Step: [4769], local_loss=0.021766560152173042, train_loss=0.03745865076780319, time_cost=2.847083568572998
+
Steps: 0%| | 4769/1000000 [12:10:39<1594:11:56, 5.77s/it, lr=1e-5, step_loss=0.0218]
Steps: 0%| | 4770/1000000 [12:10:51<2153:51:09, 7.79s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [4770], local_loss=0.015981309115886688, train_loss=0.06589113175868988, time_cost=6.455471038818359
+
Steps: 0%| | 4770/1000000 [12:10:51<2153:51:09, 7.79s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 4771/1000000 [12:11:06<2703:14:47, 9.78s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4771], local_loss=0.008026747032999992, train_loss=0.07944267988204956, time_cost=6.475780963897705
+
Steps: 0%| | 4771/1000000 [12:11:06<2703:14:47, 9.78s/it, lr=1e-5, step_loss=0.00803]
Steps: 0%| | 4772/1000000 [12:11:18<2926:05:35, 10.58s/it, lr=1e-5, step_loss=0.00803][RANK-0]: Step: [4772], local_loss=0.48763418197631836, train_loss=0.07659615576267242, time_cost=5.711799144744873
+
Steps: 0%| | 4772/1000000 [12:11:18<2926:05:35, 10.58s/it, lr=1e-5, step_loss=0.488]
Steps: 0%| | 4773/1000000 [12:11:24<2547:55:35, 9.22s/it, lr=1e-5, step_loss=0.488][RANK-0]: Step: [4773], local_loss=0.026743538677692413, train_loss=0.04238518327474594, time_cost=1.243499517440796
+
Steps: 0%| | 4773/1000000 [12:11:24<2547:55:35, 9.22s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 4774/1000000 [12:11:30<2265:54:58, 8.20s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [4774], local_loss=0.04864968731999397, train_loss=0.16387243568897247, time_cost=1.229724645614624
+
Steps: 0%| | 4774/1000000 [12:11:30<2265:54:58, 8.20s/it, lr=1e-5, step_loss=0.0486]
Steps: 0%| | 4775/1000000 [12:11:39<2349:52:54, 8.50s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [4775], local_loss=0.055438458919525146, train_loss=0.17662149667739868, time_cost=2.031872034072876
+
Steps: 0%| | 4775/1000000 [12:11:39<2349:52:54, 8.50s/it, lr=1e-5, step_loss=0.0554]
Steps: 0%| | 4776/1000000 [12:11:47<2251:41:52, 8.15s/it, lr=1e-5, step_loss=0.0554][RANK-0]: Step: [4776], local_loss=0.12111931294202805, train_loss=0.036665692925453186, time_cost=1.2331340312957764
+
Steps: 0%| | 4776/1000000 [12:11:47<2251:41:52, 8.15s/it, lr=1e-5, step_loss=0.121]
Steps: 0%| | 4777/1000000 [12:11:51<1930:07:25, 6.98s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [4777], local_loss=0.01484691258519888, train_loss=0.029836945235729218, time_cost=1.2845027446746826
+
Steps: 0%| | 4777/1000000 [12:11:51<1930:07:25, 6.98s/it, lr=1e-5, step_loss=0.0148]
Steps: 0%| | 4778/1000000 [12:11:56<1786:29:02, 6.46s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [4778], local_loss=0.019016562029719353, train_loss=0.020023178309202194, time_cost=1.2895121574401855
+
Steps: 0%| | 4778/1000000 [12:11:56<1786:29:02, 6.46s/it, lr=1e-5, step_loss=0.019]
Steps: 0%| | 4779/1000000 [12:12:02<1756:52:20, 6.36s/it, lr=1e-5, step_loss=0.019][RANK-0]: Step: [4779], local_loss=0.01707012765109539, train_loss=0.09985638409852982, time_cost=1.389125108718872
+
Steps: 0%| | 4779/1000000 [12:12:02<1756:52:20, 6.36s/it, lr=1e-5, step_loss=0.0171]
Steps: 0%| | 4780/1000000 [12:12:13<2093:10:39, 7.57s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [4780], local_loss=0.04749812185764313, train_loss=0.020906910300254822, time_cost=4.650610685348511
+
Steps: 0%| | 4780/1000000 [12:12:13<2093:10:39, 7.57s/it, lr=1e-5, step_loss=0.0475]
Steps: 0%| | 4781/1000000 [12:12:25<2467:42:56, 8.93s/it, lr=1e-5, step_loss=0.0475][RANK-0]: Step: [4781], local_loss=0.08435861766338348, train_loss=0.04698847979307175, time_cost=1.693518877029419
+
Steps: 0%| | 4781/1000000 [12:12:25<2467:42:56, 8.93s/it, lr=1e-5, step_loss=0.0844]
Steps: 0%| | 4782/1000000 [12:12:31<2275:29:03, 8.23s/it, lr=1e-5, step_loss=0.0844][RANK-0]: Step: [4782], local_loss=0.017451591789722443, train_loss=0.028738196939229965, time_cost=3.092688798904419
+
Steps: 0%| | 4782/1000000 [12:12:31<2275:29:03, 8.23s/it, lr=1e-5, step_loss=0.0175]
Steps: 0%| | 4783/1000000 [12:12:39<2222:07:58, 8.04s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [4783], local_loss=0.014995679259300232, train_loss=0.028945963829755783, time_cost=2.5685436725616455
+
Steps: 0%| | 4783/1000000 [12:12:39<2222:07:58, 8.04s/it, lr=1e-5, step_loss=0.015]
Steps: 0%| | 4784/1000000 [12:12:46<2130:09:11, 7.71s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [4784], local_loss=0.01459183543920517, train_loss=0.10921365022659302, time_cost=1.2107176780700684
+
Steps: 0%| | 4784/1000000 [12:12:46<2130:09:11, 7.71s/it, lr=1e-5, step_loss=0.0146]
Steps: 0%| | 4785/1000000 [12:12:53<2078:32:58, 7.52s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [4785], local_loss=0.029740165919065475, train_loss=0.07314477860927582, time_cost=2.9292891025543213
+
Steps: 0%| | 4785/1000000 [12:12:53<2078:32:58, 7.52s/it, lr=1e-5, step_loss=0.0297]
Steps: 0%| | 4786/1000000 [12:13:02<2188:33:29, 7.92s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [4786], local_loss=0.010133943520486355, train_loss=0.0544629730284214, time_cost=3.7965621948242188
+
Steps: 0%| | 4786/1000000 [12:13:02<2188:33:29, 7.92s/it, lr=1e-5, step_loss=0.0101]
Steps: 0%| | 4787/1000000 [12:13:07<1963:12:27, 7.10s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [4787], local_loss=0.016040492802858353, train_loss=0.03207846358418465, time_cost=1.2232601642608643
+
Steps: 0%| | 4787/1000000 [12:13:07<1963:12:27, 7.10s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 4788/1000000 [12:13:13<1878:45:47, 6.80s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4788], local_loss=0.05310763418674469, train_loss=0.044605426490306854, time_cost=4.1819353103637695
+
Steps: 0%| | 4788/1000000 [12:13:13<1878:45:47, 6.80s/it, lr=1e-5, step_loss=0.0531]
Steps: 0%| | 4789/1000000 [12:13:28<2562:34:00, 9.27s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [4789], local_loss=0.011844119057059288, train_loss=0.05691550672054291, time_cost=6.047133207321167
+
Steps: 0%| | 4789/1000000 [12:13:28<2562:34:00, 9.27s/it, lr=1e-5, step_loss=0.0118]
Steps: 0%| | 4790/1000000 [12:13:38<2590:47:02, 9.37s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [4790], local_loss=0.026360969990491867, train_loss=0.035634882748126984, time_cost=3.4503977298736572
+
Steps: 0%| | 4790/1000000 [12:13:38<2590:47:02, 9.37s/it, lr=1e-5, step_loss=0.0264]
Steps: 0%| | 4791/1000000 [12:13:48<2692:20:56, 9.74s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [4791], local_loss=0.010932786390185356, train_loss=0.02354494482278824, time_cost=1.4562649726867676
+
Steps: 0%| | 4791/1000000 [12:13:48<2692:20:56, 9.74s/it, lr=1e-5, step_loss=0.0109]
Steps: 0%| | 4792/1000000 [12:13:54<2339:18:47, 8.46s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [4792], local_loss=0.009256435558199883, train_loss=0.016427213326096535, time_cost=2.6720263957977295
+
Steps: 0%| | 4792/1000000 [12:13:54<2339:18:47, 8.46s/it, lr=1e-5, step_loss=0.00926]
Steps: 0%| | 4793/1000000 [12:14:07<2709:06:10, 9.80s/it, lr=1e-5, step_loss=0.00926][RANK-0]: Step: [4793], local_loss=0.028477996587753296, train_loss=0.028293436393141747, time_cost=9.656047105789185
+
Steps: 0%| | 4793/1000000 [12:14:07<2709:06:10, 9.80s/it, lr=1e-5, step_loss=0.0285]
Steps: 0%| | 4794/1000000 [12:14:13<2424:12:45, 8.77s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [4794], local_loss=0.025429097935557365, train_loss=0.03439069539308548, time_cost=2.0910072326660156
+
Steps: 0%| | 4794/1000000 [12:14:13<2424:12:45, 8.77s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 4795/1000000 [12:14:21<2306:01:50, 8.34s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [4795], local_loss=0.04228553920984268, train_loss=0.09653474390506744, time_cost=3.137958526611328
+
Steps: 0%| | 4795/1000000 [12:14:21<2306:01:50, 8.34s/it, lr=1e-5, step_loss=0.0423]
Steps: 0%| | 4796/1000000 [12:14:28<2227:23:02, 8.06s/it, lr=1e-5, step_loss=0.0423][RANK-0]: Step: [4796], local_loss=0.017847461625933647, train_loss=0.11334553360939026, time_cost=1.626492977142334
+
Steps: 0%| | 4796/1000000 [12:14:28<2227:23:02, 8.06s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 4797/1000000 [12:14:35<2115:22:12, 7.65s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [4797], local_loss=0.010245982557535172, train_loss=0.028588324785232544, time_cost=2.14959454536438
+
Steps: 0%| | 4797/1000000 [12:14:35<2115:22:12, 7.65s/it, lr=1e-5, step_loss=0.0102]
Steps: 0%| | 4798/1000000 [12:14:50<2758:57:45, 9.98s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [4798], local_loss=0.012710900977253914, train_loss=0.05139986425638199, time_cost=12.853596925735474
+
Steps: 0%| | 4798/1000000 [12:14:50<2758:57:45, 9.98s/it, lr=1e-5, step_loss=0.0127]
Steps: 0%| | 4799/1000000 [12:14:55<2371:57:41, 8.58s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [4799], local_loss=0.015995169058442116, train_loss=0.029754050076007843, time_cost=1.1940267086029053
+
Steps: 0%| | 4799/1000000 [12:14:55<2371:57:41, 8.58s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 4800/1000000 [12:15:00<2066:58:28, 7.48s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4800], local_loss=0.023306969553232193, train_loss=0.03413035348057747, time_cost=1.2330632209777832
+
Steps: 0%| | 4800/1000000 [12:15:00<2066:58:28, 7.48s/it, lr=1e-5, step_loss=0.0233]
Steps: 0%| | 4801/1000000 [12:15:11<2365:05:49, 8.56s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [4801], local_loss=0.061347488313913345, train_loss=0.09320641309022903, time_cost=1.2020468711853027
+
Steps: 0%| | 4801/1000000 [12:15:11<2365:05:49, 8.56s/it, lr=1e-5, step_loss=0.0613]
Steps: 0%| | 4802/1000000 [12:15:18<2241:30:34, 8.11s/it, lr=1e-5, step_loss=0.0613][RANK-0]: Step: [4802], local_loss=0.011810007505118847, train_loss=0.025101492181420326, time_cost=2.472118616104126
+
Steps: 0%| | 4802/1000000 [12:15:18<2241:30:34, 8.11s/it, lr=1e-5, step_loss=0.0118]
Steps: 0%| | 4803/1000000 [12:15:24<2057:56:15, 7.44s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [4803], local_loss=0.02850768342614174, train_loss=16.874034881591797, time_cost=1.3917031288146973
+
Steps: 0%| | 4803/1000000 [12:15:24<2057:56:15, 7.44s/it, lr=1e-5, step_loss=0.0285]
Steps: 0%| | 4804/1000000 [12:15:29<1861:53:24, 6.74s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [4804], local_loss=0.010437383316457272, train_loss=0.03781881183385849, time_cost=1.9182615280151367
+
Steps: 0%| | 4804/1000000 [12:15:29<1861:53:24, 6.74s/it, lr=1e-5, step_loss=0.0104]
Steps: 0%| | 4805/1000000 [12:15:34<1727:23:07, 6.25s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [4805], local_loss=150.36314392089844, train_loss=18.8475399017334, time_cost=2.0294508934020996
+
Steps: 0%| | 4805/1000000 [12:15:34<1727:23:07, 6.25s/it, lr=1e-5, step_loss=150]
Steps: 0%| | 4806/1000000 [12:15:44<2007:41:56, 7.26s/it, lr=1e-5, step_loss=150][RANK-0]: Step: [4806], local_loss=0.013575995340943336, train_loss=0.05072436481714249, time_cost=3.159287691116333
+
Steps: 0%| | 4806/1000000 [12:15:44<2007:41:56, 7.26s/it, lr=1e-5, step_loss=0.0136]
Steps: 0%| | 4807/1000000 [12:15:59<2631:57:25, 9.52s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [4807], local_loss=0.4622400999069214, train_loss=0.08609236776828766, time_cost=6.612609386444092
+
Steps: 0%| | 4807/1000000 [12:15:59<2631:57:25, 9.52s/it, lr=1e-5, step_loss=0.462]
Steps: 0%| | 4808/1000000 [12:16:08<2614:33:31, 9.46s/it, lr=1e-5, step_loss=0.462][RANK-0]: Step: [4808], local_loss=0.10384535789489746, train_loss=0.06247761845588684, time_cost=3.601794958114624
+
Steps: 0%| | 4808/1000000 [12:16:08<2614:33:31, 9.46s/it, lr=1e-5, step_loss=0.104]
Steps: 0%| | 4809/1000000 [12:16:19<2726:29:05, 9.86s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [4809], local_loss=0.013483819551765919, train_loss=4.76033353805542, time_cost=3.336355447769165
+
Steps: 0%| | 4809/1000000 [12:16:19<2726:29:05, 9.86s/it, lr=1e-5, step_loss=0.0135]
Steps: 0%| | 4810/1000000 [12:16:28<2624:39:46, 9.49s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [4810], local_loss=0.03915833309292793, train_loss=0.2306872010231018, time_cost=2.6502697467803955
+
Steps: 0%| | 4810/1000000 [12:16:28<2624:39:46, 9.49s/it, lr=1e-5, step_loss=0.0392]
Steps: 0%| | 4811/1000000 [12:16:41<2980:16:42, 10.78s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [4811], local_loss=0.04951193556189537, train_loss=0.06348715722560883, time_cost=3.7937862873077393
+
Steps: 0%| | 4811/1000000 [12:16:41<2980:16:42, 10.78s/it, lr=1e-5, step_loss=0.0495]
Steps: 0%| | 4812/1000000 [12:16:55<3223:12:08, 11.66s/it, lr=1e-5, step_loss=0.0495][RANK-0]: Step: [4812], local_loss=0.05277831479907036, train_loss=0.033868346363306046, time_cost=7.229005575180054
+
Steps: 0%| | 4812/1000000 [12:16:55<3223:12:08, 11.66s/it, lr=1e-5, step_loss=0.0528]
Steps: 0%| | 4813/1000000 [12:17:00<2694:21:49, 9.75s/it, lr=1e-5, step_loss=0.0528][RANK-0]: Step: [4813], local_loss=0.03260205313563347, train_loss=0.03269321471452713, time_cost=3.938289165496826
+
Steps: 0%| | 4813/1000000 [12:17:00<2694:21:49, 9.75s/it, lr=1e-5, step_loss=0.0326]
Steps: 0%| | 4814/1000000 [12:17:05<2286:35:54, 8.27s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [4814], local_loss=0.18844661116600037, train_loss=0.1085645854473114, time_cost=2.3249917030334473
+
Steps: 0%| | 4814/1000000 [12:17:05<2286:35:54, 8.27s/it, lr=1e-5, step_loss=0.188]
Steps: 0%| | 4815/1000000 [12:17:20<2842:29:17, 10.28s/it, lr=1e-5, step_loss=0.188][RANK-0]: Step: [4815], local_loss=0.010927483439445496, train_loss=0.06765130162239075, time_cost=6.339143514633179
+
Steps: 0%| | 4815/1000000 [12:17:20<2842:29:17, 10.28s/it, lr=1e-5, step_loss=0.0109]
Steps: 0%| | 4816/1000000 [12:17:28<2628:37:34, 9.51s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [4816], local_loss=0.010298402979969978, train_loss=0.13879650831222534, time_cost=1.6640591621398926
+
Steps: 0%| | 4816/1000000 [12:17:28<2628:37:34, 9.51s/it, lr=1e-5, step_loss=0.0103]
Steps: 0%| | 4817/1000000 [12:17:39<2734:32:18, 9.89s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [4817], local_loss=0.018619578331708908, train_loss=0.045825839042663574, time_cost=1.8327744007110596
+
Steps: 0%| | 4817/1000000 [12:17:39<2734:32:18, 9.89s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 4818/1000000 [12:17:45<2420:48:19, 8.76s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [4818], local_loss=0.053118474781513214, train_loss=0.08952853083610535, time_cost=1.4178392887115479
+
Steps: 0%| | 4818/1000000 [12:17:45<2420:48:19, 8.76s/it, lr=1e-5, step_loss=0.0531]
Steps: 0%| | 4819/1000000 [12:17:51<2167:32:27, 7.84s/it, lr=1e-5, step_loss=0.0531][RANK-0]: Step: [4819], local_loss=0.021729081869125366, train_loss=0.062464725226163864, time_cost=4.7149763107299805
+
Steps: 0%| | 4819/1000000 [12:17:51<2167:32:27, 7.84s/it, lr=1e-5, step_loss=0.0217]
Steps: 0%| | 4820/1000000 [12:17:59<2259:43:13, 8.17s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [4820], local_loss=0.09068913757801056, train_loss=0.06851133704185486, time_cost=3.0336337089538574
+
Steps: 0%| | 4820/1000000 [12:17:59<2259:43:13, 8.17s/it, lr=1e-5, step_loss=0.0907]
Steps: 0%| | 4821/1000000 [12:18:07<2197:04:10, 7.95s/it, lr=1e-5, step_loss=0.0907][RANK-0]: Step: [4821], local_loss=0.011043008416891098, train_loss=0.07815195620059967, time_cost=2.3008859157562256
+
Steps: 0%| | 4821/1000000 [12:18:07<2197:04:10, 7.95s/it, lr=1e-5, step_loss=0.011]
Steps: 0%| | 4822/1000000 [12:18:22<2765:20:15, 10.00s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [4822], local_loss=0.9868224859237671, train_loss=0.15270228683948517, time_cost=1.2325048446655273
+
Steps: 0%| | 4822/1000000 [12:18:22<2765:20:15, 10.00s/it, lr=1e-5, step_loss=0.987]
Steps: 0%| | 4823/1000000 [12:18:34<2932:54:54, 10.61s/it, lr=1e-5, step_loss=0.987][RANK-0]: Step: [4823], local_loss=0.012096862308681011, train_loss=0.06503380089998245, time_cost=3.1922035217285156
+
Steps: 0%| | 4823/1000000 [12:18:34<2932:54:54, 10.61s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 4824/1000000 [12:18:42<2759:39:50, 9.98s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [4824], local_loss=1.005442500114441, train_loss=0.16146309673786163, time_cost=1.4444949626922607
+
Steps: 0%| | 4824/1000000 [12:18:42<2759:39:50, 9.98s/it, lr=1e-5, step_loss=1.01]
Steps: 0%| | 4825/1000000 [12:18:52<2751:31:57, 9.95s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [4825], local_loss=0.04561874270439148, train_loss=0.03264236822724342, time_cost=1.389916181564331
+
Steps: 0%| | 4825/1000000 [12:18:52<2751:31:57, 9.95s/it, lr=1e-5, step_loss=0.0456]
Steps: 0%| | 4826/1000000 [12:19:06<3082:57:57, 11.15s/it, lr=1e-5, step_loss=0.0456][RANK-0]: Step: [4826], local_loss=0.10046692937612534, train_loss=0.055468007922172546, time_cost=4.43768835067749
+
Steps: 0%| | 4826/1000000 [12:19:06<3082:57:57, 11.15s/it, lr=1e-5, step_loss=0.1]
Steps: 0%| | 4827/1000000 [12:19:12<2654:37:58, 9.60s/it, lr=1e-5, step_loss=0.1][RANK-0]: Step: [4827], local_loss=30.41227149963379, train_loss=3.8302645683288574, time_cost=1.243800401687622
+
Steps: 0%| | 4827/1000000 [12:19:12<2654:37:58, 9.60s/it, lr=1e-5, step_loss=30.4]
Steps: 0%| | 4828/1000000 [12:19:23<2791:24:25, 10.10s/it, lr=1e-5, step_loss=30.4][RANK-0]: Step: [4828], local_loss=0.019310681149363518, train_loss=0.03219503164291382, time_cost=2.9574263095855713
+
Steps: 0%| | 4828/1000000 [12:19:23<2791:24:25, 10.10s/it, lr=1e-5, step_loss=0.0193]
Steps: 0%| | 4829/1000000 [12:19:35<2928:15:17, 10.59s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [4829], local_loss=0.008815586566925049, train_loss=0.02694585546851158, time_cost=4.1717259883880615
+
Steps: 0%| | 4829/1000000 [12:19:35<2928:15:17, 10.59s/it, lr=1e-5, step_loss=0.00882]
Steps: 0%| | 4830/1000000 [12:19:40<2471:52:09, 8.94s/it, lr=1e-5, step_loss=0.00882][RANK-0]: Step: [4830], local_loss=0.016652464866638184, train_loss=0.02825789898633957, time_cost=3.8248651027679443
+
Steps: 0%| | 4830/1000000 [12:19:40<2471:52:09, 8.94s/it, lr=1e-5, step_loss=0.0167]
Steps: 0%| | 4831/1000000 [12:19:53<2826:34:29, 10.23s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [4831], local_loss=0.0952107384800911, train_loss=0.033527713268995285, time_cost=1.250840187072754
+
Steps: 0%| | 4831/1000000 [12:19:53<2826:34:29, 10.23s/it, lr=1e-5, step_loss=0.0952]
Steps: 0%| | 4832/1000000 [12:20:09<3285:54:15, 11.89s/it, lr=1e-5, step_loss=0.0952][RANK-0]: Step: [4832], local_loss=0.008803052827715874, train_loss=0.0385802686214447, time_cost=6.255918025970459
+
Steps: 0%| | 4832/1000000 [12:20:09<3285:54:15, 11.89s/it, lr=1e-5, step_loss=0.0088]
Steps: 0%| | 4833/1000000 [12:20:17<2935:41:43, 10.62s/it, lr=1e-5, step_loss=0.0088][RANK-0]: Step: [4833], local_loss=0.04109934717416763, train_loss=0.030145181342959404, time_cost=1.2199089527130127
+
Steps: 0%| | 4833/1000000 [12:20:17<2935:41:43, 10.62s/it, lr=1e-5, step_loss=0.0411]
Steps: 0%| | 4834/1000000 [12:20:31<3194:53:57, 11.56s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [4834], local_loss=0.09620320796966553, train_loss=0.033770039677619934, time_cost=4.498296499252319
+
Steps: 0%| | 4834/1000000 [12:20:31<3194:53:57, 11.56s/it, lr=1e-5, step_loss=0.0962]
Steps: 0%| | 4835/1000000 [12:20:39<2969:18:28, 10.74s/it, lr=1e-5, step_loss=0.0962][RANK-0]: Step: [4835], local_loss=0.17252296209335327, train_loss=0.05678286775946617, time_cost=2.9197092056274414
+
Steps: 0%| | 4835/1000000 [12:20:39<2969:18:28, 10.74s/it, lr=1e-5, step_loss=0.173]
Steps: 0%| | 4836/1000000 [12:20:49<2855:56:46, 10.33s/it, lr=1e-5, step_loss=0.173][RANK-0]: Step: [4836], local_loss=0.023209353908896446, train_loss=0.024916525930166245, time_cost=1.4744000434875488
+
Steps: 0%| | 4836/1000000 [12:20:49<2855:56:46, 10.33s/it, lr=1e-5, step_loss=0.0232]
Steps: 0%| | 4837/1000000 [12:20:59<2879:02:36, 10.41s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [4837], local_loss=0.020063366740942, train_loss=0.041902609169483185, time_cost=7.627725839614868
+
Steps: 0%| | 4837/1000000 [12:20:59<2879:02:36, 10.41s/it, lr=1e-5, step_loss=0.0201]
Steps: 0%| | 4838/1000000 [12:21:12<3067:53:45, 11.10s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [4838], local_loss=0.02572184056043625, train_loss=0.02653537504374981, time_cost=3.9086251258850098
+
Steps: 0%| | 4838/1000000 [12:21:12<3067:53:45, 11.10s/it, lr=1e-5, step_loss=0.0257]
Steps: 0%| | 4839/1000000 [12:21:28<3429:19:50, 12.41s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [4839], local_loss=0.25695669651031494, train_loss=0.19256877899169922, time_cost=4.496182918548584
+
Steps: 0%| | 4839/1000000 [12:21:28<3429:19:50, 12.41s/it, lr=1e-5, step_loss=0.257]
Steps: 0%| | 4840/1000000 [12:21:35<3001:43:44, 10.86s/it, lr=1e-5, step_loss=0.257][RANK-0]: Step: [4840], local_loss=0.39676082134246826, train_loss=0.07627652585506439, time_cost=3.170638084411621
+
Steps: 0%| | 4840/1000000 [12:21:35<3001:43:44, 10.86s/it, lr=1e-5, step_loss=0.397]
Steps: 0%| | 4841/1000000 [12:21:41<2624:54:31, 9.50s/it, lr=1e-5, step_loss=0.397][RANK-0]: Step: [4841], local_loss=0.015520777553319931, train_loss=0.062315434217453, time_cost=1.5646445751190186
+
Steps: 0%| | 4841/1000000 [12:21:41<2624:54:31, 9.50s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 4842/1000000 [12:21:50<2586:28:24, 9.36s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4842], local_loss=0.0490083172917366, train_loss=0.03634519502520561, time_cost=6.465343713760376
+
Steps: 0%| | 4842/1000000 [12:21:50<2586:28:24, 9.36s/it, lr=1e-5, step_loss=0.049]
Steps: 0%| | 4843/1000000 [12:21:56<2279:23:40, 8.25s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [4843], local_loss=0.05904029682278633, train_loss=0.06286676228046417, time_cost=2.4957711696624756
+
Steps: 0%| | 4843/1000000 [12:21:56<2279:23:40, 8.25s/it, lr=1e-5, step_loss=0.059]
Steps: 0%| | 4844/1000000 [12:22:07<2501:06:37, 9.05s/it, lr=1e-5, step_loss=0.059][RANK-0]: Step: [4844], local_loss=0.0121092414483428, train_loss=0.04878683760762215, time_cost=3.255892038345337
+
Steps: 0%| | 4844/1000000 [12:22:07<2501:06:37, 9.05s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 4845/1000000 [12:22:14<2358:14:35, 8.53s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [4845], local_loss=0.030408095568418503, train_loss=0.02224789187312126, time_cost=2.3550665378570557
+
Steps: 0%| | 4845/1000000 [12:22:14<2358:14:35, 8.53s/it, lr=1e-5, step_loss=0.0304]
Steps: 0%| | 4846/1000000 [12:22:19<2078:54:32, 7.52s/it, lr=1e-5, step_loss=0.0304][RANK-0]: Step: [4846], local_loss=0.013923604972660542, train_loss=0.04778382554650307, time_cost=2.0136911869049072
+
Steps: 0%| | 4846/1000000 [12:22:19<2078:54:32, 7.52s/it, lr=1e-5, step_loss=0.0139]
Steps: 0%| | 4847/1000000 [12:22:32<2537:10:42, 9.18s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [4847], local_loss=0.024179786443710327, train_loss=0.050350621342659, time_cost=2.483273506164551
+
Steps: 0%| | 4847/1000000 [12:22:32<2537:10:42, 9.18s/it, lr=1e-5, step_loss=0.0242]
Steps: 0%| | 4848/1000000 [12:22:38<2260:52:04, 8.18s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [4848], local_loss=0.01305198110640049, train_loss=0.09553005546331406, time_cost=1.805100679397583
+
Steps: 0%| | 4848/1000000 [12:22:38<2260:52:04, 8.18s/it, lr=1e-5, step_loss=0.0131]
Steps: 0%| | 4849/1000000 [12:22:48<2402:32:29, 8.69s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [4849], local_loss=0.017766769975423813, train_loss=0.031809329986572266, time_cost=4.340347528457642
+
Steps: 0%| | 4849/1000000 [12:22:48<2402:32:29, 8.69s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 4850/1000000 [12:22:56<2334:29:10, 8.45s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [4850], local_loss=0.014535540714859962, train_loss=0.14734353125095367, time_cost=1.9792773723602295
+
Steps: 0%| | 4850/1000000 [12:22:56<2334:29:10, 8.45s/it, lr=1e-5, step_loss=0.0145]
Steps: 0%| | 4851/1000000 [12:23:07<2590:56:22, 9.37s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [4851], local_loss=0.027675459161400795, train_loss=0.02565954253077507, time_cost=4.608935356140137
+
Steps: 0%| | 4851/1000000 [12:23:07<2590:56:22, 9.37s/it, lr=1e-5, step_loss=0.0277]
Steps: 0%| | 4852/1000000 [12:23:13<2285:50:38, 8.27s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [4852], local_loss=0.010190146043896675, train_loss=0.015356570482254028, time_cost=1.3129734992980957
+
Steps: 0%| | 4852/1000000 [12:23:13<2285:50:38, 8.27s/it, lr=1e-5, step_loss=0.0102]
Steps: 0%| | 4853/1000000 [12:23:24<2532:07:28, 9.16s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [4853], local_loss=0.010800482705235481, train_loss=0.019236117601394653, time_cost=1.631542682647705
+
Steps: 0%| | 4853/1000000 [12:23:24<2532:07:28, 9.16s/it, lr=1e-5, step_loss=0.0108]
Steps: 0%| | 4854/1000000 [12:23:33<2512:55:44, 9.09s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [4854], local_loss=0.014103753492236137, train_loss=0.017589272931218147, time_cost=7.40802788734436
+
Steps: 0%| | 4854/1000000 [12:23:33<2512:55:44, 9.09s/it, lr=1e-5, step_loss=0.0141]
Steps: 0%| | 4855/1000000 [12:23:39<2270:30:02, 8.21s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [4855], local_loss=0.023710006847977638, train_loss=0.05467306077480316, time_cost=2.190777540206909
+
Steps: 0%| | 4855/1000000 [12:23:39<2270:30:02, 8.21s/it, lr=1e-5, step_loss=0.0237]
Steps: 0%| | 4856/1000000 [12:23:47<2206:38:07, 7.98s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [4856], local_loss=0.040859419852495193, train_loss=0.12390486896038055, time_cost=1.294490098953247
+
Steps: 0%| | 4856/1000000 [12:23:47<2206:38:07, 7.98s/it, lr=1e-5, step_loss=0.0409]
Steps: 0%| | 4857/1000000 [12:23:53<2077:59:50, 7.52s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [4857], local_loss=0.03719337657094002, train_loss=0.03641831502318382, time_cost=3.2287380695343018
+
Steps: 0%| | 4857/1000000 [12:23:53<2077:59:50, 7.52s/it, lr=1e-5, step_loss=0.0372]
Steps: 0%| | 4858/1000000 [12:24:00<2053:26:16, 7.43s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [4858], local_loss=0.009021488949656487, train_loss=0.0418441966176033, time_cost=2.8699984550476074
+
Steps: 0%| | 4858/1000000 [12:24:00<2053:26:16, 7.43s/it, lr=1e-5, step_loss=0.00902]
Steps: 0%| | 4859/1000000 [12:24:10<2217:24:34, 8.02s/it, lr=1e-5, step_loss=0.00902][RANK-0]: Step: [4859], local_loss=0.05601053684949875, train_loss=0.04287724941968918, time_cost=3.166332244873047
+
Steps: 0%| | 4859/1000000 [12:24:10<2217:24:34, 8.02s/it, lr=1e-5, step_loss=0.056]
Steps: 0%| | 4860/1000000 [12:24:20<2352:51:33, 8.51s/it, lr=1e-5, step_loss=0.056][RANK-0]: Step: [4860], local_loss=0.04926556348800659, train_loss=0.09430299699306488, time_cost=1.6886756420135498
+
Steps: 0%| | 4860/1000000 [12:24:20<2352:51:33, 8.51s/it, lr=1e-5, step_loss=0.0493]
Steps: 0%| | 4861/1000000 [12:24:34<2870:28:01, 10.38s/it, lr=1e-5, step_loss=0.0493][RANK-0]: Step: [4861], local_loss=0.017196113243699074, train_loss=0.04463374614715576, time_cost=11.240428447723389
+
Steps: 0%| | 4861/1000000 [12:24:34<2870:28:01, 10.38s/it, lr=1e-5, step_loss=0.0172]
Steps: 0%| | 4862/1000000 [12:24:40<2471:34:38, 8.94s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [4862], local_loss=0.07278648018836975, train_loss=0.07962056994438171, time_cost=2.745061159133911
+
Steps: 0%| | 4862/1000000 [12:24:40<2471:34:38, 8.94s/it, lr=1e-5, step_loss=0.0728]
Steps: 0%| | 4863/1000000 [12:24:51<2652:58:08, 9.60s/it, lr=1e-5, step_loss=0.0728][RANK-0]: Step: [4863], local_loss=0.01677435263991356, train_loss=0.038108065724372864, time_cost=3.7836732864379883
+
Steps: 0%| | 4863/1000000 [12:24:51<2652:58:08, 9.60s/it, lr=1e-5, step_loss=0.0168]
Steps: 0%| | 4864/1000000 [12:25:00<2637:07:40, 9.54s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [4864], local_loss=0.018608899787068367, train_loss=0.024524148553609848, time_cost=1.3856632709503174
+
Steps: 0%| | 4864/1000000 [12:25:00<2637:07:40, 9.54s/it, lr=1e-5, step_loss=0.0186]
Steps: 0%| | 4865/1000000 [12:25:07<2379:18:20, 8.61s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [4865], local_loss=0.012306462973356247, train_loss=0.024644413962960243, time_cost=1.3080146312713623
+
Steps: 0%| | 4865/1000000 [12:25:07<2379:18:20, 8.61s/it, lr=1e-5, step_loss=0.0123]
Steps: 0%| | 4866/1000000 [12:25:14<2262:39:54, 8.19s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [4866], local_loss=0.0187645573168993, train_loss=0.04987423121929169, time_cost=2.6647567749023438
+
Steps: 0%| | 4866/1000000 [12:25:14<2262:39:54, 8.19s/it, lr=1e-5, step_loss=0.0188]
Steps: 0%| | 4867/1000000 [12:25:25<2499:12:59, 9.04s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [4867], local_loss=0.019154051318764687, train_loss=0.057338207960128784, time_cost=2.219904661178589
+
Steps: 0%| | 4867/1000000 [12:25:25<2499:12:59, 9.04s/it, lr=1e-5, step_loss=0.0192]
Steps: 0%| | 4868/1000000 [12:25:30<2137:09:06, 7.73s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [4868], local_loss=0.018285367637872696, train_loss=0.023165926337242126, time_cost=1.9901442527770996
+
Steps: 0%| | 4868/1000000 [12:25:30<2137:09:06, 7.73s/it, lr=1e-5, step_loss=0.0183]
Steps: 0%| | 4869/1000000 [12:25:36<1974:24:48, 7.14s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [4869], local_loss=0.01176105160266161, train_loss=0.05416887253522873, time_cost=3.2465732097625732
+
Steps: 0%| | 4869/1000000 [12:25:36<1974:24:48, 7.14s/it, lr=1e-5, step_loss=0.0118]
Steps: 0%| | 4870/1000000 [12:25:41<1809:57:20, 6.55s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [4870], local_loss=198.25057983398438, train_loss=24.875831604003906, time_cost=2.6022439002990723
+
Steps: 0%| | 4870/1000000 [12:25:41<1809:57:20, 6.55s/it, lr=1e-5, step_loss=198]
Steps: 0%| | 4871/1000000 [12:25:52<2207:52:36, 7.99s/it, lr=1e-5, step_loss=198][RANK-0]: Step: [4871], local_loss=0.025424527004361153, train_loss=0.07234083116054535, time_cost=1.2323551177978516
+
Steps: 0%| | 4871/1000000 [12:25:52<2207:52:36, 7.99s/it, lr=1e-5, step_loss=0.0254]
Steps: 0%| | 4872/1000000 [12:26:02<2385:57:07, 8.63s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [4872], local_loss=0.11687053740024567, train_loss=0.0745561420917511, time_cost=3.850787878036499
+
Steps: 0%| | 4872/1000000 [12:26:02<2385:57:07, 8.63s/it, lr=1e-5, step_loss=0.117]
Steps: 0%| | 4873/1000000 [12:26:10<2339:08:04, 8.46s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [4873], local_loss=0.010025978088378906, train_loss=18.89811134338379, time_cost=2.4617691040039062
+
Steps: 0%| | 4873/1000000 [12:26:10<2339:08:04, 8.46s/it, lr=1e-5, step_loss=0.01]
Steps: 0%| | 4874/1000000 [12:26:15<2063:29:59, 7.46s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [4874], local_loss=1.0016591548919678, train_loss=0.16132396459579468, time_cost=1.9581670761108398
+
Steps: 0%| | 4874/1000000 [12:26:15<2063:29:59, 7.46s/it, lr=1e-5, step_loss=1]
Steps: 0%| | 4875/1000000 [12:26:23<2070:07:10, 7.49s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [4875], local_loss=0.017751077190041542, train_loss=0.10352586209774017, time_cost=3.6780378818511963
+
Steps: 0%| | 4875/1000000 [12:26:23<2070:07:10, 7.49s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 4876/1000000 [12:26:30<2054:48:13, 7.43s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [4876], local_loss=0.041711803525686264, train_loss=0.15317118167877197, time_cost=1.2270503044128418
+
Steps: 0%| | 4876/1000000 [12:26:30<2054:48:13, 7.43s/it, lr=1e-5, step_loss=0.0417]
Steps: 0%| | 4877/1000000 [12:26:40<2263:44:19, 8.19s/it, lr=1e-5, step_loss=0.0417][RANK-0]: Step: [4877], local_loss=0.03935479745268822, train_loss=0.09240613877773285, time_cost=5.689254522323608
+
Steps: 0%| | 4877/1000000 [12:26:40<2263:44:19, 8.19s/it, lr=1e-5, step_loss=0.0394]
Steps: 0%| | 4878/1000000 [12:26:50<2414:41:30, 8.74s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [4878], local_loss=0.09890821576118469, train_loss=0.05269647389650345, time_cost=2.601144552230835
+
Steps: 0%| | 4878/1000000 [12:26:50<2414:41:30, 8.74s/it, lr=1e-5, step_loss=0.0989]
Steps: 0%| | 4879/1000000 [12:27:05<2879:47:07, 10.42s/it, lr=1e-5, step_loss=0.0989][RANK-0]: Step: [4879], local_loss=0.008464735001325607, train_loss=0.023500487208366394, time_cost=1.2509872913360596
+
Steps: 0%| | 4879/1000000 [12:27:05<2879:47:07, 10.42s/it, lr=1e-5, step_loss=0.00846]
Steps: 0%| | 4880/1000000 [12:27:09<2368:59:25, 8.57s/it, lr=1e-5, step_loss=0.00846][RANK-0]: Step: [4880], local_loss=0.01391851156949997, train_loss=0.018508756533265114, time_cost=1.2672135829925537
+
Steps: 0%| | 4880/1000000 [12:27:09<2368:59:25, 8.57s/it, lr=1e-5, step_loss=0.0139]
Steps: 0%| | 4881/1000000 [12:27:16<2269:30:43, 8.21s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [4881], local_loss=0.2768859565258026, train_loss=0.061401888728141785, time_cost=3.425471544265747
+
Steps: 0%| | 4881/1000000 [12:27:16<2269:30:43, 8.21s/it, lr=1e-5, step_loss=0.277]
Steps: 0%| | 4882/1000000 [12:27:22<2058:49:51, 7.45s/it, lr=1e-5, step_loss=0.277][RANK-0]: Step: [4882], local_loss=0.024641485884785652, train_loss=0.034465573728084564, time_cost=2.7681472301483154
+
Steps: 0%| | 4882/1000000 [12:27:22<2058:49:51, 7.45s/it, lr=1e-5, step_loss=0.0246]
Steps: 0%| | 4883/1000000 [12:27:27<1864:02:23, 6.74s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [4883], local_loss=0.010189127177000046, train_loss=0.04480282962322235, time_cost=3.870204210281372
+
Steps: 0%| | 4883/1000000 [12:27:27<1864:02:23, 6.74s/it, lr=1e-5, step_loss=0.0102]
Steps: 0%| | 4884/1000000 [12:27:40<2365:03:27, 8.56s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [4884], local_loss=0.016414202749729156, train_loss=0.028600068762898445, time_cost=3.266680955886841
+
Steps: 0%| | 4884/1000000 [12:27:40<2365:03:27, 8.56s/it, lr=1e-5, step_loss=0.0164]
Steps: 0%| | 4885/1000000 [12:27:51<2603:48:54, 9.42s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [4885], local_loss=0.014548356644809246, train_loss=0.13775108754634857, time_cost=2.333400249481201
+
Steps: 0%| | 4885/1000000 [12:27:51<2603:48:54, 9.42s/it, lr=1e-5, step_loss=0.0145]
Steps: 0%| | 4886/1000000 [12:28:03<2778:55:18, 10.05s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [4886], local_loss=0.09970558434724808, train_loss=0.17549127340316772, time_cost=2.742265224456787
+
Steps: 0%| | 4886/1000000 [12:28:03<2778:55:18, 10.05s/it, lr=1e-5, step_loss=0.0997]
Steps: 0%| | 4887/1000000 [12:28:13<2830:38:51, 10.24s/it, lr=1e-5, step_loss=0.0997][RANK-0]: Step: [4887], local_loss=0.05100724473595619, train_loss=0.04430140554904938, time_cost=4.525872230529785
+
Steps: 0%| | 4887/1000000 [12:28:13<2830:38:51, 10.24s/it, lr=1e-5, step_loss=0.051]
Steps: 0%| | 4888/1000000 [12:28:19<2427:23:16, 8.78s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [4888], local_loss=0.013093853369355202, train_loss=0.024217642843723297, time_cost=2.9903998374938965
+
Steps: 0%| | 4888/1000000 [12:28:19<2427:23:16, 8.78s/it, lr=1e-5, step_loss=0.0131]
Steps: 0%| | 4889/1000000 [12:28:28<2452:26:49, 8.87s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [4889], local_loss=0.014470499008893967, train_loss=0.03567829728126526, time_cost=1.9756438732147217
+
Steps: 0%| | 4889/1000000 [12:28:28<2452:26:49, 8.87s/it, lr=1e-5, step_loss=0.0145]
Steps: 0%| | 4890/1000000 [12:28:32<2098:03:44, 7.59s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [4890], local_loss=0.018289418891072273, train_loss=0.038870617747306824, time_cost=2.1936395168304443
+
Steps: 0%| | 4890/1000000 [12:28:32<2098:03:44, 7.59s/it, lr=1e-5, step_loss=0.0183]
Steps: 0%| | 4891/1000000 [12:28:44<2418:39:37, 8.75s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [4891], local_loss=0.44272902607917786, train_loss=0.1054958701133728, time_cost=1.613445520401001
+
Steps: 0%| | 4891/1000000 [12:28:44<2418:39:37, 8.75s/it, lr=1e-5, step_loss=0.443]
Steps: 0%| | 4892/1000000 [12:28:55<2595:46:06, 9.39s/it, lr=1e-5, step_loss=0.443][RANK-0]: Step: [4892], local_loss=0.29388663172721863, train_loss=0.08778702467679977, time_cost=5.759018182754517
+
Steps: 0%| | 4892/1000000 [12:28:55<2595:46:06, 9.39s/it, lr=1e-5, step_loss=0.294]
Steps: 0%| | 4893/1000000 [12:29:06<2781:53:33, 10.06s/it, lr=1e-5, step_loss=0.294][RANK-0]: Step: [4893], local_loss=0.011099182069301605, train_loss=0.09905439615249634, time_cost=4.357545375823975
+
Steps: 0%| | 4893/1000000 [12:29:06<2781:53:33, 10.06s/it, lr=1e-5, step_loss=0.0111]
Steps: 0%| | 4894/1000000 [12:29:12<2401:51:49, 8.69s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [4894], local_loss=0.02674948424100876, train_loss=0.021367263048887253, time_cost=2.782569646835327
+
Steps: 0%| | 4894/1000000 [12:29:12<2401:51:49, 8.69s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 4895/1000000 [12:29:17<2138:50:53, 7.74s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [4895], local_loss=0.17904366552829742, train_loss=0.06816747784614563, time_cost=3.0340287685394287
+
Steps: 0%| | 4895/1000000 [12:29:17<2138:50:53, 7.74s/it, lr=1e-5, step_loss=0.179]
Steps: 0%| | 4896/1000000 [12:29:32<2722:07:01, 9.85s/it, lr=1e-5, step_loss=0.179][RANK-0]: Step: [4896], local_loss=0.015661116689443588, train_loss=0.044816385954618454, time_cost=6.253006458282471
+
Steps: 0%| | 4896/1000000 [12:29:32<2722:07:01, 9.85s/it, lr=1e-5, step_loss=0.0157]
Steps: 0%| | 4897/1000000 [12:29:37<2348:13:34, 8.50s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [4897], local_loss=0.08424645662307739, train_loss=0.12483362853527069, time_cost=2.2887706756591797
+
Steps: 0%| | 4897/1000000 [12:29:38<2348:13:34, 8.50s/it, lr=1e-5, step_loss=0.0842]
Steps: 0%| | 4898/1000000 [12:29:49<2585:36:14, 9.35s/it, lr=1e-5, step_loss=0.0842][RANK-0]: Step: [4898], local_loss=0.032050784677267075, train_loss=5.546802520751953, time_cost=2.940244674682617
+
Steps: 0%| | 4898/1000000 [12:29:49<2585:36:14, 9.35s/it, lr=1e-5, step_loss=0.0321]
Steps: 0%| | 4899/1000000 [12:30:03<3019:54:38, 10.93s/it, lr=1e-5, step_loss=0.0321][RANK-0]: Step: [4899], local_loss=0.05183231085538864, train_loss=0.04372116178274155, time_cost=5.038039922714233
+
Steps: 0%| | 4899/1000000 [12:30:03<3019:54:38, 10.93s/it, lr=1e-5, step_loss=0.0518]
Steps: 0%| | 4900/1000000 [12:30:15<3094:59:11, 11.20s/it, lr=1e-5, step_loss=0.0518][RANK-0]: Step: [4900], local_loss=0.01371039915829897, train_loss=0.02309604361653328, time_cost=2.365974187850952
+
Steps: 0%| | 4900/1000000 [12:30:15<3094:59:11, 11.20s/it, lr=1e-5, step_loss=0.0137]
Steps: 0%| | 4901/1000000 [12:30:20<2578:16:21, 9.33s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [4901], local_loss=0.02718515135347843, train_loss=0.04274222254753113, time_cost=2.4370169639587402
+
Steps: 0%| | 4901/1000000 [12:30:20<2578:16:21, 9.33s/it, lr=1e-5, step_loss=0.0272]
Steps: 0%| | 4902/1000000 [12:30:29<2540:11:32, 9.19s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [4902], local_loss=0.1496235430240631, train_loss=0.04291187971830368, time_cost=3.5633418560028076
+
Steps: 0%| | 4902/1000000 [12:30:29<2540:11:32, 9.19s/it, lr=1e-5, step_loss=0.15]
Steps: 0%| | 4903/1000000 [12:30:38<2530:20:47, 9.15s/it, lr=1e-5, step_loss=0.15][RANK-0]: Step: [4903], local_loss=0.020963603630661964, train_loss=0.04524406045675278, time_cost=1.3497288227081299
+
Steps: 0%| | 4903/1000000 [12:30:38<2530:20:47, 9.15s/it, lr=1e-5, step_loss=0.021]
Steps: 0%| | 4904/1000000 [12:30:44<2257:35:55, 8.17s/it, lr=1e-5, step_loss=0.021][RANK-0]: Step: [4904], local_loss=0.010874111205339432, train_loss=0.03210499882698059, time_cost=2.3587472438812256
+
Steps: 0%| | 4904/1000000 [12:30:44<2257:35:55, 8.17s/it, lr=1e-5, step_loss=0.0109]
Steps: 0%| | 4905/1000000 [12:30:54<2400:38:27, 8.68s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [4905], local_loss=0.015482762828469276, train_loss=0.030137546360492706, time_cost=1.2775535583496094
+
Steps: 0%| | 4905/1000000 [12:30:54<2400:38:27, 8.68s/it, lr=1e-5, step_loss=0.0155]
Steps: 0%| | 4906/1000000 [12:30:58<2056:38:59, 7.44s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [4906], local_loss=0.018255511298775673, train_loss=0.04001309722661972, time_cost=1.5830771923065186
+
Steps: 0%| | 4906/1000000 [12:30:58<2056:38:59, 7.44s/it, lr=1e-5, step_loss=0.0183]
Steps: 0%| | 4907/1000000 [12:31:04<1890:10:40, 6.84s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [4907], local_loss=0.02236618474125862, train_loss=0.05260898172855377, time_cost=2.4195716381073
+
Steps: 0%| | 4907/1000000 [12:31:04<1890:10:40, 6.84s/it, lr=1e-5, step_loss=0.0224]
Steps: 0%| | 4908/1000000 [12:31:09<1770:12:13, 6.40s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [4908], local_loss=0.32048964500427246, train_loss=0.07894165813922882, time_cost=1.3027119636535645
+
Steps: 0%| | 4908/1000000 [12:31:09<1770:12:13, 6.40s/it, lr=1e-5, step_loss=0.32]
Steps: 0%| | 4909/1000000 [12:31:23<2371:53:50, 8.58s/it, lr=1e-5, step_loss=0.32][RANK-0]: Step: [4909], local_loss=0.19332990050315857, train_loss=0.055483561009168625, time_cost=1.2320587635040283
+
Steps: 0%| | 4909/1000000 [12:31:23<2371:53:50, 8.58s/it, lr=1e-5, step_loss=0.193]
Steps: 0%| | 4910/1000000 [12:31:35<2624:46:43, 9.50s/it, lr=1e-5, step_loss=0.193][RANK-0]: Step: [4910], local_loss=0.10085582733154297, train_loss=0.057845450937747955, time_cost=3.894406795501709
+
Steps: 0%| | 4910/1000000 [12:31:35<2624:46:43, 9.50s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 4911/1000000 [12:31:42<2477:32:37, 8.96s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [4911], local_loss=0.03438540920615196, train_loss=0.06635644286870956, time_cost=1.6737802028656006
+
Steps: 0%| | 4911/1000000 [12:31:42<2477:32:37, 8.96s/it, lr=1e-5, step_loss=0.0344]
Steps: 0%| | 4912/1000000 [12:31:52<2546:12:58, 9.21s/it, lr=1e-5, step_loss=0.0344][RANK-0]: Step: [4912], local_loss=0.048637114465236664, train_loss=0.17489281296730042, time_cost=4.135071277618408
+
Steps: 0%| | 4912/1000000 [12:31:52<2546:12:58, 9.21s/it, lr=1e-5, step_loss=0.0486]
Steps: 0%| | 4913/1000000 [12:32:03<2680:46:04, 9.70s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [4913], local_loss=0.038159605115652084, train_loss=0.028022129088640213, time_cost=1.7642433643341064
+
Steps: 0%| | 4913/1000000 [12:32:03<2680:46:04, 9.70s/it, lr=1e-5, step_loss=0.0382]
Steps: 0%| | 4914/1000000 [12:32:11<2538:54:26, 9.19s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [4914], local_loss=0.035871487110853195, train_loss=0.020226649940013885, time_cost=1.3536980152130127
+
Steps: 0%| | 4914/1000000 [12:32:11<2538:54:26, 9.19s/it, lr=1e-5, step_loss=0.0359]
Steps: 0%| | 4915/1000000 [12:32:16<2216:29:07, 8.02s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [4915], local_loss=0.025669002905488014, train_loss=0.03505585342645645, time_cost=1.2221062183380127
+
Steps: 0%| | 4915/1000000 [12:32:16<2216:29:07, 8.02s/it, lr=1e-5, step_loss=0.0257]
Steps: 0%| | 4916/1000000 [12:32:22<1989:49:44, 7.20s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [4916], local_loss=0.5458774566650391, train_loss=0.0872495174407959, time_cost=2.4423255920410156
+
Steps: 0%| | 4916/1000000 [12:32:22<1989:49:44, 7.20s/it, lr=1e-5, step_loss=0.546]
Steps: 0%| | 4917/1000000 [12:32:31<2175:50:13, 7.87s/it, lr=1e-5, step_loss=0.546][RANK-0]: Step: [4917], local_loss=0.08287018537521362, train_loss=0.044373586773872375, time_cost=2.896735906600952
+
Steps: 0%| | 4917/1000000 [12:32:31<2175:50:13, 7.87s/it, lr=1e-5, step_loss=0.0829]
Steps: 0%| | 4918/1000000 [12:32:41<2334:21:25, 8.45s/it, lr=1e-5, step_loss=0.0829][RANK-0]: Step: [4918], local_loss=0.019063351675868034, train_loss=0.1414036750793457, time_cost=1.2194783687591553
+
Steps: 0%| | 4918/1000000 [12:32:41<2334:21:25, 8.45s/it, lr=1e-5, step_loss=0.0191]
Steps: 0%| | 4919/1000000 [12:32:48<2251:51:15, 8.15s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [4919], local_loss=0.019181041046977043, train_loss=0.025536637753248215, time_cost=1.2314262390136719
+
Steps: 0%| | 4919/1000000 [12:32:48<2251:51:15, 8.15s/it, lr=1e-5, step_loss=0.0192]
Steps: 0%| | 4920/1000000 [12:32:57<2274:04:50, 8.23s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [4920], local_loss=0.024761037901043892, train_loss=0.02062387391924858, time_cost=3.2484617233276367
+
Steps: 0%| | 4920/1000000 [12:32:57<2274:04:50, 8.23s/it, lr=1e-5, step_loss=0.0248]
Steps: 0%| | 4921/1000000 [12:33:14<3006:30:14, 10.88s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [4921], local_loss=0.018168695271015167, train_loss=0.05813511461019516, time_cost=9.15546441078186
+
Steps: 0%| | 4921/1000000 [12:33:14<3006:30:14, 10.88s/it, lr=1e-5, step_loss=0.0182]
Steps: 0%| | 4922/1000000 [12:33:20<2607:41:37, 9.43s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [4922], local_loss=0.022123374044895172, train_loss=0.09132420271635056, time_cost=1.561202049255371
+
Steps: 0%| | 4922/1000000 [12:33:20<2607:41:37, 9.43s/it, lr=1e-5, step_loss=0.0221]
Steps: 0%| | 4923/1000000 [12:33:31<2746:56:44, 9.94s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [4923], local_loss=0.25791266560554504, train_loss=0.08287690579891205, time_cost=2.17555832862854
+
Steps: 0%| | 4923/1000000 [12:33:31<2746:56:44, 9.94s/it, lr=1e-5, step_loss=0.258]
Steps: 0%| | 4924/1000000 [12:33:39<2580:53:48, 9.34s/it, lr=1e-5, step_loss=0.258][RANK-0]: Step: [4924], local_loss=0.010558722540736198, train_loss=0.02642189711332321, time_cost=1.2727975845336914
+
Steps: 0%| | 4924/1000000 [12:33:39<2580:53:48, 9.34s/it, lr=1e-5, step_loss=0.0106]
Steps: 0%| | 4925/1000000 [12:33:46<2411:48:19, 8.73s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [4925], local_loss=0.05877286568284035, train_loss=0.05224398523569107, time_cost=2.642454147338867
+
Steps: 0%| | 4925/1000000 [12:33:46<2411:48:19, 8.73s/it, lr=1e-5, step_loss=0.0588]
Steps: 0%| | 4926/1000000 [12:33:52<2149:00:57, 7.77s/it, lr=1e-5, step_loss=0.0588][RANK-0]: Step: [4926], local_loss=0.038695670664310455, train_loss=0.08804309368133545, time_cost=2.673785924911499
+
Steps: 0%| | 4926/1000000 [12:33:52<2149:00:57, 7.77s/it, lr=1e-5, step_loss=0.0387]
Steps: 0%| | 4927/1000000 [12:34:09<2948:22:05, 10.67s/it, lr=1e-5, step_loss=0.0387][RANK-0]: Step: [4927], local_loss=0.10965269804000854, train_loss=0.07969063520431519, time_cost=7.520640134811401
+
Steps: 0%| | 4927/1000000 [12:34:09<2948:22:05, 10.67s/it, lr=1e-5, step_loss=0.11]
Steps: 0%| | 4928/1000000 [12:34:20<2946:32:06, 10.66s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [4928], local_loss=0.03824600949883461, train_loss=0.025818657130002975, time_cost=2.1047565937042236
+
Steps: 0%| | 4928/1000000 [12:34:20<2946:32:06, 10.66s/it, lr=1e-5, step_loss=0.0382]
Steps: 0%| | 4929/1000000 [12:34:25<2529:35:54, 9.15s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [4929], local_loss=0.027084343135356903, train_loss=0.18791364133358002, time_cost=2.9234185218811035
+
Steps: 0%| | 4929/1000000 [12:34:25<2529:35:54, 9.15s/it, lr=1e-5, step_loss=0.0271]
Steps: 0%| | 4930/1000000 [12:34:37<2757:53:25, 9.98s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [4930], local_loss=0.014027101919054985, train_loss=0.02863088995218277, time_cost=4.966927528381348
+
Steps: 0%| | 4930/1000000 [12:34:37<2757:53:25, 9.98s/it, lr=1e-5, step_loss=0.014]
Steps: 0%| | 4931/1000000 [12:34:43<2414:10:04, 8.73s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [4931], local_loss=0.02555077150464058, train_loss=0.04585221782326698, time_cost=1.4159603118896484
+
Steps: 0%| | 4931/1000000 [12:34:43<2414:10:04, 8.73s/it, lr=1e-5, step_loss=0.0256]
Steps: 0%| | 4932/1000000 [12:34:50<2242:40:16, 8.11s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [4932], local_loss=0.01597348414361477, train_loss=0.02838553488254547, time_cost=1.197634220123291
+
Steps: 0%| | 4932/1000000 [12:34:50<2242:40:16, 8.11s/it, lr=1e-5, step_loss=0.016]
Steps: 0%| | 4933/1000000 [12:35:00<2403:59:24, 8.70s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [4933], local_loss=0.04207676649093628, train_loss=0.07617215812206268, time_cost=7.142117500305176
+
Steps: 0%| | 4933/1000000 [12:35:00<2403:59:24, 8.70s/it, lr=1e-5, step_loss=0.0421]
Steps: 0%| | 4934/1000000 [12:35:07<2288:32:14, 8.28s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [4934], local_loss=0.039510536938905716, train_loss=0.023570798337459564, time_cost=1.2041378021240234
+
Steps: 0%| | 4934/1000000 [12:35:07<2288:32:14, 8.28s/it, lr=1e-5, step_loss=0.0395]
Steps: 0%| | 4935/1000000 [12:35:19<2554:10:32, 9.24s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [4935], local_loss=0.03390917554497719, train_loss=0.027682699263095856, time_cost=1.220698356628418
+
Steps: 0%| | 4935/1000000 [12:35:19<2554:10:32, 9.24s/it, lr=1e-5, step_loss=0.0339]
Steps: 0%| | 4936/1000000 [12:35:30<2723:29:58, 9.85s/it, lr=1e-5, step_loss=0.0339][RANK-0]: Step: [4936], local_loss=0.08730119466781616, train_loss=0.1721343845129013, time_cost=3.5880401134490967
+
Steps: 0%| | 4936/1000000 [12:35:30<2723:29:58, 9.85s/it, lr=1e-5, step_loss=0.0873]
Steps: 0%| | 4937/1000000 [12:35:42<2909:13:47, 10.53s/it, lr=1e-5, step_loss=0.0873][RANK-0]: Step: [4937], local_loss=0.04091208428144455, train_loss=0.07956158369779587, time_cost=9.034961938858032
+
Steps: 0%| | 4937/1000000 [12:35:42<2909:13:47, 10.53s/it, lr=1e-5, step_loss=0.0409]
Steps: 0%| | 4938/1000000 [12:35:49<2659:29:54, 9.62s/it, lr=1e-5, step_loss=0.0409][RANK-0]: Step: [4938], local_loss=0.06362961232662201, train_loss=0.045802295207977295, time_cost=1.2360708713531494
+
Steps: 0%| | 4938/1000000 [12:35:49<2659:29:54, 9.62s/it, lr=1e-5, step_loss=0.0636]
Steps: 0%| | 4939/1000000 [12:35:55<2284:46:03, 8.27s/it, lr=1e-5, step_loss=0.0636][RANK-0]: Step: [4939], local_loss=0.030479319393634796, train_loss=0.07209308445453644, time_cost=1.2953402996063232
+
Steps: 0%| | 4939/1000000 [12:35:55<2284:46:03, 8.27s/it, lr=1e-5, step_loss=0.0305]
Steps: 0%| | 4940/1000000 [12:36:06<2516:46:31, 9.11s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [4940], local_loss=0.021476151421666145, train_loss=0.031952597200870514, time_cost=2.039386510848999
+
Steps: 0%| | 4940/1000000 [12:36:06<2516:46:31, 9.11s/it, lr=1e-5, step_loss=0.0215]
Steps: 0%| | 4941/1000000 [12:36:19<2889:17:40, 10.45s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [4941], local_loss=0.023683765903115273, train_loss=0.033548906445503235, time_cost=5.1659018993377686
+
Steps: 0%| | 4941/1000000 [12:36:19<2889:17:40, 10.45s/it, lr=1e-5, step_loss=0.0237]
Steps: 0%| | 4942/1000000 [12:36:31<3010:38:06, 10.89s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [4942], local_loss=0.5011700987815857, train_loss=0.1261882185935974, time_cost=3.661288022994995
+
Steps: 0%| | 4942/1000000 [12:36:31<3010:38:06, 10.89s/it, lr=1e-5, step_loss=0.501]
Steps: 0%| | 4943/1000000 [12:36:38<2667:02:34, 9.65s/it, lr=1e-5, step_loss=0.501][RANK-0]: Step: [4943], local_loss=0.020656684413552284, train_loss=0.03801244497299194, time_cost=2.1504900455474854
+
Steps: 0%| | 4943/1000000 [12:36:38<2667:02:34, 9.65s/it, lr=1e-5, step_loss=0.0207]
Steps: 0%| | 4944/1000000 [12:36:52<3017:42:23, 10.92s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [4944], local_loss=0.03717406094074249, train_loss=0.10215671360492706, time_cost=1.7351775169372559
+
Steps: 0%| | 4944/1000000 [12:36:52<3017:42:23, 10.92s/it, lr=1e-5, step_loss=0.0372]
Steps: 0%| | 4945/1000000 [12:37:03<3041:08:54, 11.00s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [4945], local_loss=0.042030591517686844, train_loss=0.14996927976608276, time_cost=4.439391851425171
+
Steps: 0%| | 4945/1000000 [12:37:03<3041:08:54, 11.00s/it, lr=1e-5, step_loss=0.042]
Steps: 0%| | 4946/1000000 [12:37:15<3133:38:46, 11.34s/it, lr=1e-5, step_loss=0.042][RANK-0]: Step: [4946], local_loss=0.010958293452858925, train_loss=0.024601232260465622, time_cost=4.906535625457764
+
Steps: 0%| | 4946/1000000 [12:37:15<3133:38:46, 11.34s/it, lr=1e-5, step_loss=0.011]
Steps: 0%| | 4947/1000000 [12:37:22<2782:11:08, 10.07s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [4947], local_loss=0.011125019751489162, train_loss=0.045953165739774704, time_cost=1.3871731758117676
+
Steps: 0%| | 4947/1000000 [12:37:22<2782:11:08, 10.07s/it, lr=1e-5, step_loss=0.0111]
Steps: 0%| | 4948/1000000 [12:37:30<2556:03:03, 9.25s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [4948], local_loss=0.01657348871231079, train_loss=0.02831082046031952, time_cost=2.825462579727173
+
Steps: 0%| | 4948/1000000 [12:37:30<2556:03:03, 9.25s/it, lr=1e-5, step_loss=0.0166]
Steps: 0%| | 4949/1000000 [12:37:39<2548:42:57, 9.22s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [4949], local_loss=0.3885097801685333, train_loss=0.08626490831375122, time_cost=1.436136245727539
+
Steps: 0%| | 4949/1000000 [12:37:39<2548:42:57, 9.22s/it, lr=1e-5, step_loss=0.389]
Steps: 0%| | 4950/1000000 [12:37:49<2651:45:33, 9.59s/it, lr=1e-5, step_loss=0.389][RANK-0]: Step: [4950], local_loss=0.029709678143262863, train_loss=0.025043314322829247, time_cost=2.722399950027466
+
Steps: 0%| | 4950/1000000 [12:37:49<2651:45:33, 9.59s/it, lr=1e-5, step_loss=0.0297]
Steps: 0%| | 4951/1000000 [12:37:55<2322:54:13, 8.40s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [4951], local_loss=0.012893455103039742, train_loss=0.054256752133369446, time_cost=1.3979651927947998
+
Steps: 0%| | 4951/1000000 [12:37:55<2322:54:13, 8.40s/it, lr=1e-5, step_loss=0.0129]
Steps: 0%| | 4952/1000000 [12:38:00<2085:40:00, 7.55s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [4952], local_loss=0.06749746203422546, train_loss=0.050404928624629974, time_cost=1.2564055919647217
+
Steps: 0%| | 4952/1000000 [12:38:00<2085:40:00, 7.55s/it, lr=1e-5, step_loss=0.0675]
Steps: 0%| | 4953/1000000 [12:38:04<1788:45:09, 6.47s/it, lr=1e-5, step_loss=0.0675][RANK-0]: Step: [4953], local_loss=0.018500862643122673, train_loss=0.06446188688278198, time_cost=1.2206690311431885
+
Steps: 0%| | 4953/1000000 [12:38:04<1788:45:09, 6.47s/it, lr=1e-5, step_loss=0.0185]
Steps: 0%| | 4954/1000000 [12:38:12<1903:22:19, 6.89s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [4954], local_loss=0.02849721349775791, train_loss=0.1562044471502304, time_cost=1.8200697898864746
+
Steps: 0%| | 4954/1000000 [12:38:12<1903:22:19, 6.89s/it, lr=1e-5, step_loss=0.0285]
Steps: 0%| | 4955/1000000 [12:38:16<1693:14:17, 6.13s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [4955], local_loss=0.055640775710344315, train_loss=0.15263116359710693, time_cost=1.877157211303711
+
Steps: 0%| | 4955/1000000 [12:38:16<1693:14:17, 6.13s/it, lr=1e-5, step_loss=0.0556]
Steps: 0%| | 4956/1000000 [12:38:29<2187:58:22, 7.92s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [4956], local_loss=0.019560661166906357, train_loss=0.02272927016019821, time_cost=2.7617034912109375
+
Steps: 0%| | 4956/1000000 [12:38:29<2187:58:22, 7.92s/it, lr=1e-5, step_loss=0.0196]
Steps: 0%| | 4957/1000000 [12:38:34<1996:50:33, 7.22s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [4957], local_loss=0.009537155739963055, train_loss=0.04321098327636719, time_cost=1.3024332523345947
+
Steps: 0%| | 4957/1000000 [12:38:34<1996:50:33, 7.22s/it, lr=1e-5, step_loss=0.00954]
Steps: 0%| | 4958/1000000 [12:38:44<2219:09:51, 8.03s/it, lr=1e-5, step_loss=0.00954][RANK-0]: Step: [4958], local_loss=0.05010141059756279, train_loss=0.06617783010005951, time_cost=1.2796454429626465
+
Steps: 0%| | 4958/1000000 [12:38:44<2219:09:51, 8.03s/it, lr=1e-5, step_loss=0.0501]
Steps: 0%| | 4959/1000000 [12:38:48<1884:55:10, 6.82s/it, lr=1e-5, step_loss=0.0501][RANK-0]: Step: [4959], local_loss=0.3948725759983063, train_loss=0.08465470373630524, time_cost=1.2876183986663818
+
Steps: 0%| | 4959/1000000 [12:38:48<1884:55:10, 6.82s/it, lr=1e-5, step_loss=0.395]
Steps: 0%| | 4960/1000000 [12:39:04<2649:55:43, 9.59s/it, lr=1e-5, step_loss=0.395][RANK-0]: Step: [4960], local_loss=0.013512343168258667, train_loss=0.1512073576450348, time_cost=8.304769039154053
+
Steps: 0%| | 4960/1000000 [12:39:04<2649:55:43, 9.59s/it, lr=1e-5, step_loss=0.0135]
Steps: 0%| | 4961/1000000 [12:39:12<2542:18:08, 9.20s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [4961], local_loss=0.014439882710576057, train_loss=0.12571804225444794, time_cost=2.407273530960083
+
Steps: 0%| | 4961/1000000 [12:39:12<2542:18:08, 9.20s/it, lr=1e-5, step_loss=0.0144]
Steps: 0%| | 4962/1000000 [12:39:18<2256:09:07, 8.16s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [4962], local_loss=0.025776155292987823, train_loss=0.019847385585308075, time_cost=4.945825576782227
+
Steps: 0%| | 4962/1000000 [12:39:18<2256:09:07, 8.16s/it, lr=1e-5, step_loss=0.0258]
Steps: 0%| | 4963/1000000 [12:39:29<2492:13:22, 9.02s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [4963], local_loss=0.025492772459983826, train_loss=0.030259007588028908, time_cost=3.558518886566162
+
Steps: 0%| | 4963/1000000 [12:39:29<2492:13:22, 9.02s/it, lr=1e-5, step_loss=0.0255]
Steps: 0%| | 4964/1000000 [12:39:34<2160:55:09, 7.82s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [4964], local_loss=1.0102342367172241, train_loss=0.15517506003379822, time_cost=1.8580985069274902
+
Steps: 0%| | 4964/1000000 [12:39:34<2160:55:09, 7.82s/it, lr=1e-5, step_loss=1.01]
Steps: 0%| | 4965/1000000 [12:39:40<1961:18:46, 7.10s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [4965], local_loss=0.04219353199005127, train_loss=0.0365736298263073, time_cost=2.355750322341919
+
Steps: 0%| | 4965/1000000 [12:39:40<1961:18:46, 7.10s/it, lr=1e-5, step_loss=0.0422]
Steps: 0%| | 4966/1000000 [12:39:51<2292:53:35, 8.30s/it, lr=1e-5, step_loss=0.0422][RANK-0]: Step: [4966], local_loss=0.32778698205947876, train_loss=0.9669303297996521, time_cost=3.332200288772583
+
Steps: 0%| | 4966/1000000 [12:39:51<2292:53:35, 8.30s/it, lr=1e-5, step_loss=0.328]
Steps: 0%| | 4967/1000000 [12:40:05<2787:28:02, 10.08s/it, lr=1e-5, step_loss=0.328][RANK-0]: Step: [4967], local_loss=0.07590515166521072, train_loss=0.05325794965028763, time_cost=4.091481685638428
+
Steps: 0%| | 4967/1000000 [12:40:05<2787:28:02, 10.08s/it, lr=1e-5, step_loss=0.0759]
Steps: 0%| | 4968/1000000 [12:40:20<3178:25:14, 11.50s/it, lr=1e-5, step_loss=0.0759][RANK-0]: Step: [4968], local_loss=0.019826263189315796, train_loss=0.0526968277990818, time_cost=1.6393980979919434
+
Steps: 0%| | 4968/1000000 [12:40:20<3178:25:14, 11.50s/it, lr=1e-5, step_loss=0.0198]
Steps: 0%| | 4969/1000000 [12:40:33<3312:47:41, 11.99s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [4969], local_loss=0.01794346049427986, train_loss=0.041281986981630325, time_cost=3.9096734523773193
+
Steps: 0%| | 4969/1000000 [12:40:33<3312:47:41, 11.99s/it, lr=1e-5, step_loss=0.0179]
Steps: 0%| | 4970/1000000 [12:40:44<3244:24:04, 11.74s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [4970], local_loss=0.01138542965054512, train_loss=0.04013730213046074, time_cost=2.3809542655944824
+
Steps: 0%| | 4970/1000000 [12:40:44<3244:24:04, 11.74s/it, lr=1e-5, step_loss=0.0114]
Steps: 0%| | 4971/1000000 [12:40:55<3192:57:55, 11.55s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [4971], local_loss=0.10094970464706421, train_loss=0.058665696531534195, time_cost=2.7537972927093506
+
Steps: 0%| | 4971/1000000 [12:40:55<3192:57:55, 11.55s/it, lr=1e-5, step_loss=0.101]
Steps: 0%| | 4972/1000000 [12:41:12<3630:07:53, 13.13s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [4972], local_loss=0.014878387562930584, train_loss=0.03126370161771774, time_cost=3.960712432861328
+
Steps: 0%| | 4972/1000000 [12:41:12<3630:07:53, 13.13s/it, lr=1e-5, step_loss=0.0149]
Steps: 0%| | 4973/1000000 [12:41:18<3002:42:45, 10.86s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [4973], local_loss=0.010646164417266846, train_loss=0.08127425611019135, time_cost=2.349421501159668
+
Steps: 0%| | 4973/1000000 [12:41:18<3002:42:45, 10.86s/it, lr=1e-5, step_loss=0.0106]
Steps: 0%| | 4974/1000000 [12:41:34<3438:04:50, 12.44s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [4974], local_loss=0.015810629352927208, train_loss=14.653273582458496, time_cost=3.6273510456085205
+
Steps: 0%| | 4974/1000000 [12:41:34<3438:04:50, 12.44s/it, lr=1e-5, step_loss=0.0158]
Steps: 0%| | 4975/1000000 [12:41:41<2994:28:42, 10.83s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [4975], local_loss=0.016133030876517296, train_loss=0.052316032350063324, time_cost=2.9474599361419678
+
Steps: 0%| | 4975/1000000 [12:41:41<2994:28:42, 10.83s/it, lr=1e-5, step_loss=0.0161]
Steps: 0%| | 4976/1000000 [12:41:52<3022:38:37, 10.94s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [4976], local_loss=0.04108542576432228, train_loss=0.033300913870334625, time_cost=1.7727816104888916
+
Steps: 0%| | 4976/1000000 [12:41:52<3022:38:37, 10.94s/it, lr=1e-5, step_loss=0.0411]
Steps: 0%| | 4977/1000000 [12:42:04<3080:05:41, 11.14s/it, lr=1e-5, step_loss=0.0411][RANK-0]: Step: [4977], local_loss=0.057579405605793, train_loss=0.02670982852578163, time_cost=1.2075762748718262
+
Steps: 0%| | 4977/1000000 [12:42:04<3080:05:41, 11.14s/it, lr=1e-5, step_loss=0.0576]
Steps: 0%| | 4978/1000000 [12:42:15<3063:29:33, 11.08s/it, lr=1e-5, step_loss=0.0576][RANK-0]: Step: [4978], local_loss=0.01498029287904501, train_loss=0.018652742728590965, time_cost=1.9768507480621338
+
Steps: 0%| | 4978/1000000 [12:42:15<3063:29:33, 11.08s/it, lr=1e-5, step_loss=0.015]
Steps: 0%| | 4979/1000000 [12:42:20<2633:31:39, 9.53s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [4979], local_loss=0.011397706344723701, train_loss=0.036583684384822845, time_cost=4.004814386367798
+
Steps: 0%| | 4979/1000000 [12:42:20<2633:31:39, 9.53s/it, lr=1e-5, step_loss=0.0114]
Steps: 0%| | 4980/1000000 [12:42:26<2297:28:38, 8.31s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [4980], local_loss=0.05387398600578308, train_loss=0.10741342604160309, time_cost=2.657327651977539
+
Steps: 0%| | 4980/1000000 [12:42:26<2297:28:38, 8.31s/it, lr=1e-5, step_loss=0.0539]
Steps: 0%| | 4981/1000000 [12:42:30<1974:26:26, 7.14s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [4981], local_loss=0.01776866801083088, train_loss=0.14490504562854767, time_cost=1.7019176483154297
+
Steps: 0%| | 4981/1000000 [12:42:30<1974:26:26, 7.14s/it, lr=1e-5, step_loss=0.0178]
Steps: 0%| | 4982/1000000 [12:42:39<2143:20:53, 7.75s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [4982], local_loss=0.01659364253282547, train_loss=0.031266950070858, time_cost=7.9259538650512695
+
Steps: 0%| | 4982/1000000 [12:42:39<2143:20:53, 7.75s/it, lr=1e-5, step_loss=0.0166]
Steps: 0%| | 4983/1000000 [12:42:54<2689:48:21, 9.73s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [4983], local_loss=0.05197014659643173, train_loss=0.07062163203954697, time_cost=4.918585538864136
+
Steps: 0%| | 4983/1000000 [12:42:54<2689:48:21, 9.73s/it, lr=1e-5, step_loss=0.052]
Steps: 0%| | 4984/1000000 [12:43:08<3049:47:17, 11.03s/it, lr=1e-5, step_loss=0.052][RANK-0]: Step: [4984], local_loss=0.01713426224887371, train_loss=0.0982305258512497, time_cost=5.145953178405762
+
Steps: 0%| | 4984/1000000 [12:43:08<3049:47:17, 11.03s/it, lr=1e-5, step_loss=0.0171]
Steps: 0%| | 4985/1000000 [12:43:15<2760:16:59, 9.99s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [4985], local_loss=0.019708432257175446, train_loss=0.02262781374156475, time_cost=1.432966709136963
+
Steps: 0%| | 4985/1000000 [12:43:15<2760:16:59, 9.99s/it, lr=1e-5, step_loss=0.0197]
Steps: 0%| | 4986/1000000 [12:43:21<2415:58:06, 8.74s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [4986], local_loss=0.024318093433976173, train_loss=0.024700844660401344, time_cost=1.7208433151245117
+
Steps: 0%| | 4986/1000000 [12:43:21<2415:58:06, 8.74s/it, lr=1e-5, step_loss=0.0243]
Steps: 0%| | 4987/1000000 [12:43:29<2306:55:52, 8.35s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [4987], local_loss=0.05282166600227356, train_loss=0.06941715627908707, time_cost=2.488895893096924
+
Steps: 0%| | 4987/1000000 [12:43:29<2306:55:52, 8.35s/it, lr=1e-5, step_loss=0.0528]
Steps: 0%| | 4988/1000000 [12:43:40<2520:40:18, 9.12s/it, lr=1e-5, step_loss=0.0528][RANK-0]: Step: [4988], local_loss=0.2505130171775818, train_loss=0.05996517837047577, time_cost=4.62212610244751
+
Steps: 0%| | 4988/1000000 [12:43:40<2520:40:18, 9.12s/it, lr=1e-5, step_loss=0.251]
Steps: 0%| | 4989/1000000 [12:43:57<3193:06:39, 11.55s/it, lr=1e-5, step_loss=0.251][RANK-0]: Step: [4989], local_loss=0.012528536841273308, train_loss=0.03706090897321701, time_cost=8.393752098083496
+
Steps: 0%| | 4989/1000000 [12:43:57<3193:06:39, 11.55s/it, lr=1e-5, step_loss=0.0125]
Steps: 0%| | 4990/1000000 [12:44:06<3029:47:50, 10.96s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [4990], local_loss=0.03592889755964279, train_loss=0.025185681879520416, time_cost=4.100345611572266
+
Steps: 0%| | 4990/1000000 [12:44:06<3029:47:50, 10.96s/it, lr=1e-5, step_loss=0.0359]
Steps: 0%| | 4991/1000000 [12:44:13<2689:59:19, 9.73s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [4991], local_loss=0.02412315458059311, train_loss=0.07641005516052246, time_cost=3.2015044689178467
+
Steps: 0%| | 4991/1000000 [12:44:13<2689:59:19, 9.73s/it, lr=1e-5, step_loss=0.0241]
Steps: 0%| | 4992/1000000 [12:44:27<3007:59:34, 10.88s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [4992], local_loss=0.1635449081659317, train_loss=0.05723653733730316, time_cost=4.147117614746094
+
Steps: 0%| | 4992/1000000 [12:44:27<3007:59:34, 10.88s/it, lr=1e-5, step_loss=0.164]
Steps: 0%| | 4993/1000000 [12:44:38<3008:22:20, 10.88s/it, lr=1e-5, step_loss=0.164][RANK-0]: Step: [4993], local_loss=0.012077460996806622, train_loss=0.055865269154310226, time_cost=7.8576905727386475
+
Steps: 0%| | 4993/1000000 [12:44:38<3008:22:20, 10.88s/it, lr=1e-5, step_loss=0.0121]
Steps: 0%| | 4994/1000000 [12:44:48<2983:29:13, 10.79s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [4994], local_loss=0.026330718770623207, train_loss=0.028002656996250153, time_cost=3.0882210731506348
+
Steps: 0%| | 4994/1000000 [12:44:48<2983:29:13, 10.79s/it, lr=1e-5, step_loss=0.0263]
Steps: 0%| | 4995/1000000 [12:45:01<3155:02:27, 11.42s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [4995], local_loss=0.012507233768701553, train_loss=0.03652408719062805, time_cost=1.2440667152404785
+
Steps: 0%| | 4995/1000000 [12:45:01<3155:02:27, 11.42s/it, lr=1e-5, step_loss=0.0125]
Steps: 0%| | 4996/1000000 [12:45:05<2559:55:29, 9.26s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [4996], local_loss=0.02667907252907753, train_loss=0.019186440855264664, time_cost=1.4009366035461426
+
Steps: 0%| | 4996/1000000 [12:45:05<2559:55:29, 9.26s/it, lr=1e-5, step_loss=0.0267]
Steps: 0%| | 4997/1000000 [12:45:10<2196:28:55, 7.95s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [4997], local_loss=0.06449904292821884, train_loss=0.16683360934257507, time_cost=2.355405807495117
+
Steps: 0%| | 4997/1000000 [12:45:10<2196:28:55, 7.95s/it, lr=1e-5, step_loss=0.0645]
Steps: 0%| | 4998/1000000 [12:45:15<1918:15:18, 6.94s/it, lr=1e-5, step_loss=0.0645][RANK-0]: Step: [4998], local_loss=0.037571147084236145, train_loss=0.03681063652038574, time_cost=3.7964680194854736
+
Steps: 0%| | 4998/1000000 [12:45:15<1918:15:18, 6.94s/it, lr=1e-5, step_loss=0.0376]
Steps: 0%| | 4999/1000000 [12:45:25<2182:20:46, 7.90s/it, lr=1e-5, step_loss=0.0376][RANK-0]: Step: [4999], local_loss=0.021908385679125786, train_loss=0.062358707189559937, time_cost=1.2214362621307373
+
Steps: 0%| | 4999/1000000 [12:45:25<2182:20:46, 7.90s/it, lr=1e-5, step_loss=0.0219]
Steps: 0%| | 5000/1000000 [12:45:34<2273:44:34, 8.23s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [5000], local_loss=0.019597060978412628, train_loss=0.28766095638275146, time_cost=1.9678022861480713
+09/19/2024 11:55:27 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000
+09/19/2024 11:55:27 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 11:55:27,145] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-19 11:55:27,176] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 11:55:27,176] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 11:55:49,000] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 11:55:49,011] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 11:56:18,240] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:18,240] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:18,240] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:20,181] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:20,182] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:20,182] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:24,838] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:24,839] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:24,839] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:25,345] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:25,345] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:25,345] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:25,350] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:25,358] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:25,358] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:25,358] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:25,408] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:25,409] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:25,526] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:25,527] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:25,527] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 11:56:25,624] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 11:56:25,624] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 11:56:25,625] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 11:56:25 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/model/diffusion_pytorch_model.safetensors
+09/19/2024 11:57:46 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/scheduler.bin
+09/19/2024 11:57:46 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/sampler.bin
+09/19/2024 11:57:46 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000/random_states_0.pkl
+09/19/2024 11:57:46 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-5000
+
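
The save sequence above is Hugging Face Accelerate delegating to DeepSpeed: `save_state` first writes the ZeRO partitions (one `mp_rank_00_model_states.pt` plus a `bf16_zero_pp_rank_*_optim_states.pt` shard per rank), then the `model` and `model_ema` weights as safetensors, and finally the scheduler, sampler, and RNG states. A minimal sketch of the periodic gate that would trigger such a save; `output_dir` and `checkpointing_steps` are assumed names, not the exact flags of this training script:

```python
# Sketch only: a periodic checkpoint gate around accelerate's save_state().
import os
from accelerate import Accelerator

accelerator = Accelerator()  # DeepSpeed/ZeRO config comes from the launcher
output_dir = "runs/allinpaint_stage1_2"  # assumed; the run above uses /home/save_dir/runs/...
checkpointing_steps = 5000

def maybe_save(global_step: int) -> None:
    if global_step % checkpointing_steps == 0:
        save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
        # Under DeepSpeed this writes the model/optimizer shards shown above,
        # then scheduler.bin, sampler.bin, and random_states_*.pkl.
        accelerator.save_state(save_path)

maybe_save(5000)  # -> runs/allinpaint_stage1_2/checkpoint-5000
```

The long 48.85s/it reading on the next step is expected: the ~2 minutes spent writing the checkpoint is folded into step 5001's iteration time, after which the moving average recovers.
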
Steps: 0%| | 5000/1000000 [12:47:53<2273:44:34, 8.23s/it, lr=1e-5, step_loss=0.0196]
Steps: 1%| | 5001/1000000 [12:47:58<13501:02:10, 48.85s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [5001], local_loss=0.06833485513925552, train_loss=0.045669592916965485, time_cost=1.23121976852417
+
Steps: 1%| | 5001/1000000 [12:47:58<13501:02:10, 48.85s/it, lr=1e-5, step_loss=0.0683]
Steps: 1%| | 5002/1000000 [12:48:10<10451:46:23, 37.82s/it, lr=1e-5, step_loss=0.0683][RANK-0]: Step: [5002], local_loss=0.02406637743115425, train_loss=0.016882648691534996, time_cost=5.954786062240601
+
Steps: 1%| | 5002/1000000 [12:48:10<10451:46:23, 37.82s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%| | 5003/1000000 [12:48:18<8040:12:35, 29.09s/it, lr=1e-5, step_loss=0.0241] [RANK-0]: Step: [5003], local_loss=0.08493727445602417, train_loss=0.07972507178783417, time_cost=1.1951403617858887
+
Steps: 1%| | 5003/1000000 [12:48:18<8040:12:35, 29.09s/it, lr=1e-5, step_loss=0.0849]
Steps: 1%| | 5004/1000000 [12:48:23<6041:35:07, 21.86s/it, lr=1e-5, step_loss=0.0849][RANK-0]: Step: [5004], local_loss=0.04377489164471626, train_loss=0.040279924869537354, time_cost=1.214613676071167
+
Steps: 1%| | 5004/1000000 [12:48:23<6041:35:07, 21.86s/it, lr=1e-5, step_loss=0.0438]
Steps: 1%| | 5005/1000000 [12:48:28<4596:47:03, 16.63s/it, lr=1e-5, step_loss=0.0438][RANK-0]: Step: [5005], local_loss=0.039352063089609146, train_loss=0.052218370139598846, time_cost=1.5727014541625977
+
Steps: 1%| | 5005/1000000 [12:48:28<4596:47:03, 16.63s/it, lr=1e-5, step_loss=0.0394]
Steps: 1%| | 5006/1000000 [12:48:42<4352:34:45, 15.75s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [5006], local_loss=0.006136917509138584, train_loss=0.054340261965990067, time_cost=5.289700746536255
+
Steps: 1%| | 5006/1000000 [12:48:42<4352:34:45, 15.75s/it, lr=1e-5, step_loss=0.00614]
Steps: 1%| | 5007/1000000 [12:48:49<3640:56:46, 13.17s/it, lr=1e-5, step_loss=0.00614][RANK-0]: Step: [5007], local_loss=0.014295907691121101, train_loss=0.02459096536040306, time_cost=3.411381721496582
+
Steps: 1%| | 5007/1000000 [12:48:49<3640:56:46, 13.17s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%| | 5008/1000000 [12:48:54<2964:58:52, 10.73s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [5008], local_loss=0.02846369706094265, train_loss=0.036669209599494934, time_cost=2.0729482173919678
+
Steps: 1%| | 5008/1000000 [12:48:54<2964:58:52, 10.73s/it, lr=1e-5, step_loss=0.0285]
Steps: 1%| | 5009/1000000 [12:49:08<3247:37:32, 11.75s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [5009], local_loss=0.017474200576543808, train_loss=0.10230844467878342, time_cost=5.689720869064331
+
Steps: 1%| | 5009/1000000 [12:49:08<3247:37:32, 11.75s/it, lr=1e-5, step_loss=0.0175]
Steps: 1%| | 5010/1000000 [12:49:23<3524:33:17, 12.75s/it, lr=1e-5, step_loss=0.0175][RANK-0]: Step: [5010], local_loss=0.015719641000032425, train_loss=0.03258293867111206, time_cost=6.51709508895874
+
Steps: 1%| | 5010/1000000 [12:49:23<3524:33:17, 12.75s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 5011/1000000 [12:49:38<3670:10:23, 13.28s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [5011], local_loss=0.013034803792834282, train_loss=0.03004380315542221, time_cost=6.589384317398071
+
Steps: 1%| | 5011/1000000 [12:49:38<3670:10:23, 13.28s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 5012/1000000 [12:49:53<3860:33:19, 13.97s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [5012], local_loss=0.027981536462903023, train_loss=0.017405454069375992, time_cost=6.7449047565460205
+
Steps: 1%| | 5012/1000000 [12:49:53<3860:33:19, 13.97s/it, lr=1e-5, step_loss=0.028]
Steps: 1%| | 5013/1000000 [12:50:00<3285:19:44, 11.89s/it, lr=1e-5, step_loss=0.028][RANK-0]: Step: [5013], local_loss=0.020349590107798576, train_loss=0.017767813056707382, time_cost=2.6067094802856445
+
Steps: 1%| | 5013/1000000 [12:50:00<3285:19:44, 11.89s/it, lr=1e-5, step_loss=0.0203]
Steps: 1%| | 5014/1000000 [12:50:12<3249:52:49, 11.76s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [5014], local_loss=0.04053270444273949, train_loss=0.03690384328365326, time_cost=2.4166948795318604
+
Steps: 1%| | 5014/1000000 [12:50:12<3249:52:49, 11.76s/it, lr=1e-5, step_loss=0.0405]
Steps: 1%| | 5015/1000000 [12:50:19<2861:47:21, 10.35s/it, lr=1e-5, step_loss=0.0405][RANK-0]: Step: [5015], local_loss=0.057702403515577316, train_loss=0.03024083562195301, time_cost=3.2486038208007812
+
Steps: 1%| | 5015/1000000 [12:50:19<2861:47:21, 10.35s/it, lr=1e-5, step_loss=0.0577]
Steps: 1%| | 5016/1000000 [12:50:28<2797:59:26, 10.12s/it, lr=1e-5, step_loss=0.0577][RANK-0]: Step: [5016], local_loss=0.019224943593144417, train_loss=0.05901356413960457, time_cost=1.2332823276519775
+
Steps: 1%| | 5016/1000000 [12:50:28<2797:59:26, 10.12s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%| | 5017/1000000 [12:50:35<2542:12:54, 9.20s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [5017], local_loss=0.045005012303590775, train_loss=0.031089579686522484, time_cost=1.2158277034759521
+
Steps: 1%| | 5017/1000000 [12:50:35<2542:12:54, 9.20s/it, lr=1e-5, step_loss=0.045]
Steps: 1%| | 5018/1000000 [12:50:43<2390:49:45, 8.65s/it, lr=1e-5, step_loss=0.045][RANK-0]: Step: [5018], local_loss=0.010875128209590912, train_loss=0.05719800665974617, time_cost=5.499797582626343
+
Steps: 1%| | 5018/1000000 [12:50:43<2390:49:45, 8.65s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%| | 5019/1000000 [12:50:53<2547:13:26, 9.22s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [5019], local_loss=0.0798388347029686, train_loss=0.07276812940835953, time_cost=3.7027976512908936
+
Steps: 1%| | 5019/1000000 [12:50:53<2547:13:26, 9.22s/it, lr=1e-5, step_loss=0.0798]
Steps: 1%| | 5020/1000000 [12:50:58<2152:40:46, 7.79s/it, lr=1e-5, step_loss=0.0798][RANK-0]: Step: [5020], local_loss=0.024015583097934723, train_loss=0.0260787270963192, time_cost=1.6109378337860107
+
Steps: 1%| | 5020/1000000 [12:50:58<2152:40:46, 7.79s/it, lr=1e-5, step_loss=0.024]
Steps: 1%| | 5021/1000000 [12:51:09<2425:33:56, 8.78s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [5021], local_loss=0.013173090294003487, train_loss=0.08954941481351852, time_cost=4.641174554824829
+
Steps: 1%| | 5021/1000000 [12:51:09<2425:33:56, 8.78s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%| | 5022/1000000 [12:51:15<2253:26:18, 8.15s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [5022], local_loss=0.06481864303350449, train_loss=0.06381801515817642, time_cost=1.2668640613555908
+
Steps: 1%| | 5022/1000000 [12:51:15<2253:26:18, 8.15s/it, lr=1e-5, step_loss=0.0648]
Steps: 1%| | 5023/1000000 [12:51:23<2176:35:55, 7.88s/it, lr=1e-5, step_loss=0.0648][RANK-0]: Step: [5023], local_loss=0.10714900493621826, train_loss=0.06201431155204773, time_cost=1.341054916381836
+
Steps: 1%| | 5023/1000000 [12:51:23<2176:35:55, 7.88s/it, lr=1e-5, step_loss=0.107]
Steps: 1%| | 5024/1000000 [12:51:29<2080:13:19, 7.53s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [5024], local_loss=0.09208221733570099, train_loss=0.16403242945671082, time_cost=1.2580676078796387
+
Steps: 1%| | 5024/1000000 [12:51:29<2080:13:19, 7.53s/it, lr=1e-5, step_loss=0.0921]
Steps: 1%| | 5025/1000000 [12:51:35<1917:02:21, 6.94s/it, lr=1e-5, step_loss=0.0921][RANK-0]: Step: [5025], local_loss=0.01119120605289936, train_loss=0.15360566973686218, time_cost=2.8624327182769775
+
Steps: 1%| | 5025/1000000 [12:51:35<1917:02:21, 6.94s/it, lr=1e-5, step_loss=0.0112]
Steps: 1%| | 5026/1000000 [12:51:40<1780:50:39, 6.44s/it, lr=1e-5, step_loss=0.0112][RANK-0]: Step: [5026], local_loss=0.016531024128198624, train_loss=0.04975437372922897, time_cost=2.4712769985198975
+
Steps: 1%| | 5026/1000000 [12:51:40<1780:50:39, 6.44s/it, lr=1e-5, step_loss=0.0165]
Steps: 1%| | 5027/1000000 [12:51:45<1614:53:26, 5.84s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [5027], local_loss=0.0389871820807457, train_loss=0.02429557964205742, time_cost=3.462449789047241
+
Steps: 1%| | 5027/1000000 [12:51:45<1614:53:26, 5.84s/it, lr=1e-5, step_loss=0.039]
Steps: 1%| | 5028/1000000 [12:51:51<1689:31:53, 6.11s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [5028], local_loss=0.06448071449995041, train_loss=0.030673466622829437, time_cost=4.881577014923096
+
Steps: 1%| | 5028/1000000 [12:51:51<1689:31:53, 6.11s/it, lr=1e-5, step_loss=0.0645]
Steps: 1%| | 5029/1000000 [12:51:56<1567:23:48, 5.67s/it, lr=1e-5, step_loss=0.0645][RANK-0]: Step: [5029], local_loss=0.16863913834095, train_loss=0.0421035960316658, time_cost=2.013715982437134
+
Steps: 1%| | 5029/1000000 [12:51:56<1567:23:48, 5.67s/it, lr=1e-5, step_loss=0.169]
Steps: 1%| | 5030/1000000 [12:52:06<1942:27:09, 7.03s/it, lr=1e-5, step_loss=0.169][RANK-0]: Step: [5030], local_loss=0.04895128682255745, train_loss=0.027526650577783585, time_cost=4.773139476776123
+
Steps: 1%| | 5030/1000000 [12:52:06<1942:27:09, 7.03s/it, lr=1e-5, step_loss=0.049]
Steps: 1%| | 5031/1000000 [12:52:12<1854:01:00, 6.71s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [5031], local_loss=0.03937281668186188, train_loss=0.03303925693035126, time_cost=4.80683445930481
+
Steps: 1%| | 5031/1000000 [12:52:12<1854:01:00, 6.71s/it, lr=1e-5, step_loss=0.0394]
Steps: 1%| | 5032/1000000 [12:52:27<2510:27:32, 9.08s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [5032], local_loss=0.3367690145969391, train_loss=0.07520981132984161, time_cost=4.902301073074341
+
Steps: 1%| | 5032/1000000 [12:52:27<2510:27:32, 9.08s/it, lr=1e-5, step_loss=0.337]
Steps: 1%| | 5033/1000000 [12:52:31<2123:21:41, 7.68s/it, lr=1e-5, step_loss=0.337][RANK-0]: Step: [5033], local_loss=0.0200527124106884, train_loss=0.07206819206476212, time_cost=1.630812168121338
+
Steps: 1%| | 5033/1000000 [12:52:31<2123:21:41, 7.68s/it, lr=1e-5, step_loss=0.0201]
Steps: 1%| | 5034/1000000 [12:52:43<2460:13:13, 8.90s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [5034], local_loss=0.034117285162210464, train_loss=0.0456213653087616, time_cost=1.2774958610534668
+
Steps: 1%| | 5034/1000000 [12:52:43<2460:13:13, 8.90s/it, lr=1e-5, step_loss=0.0341]
Steps: 1%| | 5035/1000000 [12:52:56<2792:25:44, 10.10s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [5035], local_loss=0.9917407631874084, train_loss=0.14584490656852722, time_cost=3.4563040733337402
+
Steps: 1%| | 5035/1000000 [12:52:56<2792:25:44, 10.10s/it, lr=1e-5, step_loss=0.992]
Steps: 1%| | 5036/1000000 [12:53:05<2738:34:31, 9.91s/it, lr=1e-5, step_loss=0.992][RANK-0]: Step: [5036], local_loss=0.011946150101721287, train_loss=0.06690175086259842, time_cost=1.9642367362976074
+
Steps: 1%| | 5036/1000000 [12:53:05<2738:34:31, 9.91s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 5037/1000000 [12:53:14<2660:04:22, 9.62s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [5037], local_loss=0.029465816915035248, train_loss=0.06506079435348511, time_cost=5.898694276809692
+
Steps: 1%| | 5037/1000000 [12:53:14<2660:04:22, 9.62s/it, lr=1e-5, step_loss=0.0295]
Steps: 1%| | 5038/1000000 [12:53:19<2222:14:23, 8.04s/it, lr=1e-5, step_loss=0.0295][RANK-0]: Step: [5038], local_loss=0.08100276440382004, train_loss=0.04580269008874893, time_cost=1.3919124603271484
+
Steps: 1%| | 5038/1000000 [12:53:19<2222:14:23, 8.04s/it, lr=1e-5, step_loss=0.081]
Steps: 1%| | 5039/1000000 [12:53:31<2549:54:22, 9.23s/it, lr=1e-5, step_loss=0.081][RANK-0]: Step: [5039], local_loss=0.014158020727336407, train_loss=0.1070324257016182, time_cost=3.287639856338501
+
Steps: 1%| | 5039/1000000 [12:53:31<2549:54:22, 9.23s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%| | 5040/1000000 [12:53:42<2695:16:52, 9.75s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [5040], local_loss=0.018159810453653336, train_loss=0.08310084044933319, time_cost=3.1067047119140625
+
Steps: 1%| | 5040/1000000 [12:53:42<2695:16:52, 9.75s/it, lr=1e-5, step_loss=0.0182]
Steps: 1%| | 5041/1000000 [12:53:53<2829:46:44, 10.24s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [5041], local_loss=0.019689291715621948, train_loss=0.03796914219856262, time_cost=9.64962363243103
+
Steps: 1%| | 5041/1000000 [12:53:53<2829:46:44, 10.24s/it, lr=1e-5, step_loss=0.0197]
Steps: 1%| | 5042/1000000 [12:53:59<2464:22:06, 8.92s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [5042], local_loss=0.9927306175231934, train_loss=0.1900247037410736, time_cost=4.2087812423706055
+
Steps: 1%| | 5042/1000000 [12:53:59<2464:22:06, 8.92s/it, lr=1e-5, step_loss=0.993]
Steps: 1%| | 5043/1000000 [12:54:04<2126:22:49, 7.69s/it, lr=1e-5, step_loss=0.993][RANK-0]: Step: [5043], local_loss=0.10950780659914017, train_loss=0.061193495988845825, time_cost=1.512319564819336
+
Steps: 1%| | 5043/1000000 [12:54:04<2126:22:49, 7.69s/it, lr=1e-5, step_loss=0.11]
Steps: 1%| | 5044/1000000 [12:54:10<1996:23:47, 7.22s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [5044], local_loss=0.05074876919388771, train_loss=0.03976660966873169, time_cost=1.9619414806365967
+
Steps: 1%| | 5044/1000000 [12:54:10<1996:23:47, 7.22s/it, lr=1e-5, step_loss=0.0507]
Steps: 1%| | 5045/1000000 [12:54:14<1758:46:17, 6.36s/it, lr=1e-5, step_loss=0.0507][RANK-0]: Step: [5045], local_loss=0.09642134606838226, train_loss=0.17042037844657898, time_cost=1.3545048236846924
+
Steps: 1%| | 5045/1000000 [12:54:14<1758:46:17, 6.36s/it, lr=1e-5, step_loss=0.0964]
Steps: 1%| | 5046/1000000 [12:54:21<1820:12:31, 6.59s/it, lr=1e-5, step_loss=0.0964][RANK-0]: Step: [5046], local_loss=0.021972788497805595, train_loss=0.02456703595817089, time_cost=2.71881365776062
+
Steps: 1%| | 5046/1000000 [12:54:21<1820:12:31, 6.59s/it, lr=1e-5, step_loss=0.022]
Steps: 1%| | 5047/1000000 [12:54:27<1746:51:01, 6.32s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [5047], local_loss=0.014742870815098286, train_loss=0.04690747708082199, time_cost=1.3084537982940674
+
Steps: 1%| | 5047/1000000 [12:54:27<1746:51:01, 6.32s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%| | 5048/1000000 [12:54:32<1651:33:17, 5.98s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [5048], local_loss=0.03067849576473236, train_loss=0.04723406583070755, time_cost=1.409529209136963
+
Steps: 1%| | 5048/1000000 [12:54:32<1651:33:17, 5.98s/it, lr=1e-5, step_loss=0.0307]
Steps: 1%| | 5049/1000000 [12:54:38<1671:02:26, 6.05s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [5049], local_loss=0.01944630965590477, train_loss=0.0486995130777359, time_cost=1.8808603286743164
+
Steps: 1%| | 5049/1000000 [12:54:38<1671:02:26, 6.05s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%| | 5050/1000000 [12:54:51<2217:29:12, 8.02s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [5050], local_loss=0.061633914709091187, train_loss=0.05754626542329788, time_cost=2.764054775238037
+
Steps: 1%| | 5050/1000000 [12:54:51<2217:29:12, 8.02s/it, lr=1e-5, step_loss=0.0616]
Steps: 1%| | 5051/1000000 [12:54:57<2036:16:10, 7.37s/it, lr=1e-5, step_loss=0.0616][RANK-0]: Step: [5051], local_loss=0.07681899517774582, train_loss=0.03742453455924988, time_cost=3.4195680618286133
+
Steps: 1%| | 5051/1000000 [12:54:57<2036:16:10, 7.37s/it, lr=1e-5, step_loss=0.0768]
Steps: 1%| | 5052/1000000 [12:55:02<1883:57:02, 6.82s/it, lr=1e-5, step_loss=0.0768][RANK-0]: Step: [5052], local_loss=0.01685449667274952, train_loss=0.05802423879504204, time_cost=2.927687406539917
+
Steps: 1%| | 5052/1000000 [12:55:02<1883:57:02, 6.82s/it, lr=1e-5, step_loss=0.0169]
Steps: 1%| | 5053/1000000 [12:55:14<2307:35:14, 8.35s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [5053], local_loss=0.02536141313612461, train_loss=0.07320714741945267, time_cost=2.9740889072418213
+
Steps: 1%| | 5053/1000000 [12:55:14<2307:35:14, 8.35s/it, lr=1e-5, step_loss=0.0254]
Steps: 1%| | 5054/1000000 [12:55:29<2845:44:03, 10.30s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [5054], local_loss=0.01111633237451315, train_loss=0.08268677443265915, time_cost=6.559409856796265
+
Steps: 1%| | 5054/1000000 [12:55:29<2845:44:03, 10.30s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%| | 5055/1000000 [12:55:37<2685:20:23, 9.72s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [5055], local_loss=0.01678323559463024, train_loss=0.017937812954187393, time_cost=1.799633502960205
+
Steps: 1%| | 5055/1000000 [12:55:37<2685:20:23, 9.72s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%| | 5056/1000000 [12:55:49<2824:22:12, 10.22s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [5056], local_loss=0.02159987948834896, train_loss=0.04172015190124512, time_cost=1.3423309326171875
+
Steps: 1%| | 5056/1000000 [12:55:49<2824:22:12, 10.22s/it, lr=1e-5, step_loss=0.0216]
Steps: 1%| | 5057/1000000 [12:55:55<2506:48:39, 9.07s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [5057], local_loss=0.03300017490983009, train_loss=0.09631931781768799, time_cost=2.481074810028076
+
Steps: 1%| | 5057/1000000 [12:55:55<2506:48:39, 9.07s/it, lr=1e-5, step_loss=0.033]
Steps: 1%| | 5058/1000000 [12:56:03<2357:37:18, 8.53s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [5058], local_loss=0.10171552747488022, train_loss=0.04808255657553673, time_cost=1.3410305976867676
+
Steps: 1%| | 5058/1000000 [12:56:03<2357:37:18, 8.53s/it, lr=1e-5, step_loss=0.102]
Steps: 1%| | 5059/1000000 [12:56:14<2603:49:44, 9.42s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [5059], local_loss=0.014837754890322685, train_loss=0.028722841292619705, time_cost=1.3472259044647217
+
Steps: 1%| | 5059/1000000 [12:56:14<2603:49:44, 9.42s/it, lr=1e-5, step_loss=0.0148]
Steps: 1%| | 5060/1000000 [12:56:20<2284:16:25, 8.27s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [5060], local_loss=0.06548161804676056, train_loss=0.05242108181118965, time_cost=2.5304298400878906
+
Steps: 1%| | 5060/1000000 [12:56:20<2284:16:25, 8.27s/it, lr=1e-5, step_loss=0.0655]
Steps: 1%| | 5061/1000000 [12:56:29<2377:43:24, 8.60s/it, lr=1e-5, step_loss=0.0655][RANK-0]: Step: [5061], local_loss=0.08205848932266235, train_loss=0.08486007153987885, time_cost=3.8191769123077393
+
Steps: 1%| | 5061/1000000 [12:56:29<2377:43:24, 8.60s/it, lr=1e-5, step_loss=0.0821]
Steps: 1%| | 5062/1000000 [12:56:40<2591:26:15, 9.38s/it, lr=1e-5, step_loss=0.0821][RANK-0]: Step: [5062], local_loss=0.0228598490357399, train_loss=0.06732562184333801, time_cost=1.3294384479522705
+
Steps: 1%| | 5062/1000000 [12:56:40<2591:26:15, 9.38s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%| | 5063/1000000 [12:56:47<2403:57:43, 8.70s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [5063], local_loss=0.011859308928251266, train_loss=0.04930754005908966, time_cost=1.2829868793487549
+
Steps: 1%| | 5063/1000000 [12:56:47<2403:57:43, 8.70s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 5064/1000000 [12:56:58<2592:12:19, 9.38s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [5064], local_loss=0.521508514881134, train_loss=0.09658902883529663, time_cost=1.6705546379089355
+
Steps: 1%| | 5064/1000000 [12:56:58<2592:12:19, 9.38s/it, lr=1e-5, step_loss=0.522]
Steps: 1%| | 5065/1000000 [12:57:03<2232:15:30, 8.08s/it, lr=1e-5, step_loss=0.522][RANK-0]: Step: [5065], local_loss=0.008912673220038414, train_loss=0.026906562969088554, time_cost=2.0414679050445557
+
Steps: 1%| | 5065/1000000 [12:57:03<2232:15:30, 8.08s/it, lr=1e-5, step_loss=0.00891]
Steps: 1%| | 5066/1000000 [12:57:14<2483:10:54, 8.98s/it, lr=1e-5, step_loss=0.00891][RANK-0]: Step: [5066], local_loss=0.01421082392334938, train_loss=0.03481721132993698, time_cost=2.908082962036133
+
Steps: 1%| | 5066/1000000 [12:57:14<2483:10:54, 8.98s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%| | 5067/1000000 [12:57:24<2508:00:35, 9.07s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [5067], local_loss=0.012818647548556328, train_loss=0.09762048721313477, time_cost=1.969623327255249
+
Steps: 1%| | 5067/1000000 [12:57:24<2508:00:35, 9.07s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%| | 5068/1000000 [12:57:32<2421:39:22, 8.76s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [5068], local_loss=0.013031148351728916, train_loss=0.10557583719491959, time_cost=3.6625795364379883
+
Steps: 1%| | 5068/1000000 [12:57:32<2421:39:22, 8.76s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 5069/1000000 [12:57:42<2538:19:13, 9.18s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [5069], local_loss=0.03668690845370293, train_loss=0.02693694271147251, time_cost=2.122861385345459
+
Steps: 1%| | 5069/1000000 [12:57:42<2538:19:13, 9.18s/it, lr=1e-5, step_loss=0.0367]
Steps: 1%| | 5070/1000000 [12:57:50<2411:13:07, 8.72s/it, lr=1e-5, step_loss=0.0367][RANK-0]: Step: [5070], local_loss=0.07339587062597275, train_loss=0.04781638830900192, time_cost=1.668748378753662
+
Steps: 1%| | 5070/1000000 [12:57:50<2411:13:07, 8.72s/it, lr=1e-5, step_loss=0.0734]
Steps: 1%| | 5071/1000000 [12:57:56<2198:07:33, 7.95s/it, lr=1e-5, step_loss=0.0734][RANK-0]: Step: [5071], local_loss=0.020983414724469185, train_loss=0.02663019672036171, time_cost=4.40252161026001
+
Steps: 1%| | 5071/1000000 [12:57:56<2198:07:33, 7.95s/it, lr=1e-5, step_loss=0.021]
Steps: 1%| | 5072/1000000 [12:58:06<2398:36:28, 8.68s/it, lr=1e-5, step_loss=0.021][RANK-0]: Step: [5072], local_loss=0.014459545724093914, train_loss=0.04668021947145462, time_cost=8.761589527130127
+
Steps: 1%| | 5072/1000000 [12:58:06<2398:36:28, 8.68s/it, lr=1e-5, step_loss=0.0145]
Steps: 1%| | 5073/1000000 [12:58:16<2465:20:58, 8.92s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [5073], local_loss=0.0196877159178257, train_loss=0.06453091651201248, time_cost=4.592460632324219
+
Steps: 1%| | 5073/1000000 [12:58:16<2465:20:58, 8.92s/it, lr=1e-5, step_loss=0.0197]
Steps: 1%| | 5074/1000000 [12:58:23<2342:25:59, 8.48s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [5074], local_loss=0.013147582300007343, train_loss=0.026312001049518585, time_cost=1.728175401687622
+
Steps: 1%| | 5074/1000000 [12:58:23<2342:25:59, 8.48s/it, lr=1e-5, step_loss=0.0131]
Steps: 1%| | 5075/1000000 [12:58:27<1995:04:20, 7.22s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [5075], local_loss=0.025285476818680763, train_loss=0.02453133463859558, time_cost=1.266599416732788
+
Steps: 1%| | 5075/1000000 [12:58:27<1995:04:20, 7.22s/it, lr=1e-5, step_loss=0.0253]
Steps: 1%| | 5076/1000000 [12:58:35<1997:04:51, 7.23s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [5076], local_loss=0.012789450585842133, train_loss=0.031915441155433655, time_cost=1.2507741451263428
+
Steps: 1%| | 5076/1000000 [12:58:35<1997:04:51, 7.23s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%| | 5077/1000000 [12:58:45<2236:43:51, 8.09s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [5077], local_loss=0.01739668846130371, train_loss=0.14785858988761902, time_cost=1.8239123821258545
+
Steps: 1%| | 5077/1000000 [12:58:45<2236:43:51, 8.09s/it, lr=1e-5, step_loss=0.0174]
Steps: 1%| | 5078/1000000 [12:58:59<2768:42:32, 10.02s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [5078], local_loss=0.02070174366235733, train_loss=0.023466072976589203, time_cost=6.399979591369629
+
Steps: 1%| | 5078/1000000 [12:58:59<2768:42:32, 10.02s/it, lr=1e-5, step_loss=0.0207]
Steps: 1%| | 5079/1000000 [12:59:03<2299:48:02, 8.32s/it, lr=1e-5, step_loss=0.0207][RANK-0]: Step: [5079], local_loss=0.027206888422369957, train_loss=0.019413143396377563, time_cost=1.2444725036621094
+
Steps: 1%| | 5079/1000000 [12:59:04<2299:48:02, 8.32s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%| | 5080/1000000 [12:59:15<2549:34:59, 9.23s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [5080], local_loss=0.06858920305967331, train_loss=0.07888461649417877, time_cost=3.843757390975952
+
Steps: 1%| | 5080/1000000 [12:59:15<2549:34:59, 9.23s/it, lr=1e-5, step_loss=0.0686]
Steps: 1%| | 5081/1000000 [12:59:26<2710:02:14, 9.81s/it, lr=1e-5, step_loss=0.0686][RANK-0]: Step: [5081], local_loss=0.04282454028725624, train_loss=0.1890338659286499, time_cost=2.555431365966797
+
Steps: 1%| | 5081/1000000 [12:59:26<2710:02:14, 9.81s/it, lr=1e-5, step_loss=0.0428]
Steps: 1%| | 5082/1000000 [12:59:41<3103:33:44, 11.23s/it, lr=1e-5, step_loss=0.0428][RANK-0]: Step: [5082], local_loss=0.010147985070943832, train_loss=0.01813504658639431, time_cost=5.2694830894470215
+
Steps: 1%| | 5082/1000000 [12:59:41<3103:33:44, 11.23s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 5083/1000000 [12:59:55<3341:39:34, 12.09s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [5083], local_loss=0.23687969148159027, train_loss=0.04348105564713478, time_cost=10.724687814712524
+
Steps: 1%| | 5083/1000000 [12:59:55<3341:39:34, 12.09s/it, lr=1e-5, step_loss=0.237]
Steps: 1%| | 5084/1000000 [13:00:06<3275:20:49, 11.85s/it, lr=1e-5, step_loss=0.237][RANK-0]: Step: [5084], local_loss=0.036892395466566086, train_loss=0.08699989318847656, time_cost=8.03857707977295
+
Steps: 1%| | 5084/1000000 [13:00:06<3275:20:49, 11.85s/it, lr=1e-5, step_loss=0.0369]
Steps: 1%| | 5085/1000000 [13:00:17<3221:59:58, 11.66s/it, lr=1e-5, step_loss=0.0369][RANK-0]: Step: [5085], local_loss=0.02752331830561161, train_loss=17.244842529296875, time_cost=1.2941925525665283
+
Steps: 1%| | 5085/1000000 [13:00:17<3221:59:58, 11.66s/it, lr=1e-5, step_loss=0.0275]
Steps: 1%| | 5086/1000000 [13:00:22<2616:57:47, 9.47s/it, lr=1e-5, step_loss=0.0275][RANK-0]: Step: [5086], local_loss=0.022502951323986053, train_loss=0.03213001787662506, time_cost=1.4444615840911865
+
Steps: 1%| | 5086/1000000 [13:00:22<2616:57:47, 9.47s/it, lr=1e-5, step_loss=0.0225]
Steps: 1%| | 5087/1000000 [13:00:37<3152:03:21, 11.41s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [5087], local_loss=0.01892853155732155, train_loss=0.02959580533206463, time_cost=3.630195379257202
+
Steps: 1%| | 5087/1000000 [13:00:37<3152:03:21, 11.41s/it, lr=1e-5, step_loss=0.0189]
Steps: 1%| | 5088/1000000 [13:00:52<3450:27:40, 12.49s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [5088], local_loss=0.009424206800758839, train_loss=0.02234419248998165, time_cost=5.351763486862183
+
Steps: 1%| | 5088/1000000 [13:00:52<3450:27:40, 12.49s/it, lr=1e-5, step_loss=0.00942]
[Training log, steps 5089–5312 of 1000000 (1%), lr=1e-5. Per-step local_loss typically 0.006–0.4, with occasional spikes near or above 1 (step 5162: 1.00; step 5207: 0.86; step 5299: 1.01) and two large outliers (step 5232: local_loss=180.25; step 5254: local_loss=317.63). Running train_loss mostly 0.01–0.3, with intermittent outliers (e.g., 17.35 at step 5118, 22.78 at step 5139, 38.96 at step 5195, 39.83 at step 5254). Iteration time fluctuates between roughly 5.5 and 12.3 s/it; per-step time_cost between roughly 1.2 and 13.9 s. Each step is reported once by the tqdm progress bar and once as a [RANK-0] line of the form:

Steps: 1%| | 5089/1000000 [13:01:04<3343:53:16, 12.10s/it, lr=1e-5, step_loss=0.00942][RANK-0]: Step: [5089], local_loss=0.026947904378175735, train_loss=0.06699542701244354, time_cost=1.7779405117034912]
+
Steps: 1%| | 5312/1000000 [13:34:35<2368:10:18, 8.57s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%| | 5313/1000000 [13:34:41<2185:02:47, 7.91s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [5313], local_loss=0.06024082750082016, train_loss=0.08718608319759369, time_cost=2.0185885429382324
+
Steps: 1%| | 5313/1000000 [13:34:41<2185:02:47, 7.91s/it, lr=1e-5, step_loss=0.0602]
Steps: 1%| | 5314/1000000 [13:34:53<2528:16:23, 9.15s/it, lr=1e-5, step_loss=0.0602][RANK-0]: Step: [5314], local_loss=0.011064663529396057, train_loss=0.05282661318778992, time_cost=4.582218647003174
+
Steps: 1%| | 5314/1000000 [13:34:53<2528:16:23, 9.15s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%| | 5315/1000000 [13:34:59<2259:48:57, 8.18s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [5315], local_loss=0.012289434671401978, train_loss=0.08332660794258118, time_cost=1.6282966136932373
+
Steps: 1%| | 5315/1000000 [13:34:59<2259:48:57, 8.18s/it, lr=1e-5, step_loss=0.0123]
Steps: 1%| | 5316/1000000 [13:35:05<2030:40:10, 7.35s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [5316], local_loss=0.11765186488628387, train_loss=0.04338063299655914, time_cost=1.4993884563446045
+
Steps: 1%| | 5316/1000000 [13:35:05<2030:40:10, 7.35s/it, lr=1e-5, step_loss=0.118]
Steps: 1%| | 5317/1000000 [13:35:18<2483:04:30, 8.99s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [5317], local_loss=0.028789183124899864, train_loss=0.03116503916680813, time_cost=1.5942330360412598
+
Steps: 1%| | 5317/1000000 [13:35:18<2483:04:30, 8.99s/it, lr=1e-5, step_loss=0.0288]
Steps: 1%| | 5318/1000000 [13:35:31<2846:23:07, 10.30s/it, lr=1e-5, step_loss=0.0288][RANK-0]: Step: [5318], local_loss=0.029957175254821777, train_loss=0.07492853701114655, time_cost=1.2065856456756592
+
Steps: 1%| | 5318/1000000 [13:35:31<2846:23:07, 10.30s/it, lr=1e-5, step_loss=0.03]
Steps: 1%| | 5319/1000000 [13:35:44<3109:58:45, 11.26s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [5319], local_loss=0.012169064953923225, train_loss=0.03712041676044464, time_cost=1.2340145111083984
+
Steps: 1%| | 5319/1000000 [13:35:44<3109:58:45, 11.26s/it, lr=1e-5, step_loss=0.0122]
Steps: 1%| | 5320/1000000 [13:35:50<2651:06:53, 9.60s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [5320], local_loss=0.03413970768451691, train_loss=0.17005538940429688, time_cost=1.8726162910461426
+
Steps: 1%| | 5320/1000000 [13:35:50<2651:06:53, 9.60s/it, lr=1e-5, step_loss=0.0341]
Steps: 1%| | 5321/1000000 [13:36:02<2877:39:49, 10.42s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [5321], local_loss=0.010197681374847889, train_loss=0.04945109039545059, time_cost=5.735039949417114
+
Steps: 1%| | 5321/1000000 [13:36:02<2877:39:49, 10.42s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 5322/1000000 [13:36:10<2600:51:11, 9.41s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [5322], local_loss=0.0324355885386467, train_loss=32.02357482910156, time_cost=2.5333340167999268
+
Steps: 1%| | 5322/1000000 [13:36:10<2600:51:11, 9.41s/it, lr=1e-5, step_loss=0.0324]
Steps: 1%| | 5323/1000000 [13:36:21<2761:42:01, 10.00s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [5323], local_loss=0.01403661910444498, train_loss=0.036381691694259644, time_cost=1.2323660850524902
+
Steps: 1%| | 5323/1000000 [13:36:21<2761:42:01, 10.00s/it, lr=1e-5, step_loss=0.014]
Steps: 1%| | 5324/1000000 [13:36:25<2280:52:15, 8.26s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [5324], local_loss=0.02080545574426651, train_loss=0.02465178072452545, time_cost=3.2305779457092285
+
Steps: 1%| | 5324/1000000 [13:36:25<2280:52:15, 8.26s/it, lr=1e-5, step_loss=0.0208]
Steps: 1%| | 5325/1000000 [13:36:38<2646:04:52, 9.58s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [5325], local_loss=0.02409154176712036, train_loss=0.029734153300523758, time_cost=3.4128899574279785
+
Steps: 1%| | 5325/1000000 [13:36:38<2646:04:52, 9.58s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%| | 5326/1000000 [13:36:52<3033:17:42, 10.98s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [5326], local_loss=0.019115235656499863, train_loss=0.07361309975385666, time_cost=2.232553005218506
+
Steps: 1%| | 5326/1000000 [13:36:52<3033:17:42, 10.98s/it, lr=1e-5, step_loss=0.0191]
Steps: 1%| | 5327/1000000 [13:36:59<2716:15:45, 9.83s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [5327], local_loss=0.1563062220811844, train_loss=0.06130886822938919, time_cost=3.0036723613739014
+
Steps: 1%| | 5327/1000000 [13:36:59<2716:15:45, 9.83s/it, lr=1e-5, step_loss=0.156]
Steps: 1%| | 5328/1000000 [13:37:05<2356:15:08, 8.53s/it, lr=1e-5, step_loss=0.156][RANK-0]: Step: [5328], local_loss=0.02217422053217888, train_loss=0.026371456682682037, time_cost=3.0586130619049072
+
Steps: 1%| | 5328/1000000 [13:37:05<2356:15:08, 8.53s/it, lr=1e-5, step_loss=0.0222]
Steps: 1%| | 5329/1000000 [13:37:14<2400:45:53, 8.69s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [5329], local_loss=0.009977822192013264, train_loss=0.017874356359243393, time_cost=1.2260963916778564
+
Steps: 1%| | 5329/1000000 [13:37:14<2400:45:53, 8.69s/it, lr=1e-5, step_loss=0.00998]
Steps: 1%| | 5330/1000000 [13:37:18<2045:40:30, 7.40s/it, lr=1e-5, step_loss=0.00998][RANK-0]: Step: [5330], local_loss=0.053552985191345215, train_loss=0.03891295567154884, time_cost=1.401890754699707
+
Steps: 1%| | 5330/1000000 [13:37:18<2045:40:30, 7.40s/it, lr=1e-5, step_loss=0.0536]
Steps: 1%| | 5331/1000000 [13:37:26<2055:25:49, 7.44s/it, lr=1e-5, step_loss=0.0536][RANK-0]: Step: [5331], local_loss=0.04145673289895058, train_loss=0.07984183728694916, time_cost=2.936837673187256
+
Steps: 1%| | 5331/1000000 [13:37:26<2055:25:49, 7.44s/it, lr=1e-5, step_loss=0.0415]
Steps: 1%| | 5332/1000000 [13:37:34<2142:42:13, 7.76s/it, lr=1e-5, step_loss=0.0415][RANK-0]: Step: [5332], local_loss=0.20632904767990112, train_loss=0.08178536593914032, time_cost=2.031315326690674
+
Steps: 1%| | 5332/1000000 [13:37:34<2142:42:13, 7.76s/it, lr=1e-5, step_loss=0.206]
Steps: 1%| | 5333/1000000 [13:37:45<2423:19:54, 8.77s/it, lr=1e-5, step_loss=0.206][RANK-0]: Step: [5333], local_loss=0.013924829661846161, train_loss=0.046528980135917664, time_cost=1.988663911819458
+
Steps: 1%| | 5333/1000000 [13:37:45<2423:19:54, 8.77s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%| | 5334/1000000 [13:37:57<2670:14:38, 9.66s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [5334], local_loss=0.009998172521591187, train_loss=0.0449654683470726, time_cost=6.32366681098938
+
Steps: 1%| | 5334/1000000 [13:37:57<2670:14:38, 9.66s/it, lr=1e-5, step_loss=0.01]
Steps: 1%| | 5335/1000000 [13:38:03<2374:58:45, 8.60s/it, lr=1e-5, step_loss=0.01][RANK-0]: Step: [5335], local_loss=0.04205545410513878, train_loss=0.04486832767724991, time_cost=1.2291035652160645
+
Steps: 1%| | 5335/1000000 [13:38:03<2374:58:45, 8.60s/it, lr=1e-5, step_loss=0.0421]
Steps: 1%| | 5336/1000000 [13:38:07<2024:45:07, 7.33s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [5336], local_loss=0.06670991331338882, train_loss=0.03265785798430443, time_cost=1.386383295059204
+
Steps: 1%| | 5336/1000000 [13:38:08<2024:45:07, 7.33s/it, lr=1e-5, step_loss=0.0667]
Steps: 1%| | 5337/1000000 [13:38:16<2150:29:10, 7.78s/it, lr=1e-5, step_loss=0.0667][RANK-0]: Step: [5337], local_loss=0.054772283881902695, train_loss=0.05328153818845749, time_cost=1.5690951347351074
+
Steps: 1%| | 5337/1000000 [13:38:16<2150:29:10, 7.78s/it, lr=1e-5, step_loss=0.0548]
Steps: 1%| | 5338/1000000 [13:38:31<2701:39:44, 9.78s/it, lr=1e-5, step_loss=0.0548][RANK-0]: Step: [5338], local_loss=0.07485637068748474, train_loss=0.07767914235591888, time_cost=4.931896924972534
+
Steps: 1%| | 5338/1000000 [13:38:31<2701:39:44, 9.78s/it, lr=1e-5, step_loss=0.0749]
Steps: 1%| | 5339/1000000 [13:38:40<2651:23:36, 9.60s/it, lr=1e-5, step_loss=0.0749][RANK-0]: Step: [5339], local_loss=0.015226626768708229, train_loss=0.037721678614616394, time_cost=4.165378093719482
+
Steps: 1%| | 5339/1000000 [13:38:40<2651:23:36, 9.60s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%| | 5340/1000000 [13:38:47<2478:33:55, 8.97s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [5340], local_loss=0.13337093591690063, train_loss=0.04600534588098526, time_cost=1.2433366775512695
+
Steps: 1%| | 5340/1000000 [13:38:47<2478:33:55, 8.97s/it, lr=1e-5, step_loss=0.133]
Steps: 1%| | 5341/1000000 [13:38:56<2406:48:25, 8.71s/it, lr=1e-5, step_loss=0.133][RANK-0]: Step: [5341], local_loss=0.06033216416835785, train_loss=0.031705599278211594, time_cost=3.1093664169311523
+
Steps: 1%| | 5341/1000000 [13:38:56<2406:48:25, 8.71s/it, lr=1e-5, step_loss=0.0603]
Steps: 1%| | 5342/1000000 [13:39:07<2634:49:14, 9.54s/it, lr=1e-5, step_loss=0.0603][RANK-0]: Step: [5342], local_loss=0.025630956515669823, train_loss=0.07499261945486069, time_cost=8.081340789794922
+
Steps: 1%| | 5342/1000000 [13:39:07<2634:49:14, 9.54s/it, lr=1e-5, step_loss=0.0256]
Steps: 1%| | 5343/1000000 [13:39:12<2249:08:37, 8.14s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [5343], local_loss=0.10052123665809631, train_loss=0.028869371861219406, time_cost=1.8904759883880615
+
Steps: 1%| | 5343/1000000 [13:39:12<2249:08:37, 8.14s/it, lr=1e-5, step_loss=0.101]
Steps: 1%| | 5344/1000000 [13:39:17<1992:22:47, 7.21s/it, lr=1e-5, step_loss=0.101][RANK-0]: Step: [5344], local_loss=1.001072645187378, train_loss=0.16484332084655762, time_cost=3.75622296333313
+
Steps: 1%| | 5344/1000000 [13:39:17<1992:22:47, 7.21s/it, lr=1e-5, step_loss=1]
Steps: 1%| | 5345/1000000 [13:39:26<2135:30:51, 7.73s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [5345], local_loss=0.02098645083606243, train_loss=0.02314334362745285, time_cost=1.420163631439209
+
Steps: 1%| | 5345/1000000 [13:39:26<2135:30:51, 7.73s/it, lr=1e-5, step_loss=0.021]
Steps: 1%| | 5346/1000000 [13:39:40<2667:44:27, 9.66s/it, lr=1e-5, step_loss=0.021][RANK-0]: Step: [5346], local_loss=0.01493285596370697, train_loss=0.0737680047750473, time_cost=2.144963026046753
+
Steps: 1%| | 5346/1000000 [13:39:40<2667:44:27, 9.66s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%| | 5347/1000000 [13:39:45<2280:08:31, 8.25s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [5347], local_loss=0.008732839487493038, train_loss=30.70254135131836, time_cost=1.2354459762573242
+
Steps: 1%| | 5347/1000000 [13:39:45<2280:08:31, 8.25s/it, lr=1e-5, step_loss=0.00873]
Steps: 1%| | 5348/1000000 [13:39:58<2708:10:19, 9.80s/it, lr=1e-5, step_loss=0.00873][RANK-0]: Step: [5348], local_loss=44.552433013916016, train_loss=5.627819061279297, time_cost=6.660241365432739
+
Steps: 1%| | 5348/1000000 [13:39:58<2708:10:19, 9.80s/it, lr=1e-5, step_loss=44.6]
Steps: 1%| | 5349/1000000 [13:40:03<2314:26:55, 8.38s/it, lr=1e-5, step_loss=44.6][RANK-0]: Step: [5349], local_loss=0.05621443688869476, train_loss=0.06285982578992844, time_cost=2.922415018081665
+
Steps: 1%| | 5349/1000000 [13:40:03<2314:26:55, 8.38s/it, lr=1e-5, step_loss=0.0562]
Steps: 1%| | 5350/1000000 [13:40:11<2270:18:40, 8.22s/it, lr=1e-5, step_loss=0.0562][RANK-0]: Step: [5350], local_loss=0.08807045966386795, train_loss=0.03457235172390938, time_cost=3.478123903274536
+
Steps: 1%| | 5350/1000000 [13:40:11<2270:18:40, 8.22s/it, lr=1e-5, step_loss=0.0881]
Steps: 1%| | 5351/1000000 [13:40:26<2782:13:10, 10.07s/it, lr=1e-5, step_loss=0.0881][RANK-0]: Step: [5351], local_loss=0.308494508266449, train_loss=0.093330979347229, time_cost=6.493021249771118
+
Steps: 1%| | 5351/1000000 [13:40:26<2782:13:10, 10.07s/it, lr=1e-5, step_loss=0.308]
Steps: 1%| | 5352/1000000 [13:40:39<3071:41:19, 11.12s/it, lr=1e-5, step_loss=0.308][RANK-0]: Step: [5352], local_loss=0.018403811380267143, train_loss=0.03393121063709259, time_cost=5.204258918762207
+
Steps: 1%| | 5352/1000000 [13:40:39<3071:41:19, 11.12s/it, lr=1e-5, step_loss=0.0184]
Steps: 1%| | 5353/1000000 [13:40:50<3035:30:51, 10.99s/it, lr=1e-5, step_loss=0.0184][RANK-0]: Step: [5353], local_loss=0.030279699712991714, train_loss=0.02885543927550316, time_cost=1.2434370517730713
+
Steps: 1%| | 5353/1000000 [13:40:50<3035:30:51, 10.99s/it, lr=1e-5, step_loss=0.0303]
Steps: 1%| | 5354/1000000 [13:41:02<3101:41:19, 11.23s/it, lr=1e-5, step_loss=0.0303][RANK-0]: Step: [5354], local_loss=0.020158961415290833, train_loss=0.027233103290200233, time_cost=1.2426424026489258
+
Steps: 1%| | 5354/1000000 [13:41:02<3101:41:19, 11.23s/it, lr=1e-5, step_loss=0.0202]
Steps: 1%| | 5355/1000000 [13:41:15<3273:25:06, 11.85s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [5355], local_loss=0.01669621840119362, train_loss=0.03470170497894287, time_cost=3.354951858520508
+
Steps: 1%| | 5355/1000000 [13:41:15<3273:25:06, 11.85s/it, lr=1e-5, step_loss=0.0167]
Steps: 1%| | 5356/1000000 [13:41:27<3272:11:18, 11.84s/it, lr=1e-5, step_loss=0.0167][RANK-0]: Step: [5356], local_loss=0.03547484800219536, train_loss=0.05244734138250351, time_cost=1.227759599685669
+
Steps: 1%| | 5356/1000000 [13:41:27<3272:11:18, 11.84s/it, lr=1e-5, step_loss=0.0355]
Steps: 1%| | 5357/1000000 [13:41:35<2931:09:44, 10.61s/it, lr=1e-5, step_loss=0.0355][RANK-0]: Step: [5357], local_loss=0.40358787775039673, train_loss=0.09822243452072144, time_cost=1.2042644023895264
+
Steps: 1%| | 5357/1000000 [13:41:35<2931:09:44, 10.61s/it, lr=1e-5, step_loss=0.404]
Steps: 1%| | 5358/1000000 [13:41:40<2461:02:39, 8.91s/it, lr=1e-5, step_loss=0.404][RANK-0]: Step: [5358], local_loss=0.045039042830467224, train_loss=0.02904096618294716, time_cost=2.3666365146636963
+
Steps: 1%| | 5358/1000000 [13:41:40<2461:02:39, 8.91s/it, lr=1e-5, step_loss=0.045]
Steps: 1%| | 5359/1000000 [13:41:45<2151:05:22, 7.79s/it, lr=1e-5, step_loss=0.045][RANK-0]: Step: [5359], local_loss=0.023811567574739456, train_loss=0.0751781165599823, time_cost=1.3552343845367432
+
Steps: 1%| | 5359/1000000 [13:41:45<2151:05:22, 7.79s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%| | 5360/1000000 [13:41:55<2398:06:50, 8.68s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [5360], local_loss=0.07924237102270126, train_loss=0.03531085327267647, time_cost=4.0056750774383545
+
Steps: 1%| | 5360/1000000 [13:41:55<2398:06:50, 8.68s/it, lr=1e-5, step_loss=0.0792]
Steps: 1%| | 5361/1000000 [13:42:02<2196:26:10, 7.95s/it, lr=1e-5, step_loss=0.0792][RANK-0]: Step: [5361], local_loss=0.008738009259104729, train_loss=0.024129454046487808, time_cost=1.7454900741577148
+
Steps: 1%| | 5361/1000000 [13:42:02<2196:26:10, 7.95s/it, lr=1e-5, step_loss=0.00874]
Steps: 1%| | 5362/1000000 [13:42:10<2197:57:18, 7.96s/it, lr=1e-5, step_loss=0.00874][RANK-0]: Step: [5362], local_loss=0.03531057760119438, train_loss=0.04072308540344238, time_cost=1.2205712795257568
+
Steps: 1%| | 5362/1000000 [13:42:10<2197:57:18, 7.96s/it, lr=1e-5, step_loss=0.0353]
Steps: 1%| | 5363/1000000 [13:42:16<2059:29:15, 7.45s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [5363], local_loss=0.023874381557106972, train_loss=0.05512838065624237, time_cost=1.84263277053833
+
Steps: 1%| | 5363/1000000 [13:42:16<2059:29:15, 7.45s/it, lr=1e-5, step_loss=0.0239]
Steps: 1%| | 5364/1000000 [13:42:24<2104:36:25, 7.62s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [5364], local_loss=0.030499840155243874, train_loss=0.04177720844745636, time_cost=2.6792538166046143
+
Steps: 1%| | 5364/1000000 [13:42:24<2104:36:25, 7.62s/it, lr=1e-5, step_loss=0.0305]
Steps: 1%| | 5365/1000000 [13:42:34<2291:09:40, 8.29s/it, lr=1e-5, step_loss=0.0305][RANK-0]: Step: [5365], local_loss=0.17768682539463043, train_loss=0.04386645555496216, time_cost=1.2837653160095215
+
Steps: 1%| | 5365/1000000 [13:42:34<2291:09:40, 8.29s/it, lr=1e-5, step_loss=0.178]
Steps: 1%| | 5366/1000000 [13:42:47<2689:34:57, 9.73s/it, lr=1e-5, step_loss=0.178][RANK-0]: Step: [5366], local_loss=0.02160186693072319, train_loss=0.028890488669276237, time_cost=6.023944139480591
+
Steps: 1%| | 5366/1000000 [13:42:47<2689:34:57, 9.73s/it, lr=1e-5, step_loss=0.0216]
Steps: 1%| | 5367/1000000 [13:42:57<2733:53:24, 9.90s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [5367], local_loss=0.1808866262435913, train_loss=0.09443605691194534, time_cost=4.949525356292725
+
Steps: 1%| | 5367/1000000 [13:42:57<2733:53:24, 9.90s/it, lr=1e-5, step_loss=0.181]
Steps: 1%| | 5368/1000000 [13:43:03<2364:39:48, 8.56s/it, lr=1e-5, step_loss=0.181][RANK-0]: Step: [5368], local_loss=0.0225739274173975, train_loss=0.19713285565376282, time_cost=2.6852664947509766
+
Steps: 1%| | 5368/1000000 [13:43:03<2364:39:48, 8.56s/it, lr=1e-5, step_loss=0.0226]
Steps: 1%| | 5369/1000000 [13:43:18<2931:28:11, 10.61s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [5369], local_loss=0.21774913370609283, train_loss=0.15229704976081848, time_cost=7.956758499145508
+
Steps: 1%| | 5369/1000000 [13:43:18<2931:28:11, 10.61s/it, lr=1e-5, step_loss=0.218]
Steps: 1%| | 5370/1000000 [13:43:28<2853:42:34, 10.33s/it, lr=1e-5, step_loss=0.218][RANK-0]: Step: [5370], local_loss=0.012887340970337391, train_loss=0.03028177097439766, time_cost=1.8142521381378174
+
Steps: 1%| | 5370/1000000 [13:43:28<2853:42:34, 10.33s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%| | 5371/1000000 [13:43:35<2588:13:47, 9.37s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [5371], local_loss=0.010076038539409637, train_loss=0.07772746682167053, time_cost=1.430081844329834
+
Steps: 1%| | 5371/1000000 [13:43:35<2588:13:47, 9.37s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 5372/1000000 [13:43:51<3170:10:57, 11.47s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [5372], local_loss=0.02546810545027256, train_loss=0.04886987805366516, time_cost=1.2045609951019287
+
Steps: 1%| | 5372/1000000 [13:43:51<3170:10:57, 11.47s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%| | 5373/1000000 [13:43:59<2901:30:46, 10.50s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [5373], local_loss=0.012697793543338776, train_loss=0.05345083028078079, time_cost=4.243881464004517
+
Steps: 1%| | 5373/1000000 [13:43:59<2901:30:46, 10.50s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%| | 5374/1000000 [13:44:04<2383:00:52, 8.63s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [5374], local_loss=0.014274060726165771, train_loss=0.026934724301099777, time_cost=1.3268928527832031
+
Steps: 1%| | 5374/1000000 [13:44:04<2383:00:52, 8.63s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%| | 5375/1000000 [13:44:08<2036:26:47, 7.37s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [5375], local_loss=0.05141785740852356, train_loss=0.04695635661482811, time_cost=1.6689484119415283
+
Steps: 1%| | 5375/1000000 [13:44:08<2036:26:47, 7.37s/it, lr=1e-5, step_loss=0.0514]
Steps: 1%| | 5376/1000000 [13:44:19<2353:51:15, 8.52s/it, lr=1e-5, step_loss=0.0514][RANK-0]: Step: [5376], local_loss=0.25562235713005066, train_loss=0.0617532879114151, time_cost=3.2956995964050293
+
Steps: 1%| | 5376/1000000 [13:44:19<2353:51:15, 8.52s/it, lr=1e-5, step_loss=0.256]
Steps: 1%| | 5377/1000000 [13:44:30<2536:33:03, 9.18s/it, lr=1e-5, step_loss=0.256][RANK-0]: Step: [5377], local_loss=0.037707652896642685, train_loss=0.02692577987909317, time_cost=1.4156785011291504
+
Steps: 1%| | 5377/1000000 [13:44:30<2536:33:03, 9.18s/it, lr=1e-5, step_loss=0.0377]
Steps: 1%| | 5378/1000000 [13:44:38<2414:13:01, 8.74s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [5378], local_loss=0.015453418716788292, train_loss=0.05555073171854019, time_cost=2.6115520000457764
+
Steps: 1%| | 5378/1000000 [13:44:38<2414:13:01, 8.74s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%| | 5379/1000000 [13:44:45<2278:48:55, 8.25s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [5379], local_loss=0.12464731186628342, train_loss=0.0664229542016983, time_cost=3.018951177597046
+
Steps: 1%| | 5379/1000000 [13:44:45<2278:48:55, 8.25s/it, lr=1e-5, step_loss=0.125]
Steps: 1%| | 5380/1000000 [13:44:49<1952:19:50, 7.07s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [5380], local_loss=0.01111097726970911, train_loss=0.024978484958410263, time_cost=1.2625889778137207
+
Steps: 1%| | 5380/1000000 [13:44:49<1952:19:50, 7.07s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%| | 5381/1000000 [13:44:56<1968:12:41, 7.12s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [5381], local_loss=0.08095631003379822, train_loss=0.05969715863466263, time_cost=2.532900810241699
+
Steps: 1%| | 5381/1000000 [13:44:56<1968:12:41, 7.12s/it, lr=1e-5, step_loss=0.081]
Steps: 1%| | 5382/1000000 [13:45:03<1957:30:18, 7.09s/it, lr=1e-5, step_loss=0.081][RANK-0]: Step: [5382], local_loss=0.1503058820962906, train_loss=0.05295216664671898, time_cost=1.5545763969421387
+
Steps: 1%| | 5382/1000000 [13:45:03<1957:30:18, 7.09s/it, lr=1e-5, step_loss=0.15]
Steps: 1%| | 5383/1000000 [13:45:08<1727:35:04, 6.25s/it, lr=1e-5, step_loss=0.15][RANK-0]: Step: [5383], local_loss=0.012361356057226658, train_loss=0.23130984604358673, time_cost=1.506087303161621
+
Steps: 1%| | 5383/1000000 [13:45:08<1727:35:04, 6.25s/it, lr=1e-5, step_loss=0.0124]
Steps: 1%| | 5384/1000000 [13:45:21<2298:14:33, 8.32s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [5384], local_loss=0.05376201495528221, train_loss=0.05188377946615219, time_cost=2.621757984161377
+
Steps: 1%| | 5384/1000000 [13:45:21<2298:14:33, 8.32s/it, lr=1e-5, step_loss=0.0538]
Steps: 1%| | 5385/1000000 [13:45:28<2232:56:25, 8.08s/it, lr=1e-5, step_loss=0.0538][RANK-0]: Step: [5385], local_loss=0.21664008498191833, train_loss=0.07047949731349945, time_cost=2.1495516300201416
+
Steps: 1%| | 5385/1000000 [13:45:28<2232:56:25, 8.08s/it, lr=1e-5, step_loss=0.217]
Steps: 1%| | 5386/1000000 [13:45:44<2813:32:53, 10.18s/it, lr=1e-5, step_loss=0.217][RANK-0]: Step: [5386], local_loss=0.030669668689370155, train_loss=0.04185386002063751, time_cost=3.9962241649627686
+
Steps: 1%| | 5386/1000000 [13:45:44<2813:32:53, 10.18s/it, lr=1e-5, step_loss=0.0307]
Steps: 1%| | 5387/1000000 [13:45:54<2875:41:10, 10.41s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [5387], local_loss=0.031118901446461678, train_loss=0.07292254269123077, time_cost=6.8533034324646
+
Steps: 1%| | 5387/1000000 [13:45:54<2875:41:10, 10.41s/it, lr=1e-5, step_loss=0.0311]
Steps: 1%| | 5388/1000000 [13:46:05<2903:10:41, 10.51s/it, lr=1e-5, step_loss=0.0311][RANK-0]: Step: [5388], local_loss=0.05792488902807236, train_loss=0.021744053810834885, time_cost=1.5871729850769043
+
Steps: 1%| | 5388/1000000 [13:46:05<2903:10:41, 10.51s/it, lr=1e-5, step_loss=0.0579]
Steps: 1%| | 5389/1000000 [13:46:12<2594:33:07, 9.39s/it, lr=1e-5, step_loss=0.0579][RANK-0]: Step: [5389], local_loss=0.029284551739692688, train_loss=0.029499683529138565, time_cost=2.337759017944336
+
Steps: 1%| | 5389/1000000 [13:46:12<2594:33:07, 9.39s/it, lr=1e-5, step_loss=0.0293]
Steps: 1%| | 5390/1000000 [13:46:23<2717:47:50, 9.84s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [5390], local_loss=0.014638113789260387, train_loss=0.021260133013129234, time_cost=4.530960321426392
+
Steps: 1%| | 5390/1000000 [13:46:23<2717:47:50, 9.84s/it, lr=1e-5, step_loss=0.0146]
Steps: 1%| | 5391/1000000 [13:46:27<2252:09:23, 8.15s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [5391], local_loss=0.058775149285793304, train_loss=0.03058728203177452, time_cost=1.3642947673797607
+
Steps: 1%| | 5391/1000000 [13:46:27<2252:09:23, 8.15s/it, lr=1e-5, step_loss=0.0588]
Steps: 1%| | 5392/1000000 [13:46:36<2328:07:49, 8.43s/it, lr=1e-5, step_loss=0.0588][RANK-0]: Step: [5392], local_loss=0.015005587600171566, train_loss=0.0829082578420639, time_cost=3.9254140853881836
+
Steps: 1%| | 5392/1000000 [13:46:36<2328:07:49, 8.43s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 5393/1000000 [13:46:48<2625:36:58, 9.50s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [5393], local_loss=0.03965384140610695, train_loss=2.7529678344726562, time_cost=3.6822052001953125
+
Steps: 1%| | 5393/1000000 [13:46:48<2625:36:58, 9.50s/it, lr=1e-5, step_loss=0.0397]
Steps: 1%| | 5394/1000000 [13:46:52<2188:09:52, 7.92s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [5394], local_loss=0.013619310222566128, train_loss=0.048445750027894974, time_cost=1.8270375728607178
+
Steps: 1%| | 5394/1000000 [13:46:52<2188:09:52, 7.92s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 5395/1000000 [13:47:06<2626:38:27, 9.51s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [5395], local_loss=0.04758016765117645, train_loss=0.05965479463338852, time_cost=4.018592596054077
+
Steps: 1%| | 5395/1000000 [13:47:06<2626:38:27, 9.51s/it, lr=1e-5, step_loss=0.0476]
Steps: 1%| | 5396/1000000 [13:47:12<2330:29:01, 8.44s/it, lr=1e-5, step_loss=0.0476][RANK-0]: Step: [5396], local_loss=0.025748830288648605, train_loss=0.06945285201072693, time_cost=4.810011386871338
+
Steps: 1%| | 5396/1000000 [13:47:12<2330:29:01, 8.44s/it, lr=1e-5, step_loss=0.0257]
Steps: 1%| | 5397/1000000 [13:47:17<2118:39:52, 7.67s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [5397], local_loss=0.028597986325621605, train_loss=0.025251785293221474, time_cost=1.287832260131836
+
Steps: 1%| | 5397/1000000 [13:47:17<2118:39:52, 7.67s/it, lr=1e-5, step_loss=0.0286]
Steps: 1%| | 5398/1000000 [13:47:24<1994:22:31, 7.22s/it, lr=1e-5, step_loss=0.0286][RANK-0]: Step: [5398], local_loss=0.026843281462788582, train_loss=0.025474626570940018, time_cost=1.2451469898223877
+
Steps: 1%| | 5398/1000000 [13:47:24<1994:22:31, 7.22s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%| | 5399/1000000 [13:47:39<2634:42:13, 9.54s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [5399], local_loss=0.014101029373705387, train_loss=0.08611524105072021, time_cost=6.493699073791504
+
Steps: 1%| | 5399/1000000 [13:47:39<2634:42:13, 9.54s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%| | 5400/1000000 [13:47:50<2804:30:38, 10.15s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [5400], local_loss=0.00975075364112854, train_loss=0.01722734048962593, time_cost=2.7414443492889404
+
Steps: 1%| | 5400/1000000 [13:47:50<2804:30:38, 10.15s/it, lr=1e-5, step_loss=0.00975]
Steps: 1%| | 5401/1000000 [13:48:01<2875:01:27, 10.41s/it, lr=1e-5, step_loss=0.00975][RANK-0]: Step: [5401], local_loss=0.01741909421980381, train_loss=0.03335266560316086, time_cost=2.280123233795166
+
Steps: 1%| | 5401/1000000 [13:48:01<2875:01:27, 10.41s/it, lr=1e-5, step_loss=0.0174]
Steps: 1%| | 5402/1000000 [13:48:14<3101:10:01, 11.22s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [5402], local_loss=0.010981336236000061, train_loss=0.02817676216363907, time_cost=3.5144710540771484
+
Steps: 1%| | 5402/1000000 [13:48:14<3101:10:01, 11.22s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 5403/1000000 [13:48:22<2794:54:37, 10.12s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [5403], local_loss=0.022143907845020294, train_loss=0.02826816961169243, time_cost=2.30421781539917
+
Steps: 1%| | 5403/1000000 [13:48:22<2794:54:37, 10.12s/it, lr=1e-5, step_loss=0.0221]
Steps: 1%| | 5404/1000000 [13:48:33<2853:52:41, 10.33s/it, lr=1e-5, step_loss=0.0221][RANK-0]: Step: [5404], local_loss=0.028424551710486412, train_loss=0.04646149277687073, time_cost=1.5655913352966309
+
Steps: 1%| | 5404/1000000 [13:48:33<2853:52:41, 10.33s/it, lr=1e-5, step_loss=0.0284]
Steps: 1%| | 5405/1000000 [13:48:42<2768:07:06, 10.02s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [5405], local_loss=0.009950187988579273, train_loss=0.018485108390450478, time_cost=3.086778163909912
+
Steps: 1%| | 5405/1000000 [13:48:42<2768:07:06, 10.02s/it, lr=1e-5, step_loss=0.00995]
Steps: 1%| | 5406/1000000 [13:48:56<3137:45:53, 11.36s/it, lr=1e-5, step_loss=0.00995][RANK-0]: Step: [5406], local_loss=0.006370446644723415, train_loss=0.05017770081758499, time_cost=10.129151821136475
+
Steps: 1%| | 5406/1000000 [13:48:56<3137:45:53, 11.36s/it, lr=1e-5, step_loss=0.00637]
Steps: 1%| | 5407/1000000 [13:49:11<3418:41:29, 12.37s/it, lr=1e-5, step_loss=0.00637][RANK-0]: Step: [5407], local_loss=0.039528168737888336, train_loss=0.03378470987081528, time_cost=4.186876058578491
+
Steps: 1%| | 5407/1000000 [13:49:11<3418:41:29, 12.37s/it, lr=1e-5, step_loss=0.0395]
Steps: 1%| | 5408/1000000 [13:49:23<3384:31:54, 12.25s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [5408], local_loss=0.02408190816640854, train_loss=0.030829371884465218, time_cost=4.358107328414917
+
Steps: 1%| | 5408/1000000 [13:49:23<3384:31:54, 12.25s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%| | 5409/1000000 [13:49:30<2940:47:58, 10.64s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [5409], local_loss=0.06594404578208923, train_loss=0.047065965831279755, time_cost=1.2239017486572266
+
Steps: 1%| | 5409/1000000 [13:49:30<2940:47:58, 10.64s/it, lr=1e-5, step_loss=0.0659]
Steps: 1%| | 5410/1000000 [13:49:40<2853:23:10, 10.33s/it, lr=1e-5, step_loss=0.0659][RANK-0]: Step: [5410], local_loss=0.06989901512861252, train_loss=0.05958006903529167, time_cost=2.8904337882995605
+
Steps: 1%| | 5410/1000000 [13:49:40<2853:23:10, 10.33s/it, lr=1e-5, step_loss=0.0699]
Steps: 1%| | 5411/1000000 [13:49:46<2493:10:38, 9.02s/it, lr=1e-5, step_loss=0.0699][RANK-0]: Step: [5411], local_loss=0.047376446425914764, train_loss=0.03176111727952957, time_cost=2.2598352432250977
+
Steps: 1%| | 5411/1000000 [13:49:46<2493:10:38, 9.02s/it, lr=1e-5, step_loss=0.0474]
Steps: 1%| | 5412/1000000 [13:49:53<2359:46:33, 8.54s/it, lr=1e-5, step_loss=0.0474][RANK-0]: Step: [5412], local_loss=0.014499468728899956, train_loss=0.03771938756108284, time_cost=3.160186290740967
+
Steps: 1%| | 5412/1000000 [13:49:53<2359:46:33, 8.54s/it, lr=1e-5, step_loss=0.0145]
Steps: 1%| | 5413/1000000 [13:49:58<2055:07:40, 7.44s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [5413], local_loss=0.01714061014354229, train_loss=0.030029047280550003, time_cost=2.051182985305786
+
Steps: 1%| | 5413/1000000 [13:49:58<2055:07:40, 7.44s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%| | 5414/1000000 [13:50:08<2301:09:52, 8.33s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [5414], local_loss=0.03495276719331741, train_loss=0.042081236839294434, time_cost=1.2717392444610596
+
Steps: 1%| | 5414/1000000 [13:50:08<2301:09:52, 8.33s/it, lr=1e-5, step_loss=0.035]
Steps: 1%| | 5415/1000000 [13:50:14<2095:47:48, 7.59s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [5415], local_loss=0.33308398723602295, train_loss=0.05876806005835533, time_cost=2.961780548095703
+
Steps: 1%| | 5415/1000000 [13:50:14<2095:47:48, 7.59s/it, lr=1e-5, step_loss=0.333]
Steps: 1%| | 5416/1000000 [13:50:19<1888:49:47, 6.84s/it, lr=1e-5, step_loss=0.333][RANK-0]: Step: [5416], local_loss=0.19065497815608978, train_loss=0.17682072520256042, time_cost=3.8748581409454346
+
Steps: 1%| | 5416/1000000 [13:50:19<1888:49:47, 6.84s/it, lr=1e-5, step_loss=0.191]
Steps: 1%| | 5417/1000000 [13:50:26<1897:07:04, 6.87s/it, lr=1e-5, step_loss=0.191][RANK-0]: Step: [5417], local_loss=0.02194642275571823, train_loss=0.040987297892570496, time_cost=3.4244189262390137
+
Steps: 1%| | 5417/1000000 [13:50:26<1897:07:04, 6.87s/it, lr=1e-5, step_loss=0.0219]
Steps: 1%| | 5418/1000000 [13:50:35<2062:53:58, 7.47s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [5418], local_loss=0.012916676700115204, train_loss=0.037736646831035614, time_cost=2.7883501052856445
+
Steps: 1%| | 5418/1000000 [13:50:35<2062:53:58, 7.47s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%| | 5419/1000000 [13:50:42<2064:08:04, 7.47s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [5419], local_loss=0.010078010149300098, train_loss=0.022020095959305763, time_cost=3.372753143310547
+
Steps: 1%| | 5419/1000000 [13:50:42<2064:08:04, 7.47s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 5420/1000000 [13:50:48<1892:19:31, 6.85s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [5420], local_loss=0.006273172330111265, train_loss=0.05708000063896179, time_cost=2.713534116744995
+
Steps: 1%| | 5420/1000000 [13:50:48<1892:19:31, 6.85s/it, lr=1e-5, step_loss=0.00627]
Steps: 1%| | 5421/1000000 [13:50:59<2226:01:37, 8.06s/it, lr=1e-5, step_loss=0.00627][RANK-0]: Step: [5421], local_loss=0.026831580325961113, train_loss=0.045599132776260376, time_cost=3.596930503845215
+
Steps: 1%| | 5421/1000000 [13:50:59<2226:01:37, 8.06s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%| | 5422/1000000 [13:51:04<1978:42:29, 7.16s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [5422], local_loss=0.01730869710445404, train_loss=0.17546389997005463, time_cost=2.1328046321868896
+
Steps: 1%| | 5422/1000000 [13:51:04<1978:42:29, 7.16s/it, lr=1e-5, step_loss=0.0173]
Steps: 1%| | 5423/1000000 [13:51:11<1981:51:32, 7.17s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [5423], local_loss=0.0252776350826025, train_loss=0.08786505460739136, time_cost=1.2578837871551514
+
Steps: 1%| | 5423/1000000 [13:51:11<1981:51:32, 7.17s/it, lr=1e-5, step_loss=0.0253]
Steps: 1%| | 5424/1000000 [13:51:22<2316:12:47, 8.38s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [5424], local_loss=0.07115437835454941, train_loss=0.027531038969755173, time_cost=1.2035095691680908
+
Steps: 1%| | 5424/1000000 [13:51:22<2316:12:47, 8.38s/it, lr=1e-5, step_loss=0.0712]
Steps: 1%| | 5425/1000000 [13:51:34<2595:15:20, 9.39s/it, lr=1e-5, step_loss=0.0712][RANK-0]: Step: [5425], local_loss=0.005968648474663496, train_loss=0.04672938212752342, time_cost=1.2330079078674316
+
Steps: 1%| | 5425/1000000 [13:51:34<2595:15:20, 9.39s/it, lr=1e-5, step_loss=0.00597]
Steps: 1%| | 5426/1000000 [13:51:42<2473:48:07, 8.95s/it, lr=1e-5, step_loss=0.00597][RANK-0]: Step: [5426], local_loss=0.25347191095352173, train_loss=0.06627722084522247, time_cost=3.147860050201416
+
Steps: 1%| | 5426/1000000 [13:51:42<2473:48:07, 8.95s/it, lr=1e-5, step_loss=0.253]
Steps: 1%| | 5427/1000000 [13:51:51<2456:37:45, 8.89s/it, lr=1e-5, step_loss=0.253][RANK-0]: Step: [5427], local_loss=0.009468299336731434, train_loss=0.030946027487516403, time_cost=2.593688726425171
+
Steps: 1%| | 5427/1000000 [13:51:51<2456:37:45, 8.89s/it, lr=1e-5, step_loss=0.00947]
Steps: 1%| | 5428/1000000 [13:51:56<2135:51:46, 7.73s/it, lr=1e-5, step_loss=0.00947][RANK-0]: Step: [5428], local_loss=0.008756421506404877, train_loss=0.03263886645436287, time_cost=2.3070108890533447
+
Steps: 1%| | 5428/1000000 [13:51:56<2135:51:46, 7.73s/it, lr=1e-5, step_loss=0.00876]
Steps: 1%| | 5429/1000000 [13:52:05<2226:09:27, 8.06s/it, lr=1e-5, step_loss=0.00876][RANK-0]: Step: [5429], local_loss=0.01883562095463276, train_loss=0.0339789018034935, time_cost=1.2189688682556152
+
Steps: 1%| | 5429/1000000 [13:52:05<2226:09:27, 8.06s/it, lr=1e-5, step_loss=0.0188]
Steps: 1%| | 5430/1000000 [13:52:14<2318:48:53, 8.39s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [5430], local_loss=0.007919741794466972, train_loss=0.04699131101369858, time_cost=1.2562801837921143
+
Steps: 1%| | 5430/1000000 [13:52:14<2318:48:53, 8.39s/it, lr=1e-5, step_loss=0.00792]
Steps: 1%| | 5431/1000000 [13:52:19<2055:43:00, 7.44s/it, lr=1e-5, step_loss=0.00792][RANK-0]: Step: [5431], local_loss=0.045910563319921494, train_loss=0.05089465528726578, time_cost=2.172760486602783
+
Steps: 1%| | 5431/1000000 [13:52:19<2055:43:00, 7.44s/it, lr=1e-5, step_loss=0.0459]
Steps: 1%| | 5432/1000000 [13:52:29<2270:05:57, 8.22s/it, lr=1e-5, step_loss=0.0459][RANK-0]: Step: [5432], local_loss=0.042914118617773056, train_loss=0.05888621136546135, time_cost=1.5353171825408936
+
Steps: 1%| | 5432/1000000 [13:52:29<2270:05:57, 8.22s/it, lr=1e-5, step_loss=0.0429]
Steps: 1%| | 5433/1000000 [13:52:35<2098:58:52, 7.60s/it, lr=1e-5, step_loss=0.0429][RANK-0]: Step: [5433], local_loss=0.08495713770389557, train_loss=0.07667665183544159, time_cost=2.141303777694702
+
Steps: 1%| | 5433/1000000 [13:52:35<2098:58:52, 7.60s/it, lr=1e-5, step_loss=0.085]
Steps: 1%| | 5434/1000000 [13:52:44<2170:05:23, 7.86s/it, lr=1e-5, step_loss=0.085][RANK-0]: Step: [5434], local_loss=0.012536093592643738, train_loss=0.04255865514278412, time_cost=4.3810155391693115
+
Steps: 1%| | 5434/1000000 [13:52:44<2170:05:23, 7.86s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 5435/1000000 [13:53:00<2918:58:57, 10.57s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [5435], local_loss=0.008522621355950832, train_loss=0.0746956616640091, time_cost=7.751379489898682
+
Steps: 1%| | 5435/1000000 [13:53:00<2918:58:57, 10.57s/it, lr=1e-5, step_loss=0.00852]
Steps: 1%| | 5436/1000000 [13:53:14<3173:58:55, 11.49s/it, lr=1e-5, step_loss=0.00852][RANK-0]: Step: [5436], local_loss=0.02911093272268772, train_loss=0.03466526046395302, time_cost=6.941716909408569
+
Steps: 1%| | 5436/1000000 [13:53:14<3173:58:55, 11.49s/it, lr=1e-5, step_loss=0.0291]
Steps: 1%| | 5437/1000000 [13:53:18<2554:50:37, 9.25s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [5437], local_loss=0.03454931825399399, train_loss=0.01903567463159561, time_cost=1.415417194366455
+
Steps: 1%| | 5437/1000000 [13:53:18<2554:50:37, 9.25s/it, lr=1e-5, step_loss=0.0345]
Steps: 1%| | 5438/1000000 [13:53:35<3176:14:32, 11.50s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [5438], local_loss=0.027360953390598297, train_loss=0.03636183217167854, time_cost=7.0423424243927
+
Steps: 1%| | 5438/1000000 [13:53:35<3176:14:32, 11.50s/it, lr=1e-5, step_loss=0.0274]
Steps: 1%| | 5439/1000000 [13:53:41<2724:52:30, 9.86s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [5439], local_loss=0.1337078958749771, train_loss=0.057219840586185455, time_cost=4.528265953063965
+
Steps: 1%| | 5439/1000000 [13:53:41<2724:52:30, 9.86s/it, lr=1e-5, step_loss=0.134]
Steps: 1%| | 5440/1000000 [13:53:57<3205:42:27, 11.60s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [5440], local_loss=0.03058629482984543, train_loss=0.11971348524093628, time_cost=5.414518117904663
+
Steps: 1%| | 5440/1000000 [13:53:57<3205:42:27, 11.60s/it, lr=1e-5, step_loss=0.0306]
Steps: 1%| | 5441/1000000 [13:54:02<2708:55:59, 9.81s/it, lr=1e-5, step_loss=0.0306][RANK-0]: Step: [5441], local_loss=0.014919402077794075, train_loss=0.03765622153878212, time_cost=1.3114345073699951
+
Steps: 1%| | 5441/1000000 [13:54:02<2708:55:59, 9.81s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%| | 5442/1000000 [13:54:18<3186:38:11, 11.53s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [5442], local_loss=0.037679754197597504, train_loss=0.07422833144664764, time_cost=5.196158170700073
+
Steps: 1%| | 5442/1000000 [13:54:18<3186:38:11, 11.53s/it, lr=1e-5, step_loss=0.0377]
Steps: 1%| | 5443/1000000 [13:54:26<2895:34:38, 10.48s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [5443], local_loss=0.05134543403983116, train_loss=0.02637222409248352, time_cost=5.886296033859253
+
Steps: 1%| | 5443/1000000 [13:54:26<2895:34:38, 10.48s/it, lr=1e-5, step_loss=0.0513]
Steps: 1%| | 5444/1000000 [13:54:39<3156:24:35, 11.43s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [5444], local_loss=0.032773032784461975, train_loss=0.08815628290176392, time_cost=6.353331089019775
+
Steps: 1%| | 5444/1000000 [13:54:39<3156:24:35, 11.43s/it, lr=1e-5, step_loss=0.0328]
Steps: 1%| | 5445/1000000 [13:54:47<2808:49:52, 10.17s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [5445], local_loss=0.07029275596141815, train_loss=0.030529245734214783, time_cost=3.575817584991455
+
Steps: 1%| | 5445/1000000 [13:54:47<2808:49:52, 10.17s/it, lr=1e-5, step_loss=0.0703]
Steps: 1%| | 5446/1000000 [13:55:00<3100:48:52, 11.22s/it, lr=1e-5, step_loss=0.0703][RANK-0]: Step: [5446], local_loss=0.009777914732694626, train_loss=0.05351495370268822, time_cost=3.8646035194396973
+
Steps: 1%| | 5446/1000000 [13:55:00<3100:48:52, 11.22s/it, lr=1e-5, step_loss=0.00978]
Steps: 1%| | 5447/1000000 [13:55:08<2790:58:24, 10.10s/it, lr=1e-5, step_loss=0.00978][RANK-0]: Step: [5447], local_loss=0.010959450155496597, train_loss=0.17513319849967957, time_cost=1.2484300136566162
+
Steps: 1%| | 5447/1000000 [13:55:08<2790:58:24, 10.10s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 5448/1000000 [13:55:12<2320:41:18, 8.40s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [5448], local_loss=0.01574389450252056, train_loss=0.026232238858938217, time_cost=1.4419522285461426
+
Steps: 1%| | 5448/1000000 [13:55:12<2320:41:18, 8.40s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 5449/1000000 [13:55:23<2558:22:53, 9.26s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [5449], local_loss=0.02817848138511181, train_loss=0.04176120460033417, time_cost=1.953199863433838
+
Steps: 1%| | 5449/1000000 [13:55:23<2558:22:53, 9.26s/it, lr=1e-5, step_loss=0.0282]
Steps: 1%| | 5450/1000000 [13:55:38<2964:37:44, 10.73s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [5450], local_loss=0.01053628884255886, train_loss=0.016814932227134705, time_cost=5.203388929367065
+
Steps: 1%| | 5450/1000000 [13:55:38<2964:37:44, 10.73s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%| | 5451/1000000 [13:55:49<2984:36:34, 10.80s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [5451], local_loss=0.03349601477384567, train_loss=0.11743801832199097, time_cost=1.428797721862793
+
Steps: 1%| | 5451/1000000 [13:55:49<2984:36:34, 10.80s/it, lr=1e-5, step_loss=0.0335]
Steps: 1%| | 5452/1000000 [13:56:03<3281:57:38, 11.88s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [5452], local_loss=0.014945302158594131, train_loss=0.022250518202781677, time_cost=6.239048480987549
+
Steps: 1%| | 5452/1000000 [13:56:03<3281:57:38, 11.88s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%| | 5453/1000000 [13:56:16<3392:37:47, 12.28s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [5453], local_loss=0.030911384150385857, train_loss=0.04217241704463959, time_cost=3.801281213760376
+
Steps: 1%| | 5453/1000000 [13:56:16<3392:37:47, 12.28s/it, lr=1e-5, step_loss=0.0309]
Steps: 1%| | 5454/1000000 [13:56:25<3119:35:22, 11.29s/it, lr=1e-5, step_loss=0.0309][RANK-0]: Step: [5454], local_loss=0.0341825895011425, train_loss=0.03943134844303131, time_cost=4.133447170257568
+
Steps: 1%| | 5454/1000000 [13:56:25<3119:35:22, 11.29s/it, lr=1e-5, step_loss=0.0342]
Steps: 1%| | 5455/1000000 [13:56:29<2522:25:58, 9.13s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [5455], local_loss=0.012336763553321362, train_loss=0.08037961274385452, time_cost=3.1350207328796387
+
Steps: 1%| | 5455/1000000 [13:56:29<2522:25:58, 9.13s/it, lr=1e-5, step_loss=0.0123]
Steps: 1%| | 5456/1000000 [13:56:36<2342:46:42, 8.48s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [5456], local_loss=0.02440822683274746, train_loss=0.04097629711031914, time_cost=2.5478572845458984
+
Steps: 1%| | 5456/1000000 [13:56:36<2342:46:42, 8.48s/it, lr=1e-5, step_loss=0.0244]
Steps: 1%| | 5457/1000000 [13:56:48<2614:19:50, 9.46s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [5457], local_loss=0.03392011299729347, train_loss=0.22491112351417542, time_cost=4.096755027770996
+
Steps: 1%| | 5457/1000000 [13:56:48<2614:19:50, 9.46s/it, lr=1e-5, step_loss=0.0339]
Steps: 1%| | 5458/1000000 [13:56:54<2352:26:05, 8.52s/it, lr=1e-5, step_loss=0.0339][RANK-0]: Step: [5458], local_loss=229.37213134765625, train_loss=28.711387634277344, time_cost=1.8566620349884033
+
Steps: 1%| | 5458/1000000 [13:56:54<2352:26:05, 8.52s/it, lr=1e-5, step_loss=229]
Steps: 1%| | 5459/1000000 [13:57:03<2335:08:15, 8.45s/it, lr=1e-5, step_loss=229][RANK-0]: Step: [5459], local_loss=0.040634457021951675, train_loss=0.16495457291603088, time_cost=4.5519700050354
+
Steps: 1%| | 5459/1000000 [13:57:03<2335:08:15, 8.45s/it, lr=1e-5, step_loss=0.0406]
Steps: 1%| | 5460/1000000 [13:57:13<2467:32:59, 8.93s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [5460], local_loss=0.012090452946722507, train_loss=0.03616560250520706, time_cost=1.4232828617095947
+
Steps: 1%| | 5460/1000000 [13:57:13<2467:32:59, 8.93s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%| | 5461/1000000 [13:57:28<3004:37:42, 10.88s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [5461], local_loss=0.014732273295521736, train_loss=0.028635798022150993, time_cost=4.230241060256958
+
Steps: 1%| | 5461/1000000 [13:57:28<3004:37:42, 10.88s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%| | 5462/1000000 [13:57:44<3387:00:50, 12.26s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [5462], local_loss=0.031217757612466812, train_loss=0.018486492335796356, time_cost=8.07259225845337
+
Steps: 1%| | 5462/1000000 [13:57:44<3387:00:50, 12.26s/it, lr=1e-5, step_loss=0.0312]
Steps: 1%| | 5463/1000000 [13:57:49<2815:34:00, 10.19s/it, lr=1e-5, step_loss=0.0312][RANK-0]: Step: [5463], local_loss=0.3437173664569855, train_loss=0.0804654061794281, time_cost=2.8153042793273926
+
Steps: 1%| | 5463/1000000 [13:57:49<2815:34:00, 10.19s/it, lr=1e-5, step_loss=0.344]
Steps: 1%| | 5464/1000000 [13:58:01<2945:24:13, 10.66s/it, lr=1e-5, step_loss=0.344][RANK-0]: Step: [5464], local_loss=0.010369690135121346, train_loss=0.04479313641786575, time_cost=3.93385910987854
+
Steps: 1%| | 5464/1000000 [13:58:01<2945:24:13, 10.66s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%| | 5465/1000000 [13:58:06<2482:12:20, 8.99s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [5465], local_loss=0.015468699857592583, train_loss=0.0870617926120758, time_cost=1.199103593826294
+
Steps: 1%| | 5465/1000000 [13:58:06<2482:12:20, 8.99s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%| | 5466/1000000 [13:58:20<2898:55:47, 10.49s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [5466], local_loss=0.016227805987000465, train_loss=0.0220447089523077, time_cost=4.774185419082642
+
Steps: 1%| | 5466/1000000 [13:58:20<2898:55:47, 10.49s/it, lr=1e-5, step_loss=0.0162]
Steps: 1%| | 5467/1000000 [13:58:33<3095:26:15, 11.20s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [5467], local_loss=0.013746945187449455, train_loss=0.022163942456245422, time_cost=4.034228563308716
+
Steps: 1%| | 5467/1000000 [13:58:33<3095:26:15, 11.20s/it, lr=1e-5, step_loss=0.0137]
Steps: 1%| | 5468/1000000 [13:58:42<2940:25:22, 10.64s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [5468], local_loss=0.01100264023989439, train_loss=0.021593894809484482, time_cost=3.2871477603912354
+
Steps: 1%| | 5468/1000000 [13:58:42<2940:25:22, 10.64s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 5469/1000000 [13:58:51<2840:42:28, 10.28s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [5469], local_loss=0.06278349459171295, train_loss=0.04339878633618355, time_cost=1.2269580364227295
+
Steps: 1%| | 5469/1000000 [13:58:51<2840:42:28, 10.28s/it, lr=1e-5, step_loss=0.0628]
Steps: 1%| | 5470/1000000 [13:59:03<2977:14:05, 10.78s/it, lr=1e-5, step_loss=0.0628][RANK-0]: Step: [5470], local_loss=0.05330692604184151, train_loss=0.045291416347026825, time_cost=2.863572359085083
+
Steps: 1%| | 5470/1000000 [13:59:03<2977:14:05, 10.78s/it, lr=1e-5, step_loss=0.0533]
Steps: 1%| | 5471/1000000 [13:59:17<3188:33:14, 11.54s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [5471], local_loss=0.024384010583162308, train_loss=0.16650360822677612, time_cost=4.750347852706909
+
Steps: 1%| | 5471/1000000 [13:59:17<3188:33:14, 11.54s/it, lr=1e-5, step_loss=0.0244]
Steps: 1%| | 5472/1000000 [13:59:29<3226:57:42, 11.68s/it, lr=1e-5, step_loss=0.0244][RANK-0]: Step: [5472], local_loss=0.009547917172312737, train_loss=0.03585587441921234, time_cost=3.3716442584991455
+
Steps: 1%| | 5472/1000000 [13:59:29<3226:57:42, 11.68s/it, lr=1e-5, step_loss=0.00955]
Steps: 1%| | 5473/1000000 [13:59:40<3219:56:55, 11.66s/it, lr=1e-5, step_loss=0.00955][RANK-0]: Step: [5473], local_loss=0.013489075936377048, train_loss=0.023977313190698624, time_cost=5.483173608779907
+
Steps: 1%| | 5473/1000000 [13:59:40<3219:56:55, 11.66s/it, lr=1e-5, step_loss=0.0135]
Steps: 1%| | 5474/1000000 [13:59:52<3242:49:59, 11.74s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [5474], local_loss=0.026086241006851196, train_loss=16.530597686767578, time_cost=6.651951789855957
+
Steps: 1%| | 5474/1000000 [13:59:52<3242:49:59, 11.74s/it, lr=1e-5, step_loss=0.0261]
Steps: 1%| | 5475/1000000 [14:00:02<3059:52:44, 11.08s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [5475], local_loss=0.05575671046972275, train_loss=0.1984676718711853, time_cost=2.8509294986724854
+
Steps: 1%| | 5475/1000000 [14:00:02<3059:52:44, 11.08s/it, lr=1e-5, step_loss=0.0558]
Steps: 1%| | 5476/1000000 [14:00:11<2911:09:42, 10.54s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [5476], local_loss=0.03845138102769852, train_loss=0.02919451706111431, time_cost=2.505153179168701
+
Steps: 1%| | 5476/1000000 [14:00:11<2911:09:42, 10.54s/it, lr=1e-5, step_loss=0.0385]
Steps: 1%| | 5477/1000000 [14:00:26<3286:41:13, 11.90s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [5477], local_loss=0.04825687035918236, train_loss=0.12637975811958313, time_cost=5.577022075653076
+
Steps: 1%| | 5477/1000000 [14:00:26<3286:41:13, 11.90s/it, lr=1e-5, step_loss=0.0483]
Steps: 1%| | 5478/1000000 [14:00:40<3417:45:34, 12.37s/it, lr=1e-5, step_loss=0.0483][RANK-0]: Step: [5478], local_loss=0.023638639599084854, train_loss=14.638741493225098, time_cost=3.843630790710449
+
Steps: 1%| | 5478/1000000 [14:00:40<3417:45:34, 12.37s/it, lr=1e-5, step_loss=0.0236]
Steps: 1%| | 5479/1000000 [14:00:44<2781:55:09, 10.07s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [5479], local_loss=0.03140852600336075, train_loss=3.3686559200286865, time_cost=2.124281883239746
+
Steps: 1%| | 5479/1000000 [14:00:44<2781:55:09, 10.07s/it, lr=1e-5, step_loss=0.0314]
Steps: 1%| | 5480/1000000 [14:00:51<2543:41:50, 9.21s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [5480], local_loss=0.05492602288722992, train_loss=0.08659907430410385, time_cost=2.578582763671875
+
Steps: 1%| | 5480/1000000 [14:00:51<2543:41:50, 9.21s/it, lr=1e-5, step_loss=0.0549]
Steps: 1%| | 5481/1000000 [14:00:56<2144:40:36, 7.76s/it, lr=1e-5, step_loss=0.0549][RANK-0]: Step: [5481], local_loss=0.009359377436339855, train_loss=0.04756589233875275, time_cost=1.3774144649505615
+
Steps: 1%| | 5481/1000000 [14:00:56<2144:40:36, 7.76s/it, lr=1e-5, step_loss=0.00936]
Steps: 1%| | 5482/1000000 [14:01:02<2039:02:04, 7.38s/it, lr=1e-5, step_loss=0.00936][RANK-0]: Step: [5482], local_loss=0.026102643460035324, train_loss=0.05904202535748482, time_cost=1.9386711120605469
+
Steps: 1%| | 5482/1000000 [14:01:02<2039:02:04, 7.38s/it, lr=1e-5, step_loss=0.0261]
Steps: 1%| | 5483/1000000 [14:01:17<2632:06:44, 9.53s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [5483], local_loss=0.3636881411075592, train_loss=0.07744541764259338, time_cost=6.677934169769287
+
Steps: 1%| | 5483/1000000 [14:01:17<2632:06:44, 9.53s/it, lr=1e-5, step_loss=0.364]
Steps: 1%| | 5484/1000000 [14:01:31<2994:45:15, 10.84s/it, lr=1e-5, step_loss=0.364][RANK-0]: Step: [5484], local_loss=0.011364871636033058, train_loss=0.08039768785238266, time_cost=3.909381866455078
+
Steps: 1%| | 5484/1000000 [14:01:31<2994:45:15, 10.84s/it, lr=1e-5, step_loss=0.0114]
[RANK-0] training log, steps 5485–5708 of 1,000,000 (~1% complete), lr=1e-5 throughout, ~6–13 s/it over elapsed time 14:01:42–14:34:40. local_loss mostly in the 0.002–0.33 range, with outliers of ~1.0 at steps 5505, 5542, 5605, and 5607, 0.648 at step 5676, and 74.46 at step 5640; train_loss mostly in 0.02–0.20, with spikes of 13.40 (step 5558), 24.96 (step 5592), 11.20 (step 5598), 9.50 (step 5640), and 11.89 (step 5695).
+
Steps: 1%| | 5708/1000000 [14:34:40<2388:49:05, 8.65s/it, lr=1e-5, step_loss=0.32]
Steps: 1%| | 5709/1000000 [14:34:52<2625:16:30, 9.51s/it, lr=1e-5, step_loss=0.32][RANK-0]: Step: [5709], local_loss=0.03999881446361542, train_loss=0.026554308831691742, time_cost=2.838850498199463
+
Steps: 1%| | 5709/1000000 [14:34:52<2625:16:30, 9.51s/it, lr=1e-5, step_loss=0.04]
Steps: 1%| | 5710/1000000 [14:35:07<3121:11:39, 11.30s/it, lr=1e-5, step_loss=0.04][RANK-0]: Step: [5710], local_loss=0.009725254960358143, train_loss=0.27040019631385803, time_cost=6.3603856563568115
+
Steps: 1%| | 5710/1000000 [14:35:07<3121:11:39, 11.30s/it, lr=1e-5, step_loss=0.00973]
Steps: 1%| | 5711/1000000 [14:35:12<2556:44:39, 9.26s/it, lr=1e-5, step_loss=0.00973][RANK-0]: Step: [5711], local_loss=0.010121656581759453, train_loss=0.12710139155387878, time_cost=1.5167887210845947
+
Steps: 1%| | 5711/1000000 [14:35:12<2556:44:39, 9.26s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 5712/1000000 [14:35:26<2938:11:07, 10.64s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [5712], local_loss=0.02141951024532318, train_loss=0.062008123844861984, time_cost=2.414685010910034
+
Steps: 1%| | 5712/1000000 [14:35:26<2938:11:07, 10.64s/it, lr=1e-5, step_loss=0.0214]
Steps: 1%| | 5713/1000000 [14:35:37<2955:46:35, 10.70s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [5713], local_loss=0.015426445752382278, train_loss=0.01670745015144348, time_cost=2.4429306983947754
+
Steps: 1%| | 5713/1000000 [14:35:37<2955:46:35, 10.70s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%| | 5714/1000000 [14:35:47<2948:11:11, 10.67s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [5714], local_loss=0.013640315271914005, train_loss=0.02063244767487049, time_cost=2.1614747047424316
+
Steps: 1%| | 5714/1000000 [14:35:47<2948:11:11, 10.67s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 5715/1000000 [14:35:52<2499:13:05, 9.05s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [5715], local_loss=0.10981857031583786, train_loss=0.18271631002426147, time_cost=1.3626930713653564
+
Steps: 1%| | 5715/1000000 [14:35:52<2499:13:05, 9.05s/it, lr=1e-5, step_loss=0.11]
Steps: 1%| | 5716/1000000 [14:36:06<2904:05:03, 10.51s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [5716], local_loss=0.01627988927066326, train_loss=0.10541428625583649, time_cost=4.411659240722656
+
Steps: 1%| | 5716/1000000 [14:36:06<2904:05:03, 10.51s/it, lr=1e-5, step_loss=0.0163]
Steps: 1%| | 5717/1000000 [14:36:14<2681:20:55, 9.71s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [5717], local_loss=0.030980605632066727, train_loss=0.018584439530968666, time_cost=1.2824430465698242
+
Steps: 1%| | 5717/1000000 [14:36:14<2681:20:55, 9.71s/it, lr=1e-5, step_loss=0.031]
Steps: 1%| | 5718/1000000 [14:36:18<2220:20:56, 8.04s/it, lr=1e-5, step_loss=0.031][RANK-0]: Step: [5718], local_loss=0.05859313905239105, train_loss=15.539013862609863, time_cost=1.2496082782745361
+
Steps: 1%| | 5718/1000000 [14:36:18<2220:20:56, 8.04s/it, lr=1e-5, step_loss=0.0586]
Steps: 1%| | 5719/1000000 [14:36:23<1948:20:25, 7.05s/it, lr=1e-5, step_loss=0.0586][RANK-0]: Step: [5719], local_loss=0.050645697861909866, train_loss=0.056590281426906586, time_cost=2.100419759750366
+
Steps: 1%| | 5719/1000000 [14:36:23<1948:20:25, 7.05s/it, lr=1e-5, step_loss=0.0506]
Steps: 1%| | 5720/1000000 [14:36:34<2270:39:03, 8.22s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [5720], local_loss=0.01871352083981037, train_loss=0.044219233095645905, time_cost=2.473188638687134
+
Steps: 1%| | 5720/1000000 [14:36:34<2270:39:03, 8.22s/it, lr=1e-5, step_loss=0.0187]
Steps: 1%| | 5721/1000000 [14:36:49<2804:46:53, 10.16s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [5721], local_loss=0.017650343477725983, train_loss=0.06038182973861694, time_cost=5.186491250991821
+
Steps: 1%| | 5721/1000000 [14:36:49<2804:46:53, 10.16s/it, lr=1e-5, step_loss=0.0177]
Steps: 1%| | 5722/1000000 [14:37:06<3393:51:38, 12.29s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [5722], local_loss=0.08021645992994308, train_loss=0.12169194221496582, time_cost=1.2405381202697754
+
Steps: 1%| | 5722/1000000 [14:37:06<3393:51:38, 12.29s/it, lr=1e-5, step_loss=0.0802]
Steps: 1%| | 5723/1000000 [14:37:17<3257:50:23, 11.80s/it, lr=1e-5, step_loss=0.0802][RANK-0]: Step: [5723], local_loss=0.01925269141793251, train_loss=0.036425285041332245, time_cost=2.0513055324554443
+
Steps: 1%| | 5723/1000000 [14:37:17<3257:50:23, 11.80s/it, lr=1e-5, step_loss=0.0193]
Steps: 1%| | 5724/1000000 [14:37:26<3055:08:27, 11.06s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [5724], local_loss=0.3986702561378479, train_loss=0.08614669740200043, time_cost=6.493141174316406
+
Steps: 1%| | 5724/1000000 [14:37:26<3055:08:27, 11.06s/it, lr=1e-5, step_loss=0.399]
Steps: 1%| | 5725/1000000 [14:37:32<2625:20:39, 9.51s/it, lr=1e-5, step_loss=0.399][RANK-0]: Step: [5725], local_loss=0.14919832348823547, train_loss=0.08742986619472504, time_cost=1.2317798137664795
+
Steps: 1%| | 5725/1000000 [14:37:32<2625:20:39, 9.51s/it, lr=1e-5, step_loss=0.149]
Steps: 1%| | 5726/1000000 [14:37:36<2193:12:43, 7.94s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [5726], local_loss=0.011724048294126987, train_loss=0.02934511937201023, time_cost=1.7266430854797363
+
Steps: 1%| | 5726/1000000 [14:37:36<2193:12:43, 7.94s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%| | 5727/1000000 [14:37:50<2679:23:22, 9.70s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [5727], local_loss=0.025532089173793793, train_loss=0.07276087254285812, time_cost=4.150330543518066
+
Steps: 1%| | 5727/1000000 [14:37:50<2679:23:22, 9.70s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%| | 5728/1000000 [14:38:01<2756:47:03, 9.98s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [5728], local_loss=0.04300646483898163, train_loss=0.07813847810029984, time_cost=5.126899242401123
+
Steps: 1%| | 5728/1000000 [14:38:01<2756:47:03, 9.98s/it, lr=1e-5, step_loss=0.043]
Steps: 1%| | 5729/1000000 [14:38:12<2843:32:35, 10.30s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [5729], local_loss=0.02461295761168003, train_loss=0.025917597115039825, time_cost=4.0885910987854
+
Steps: 1%| | 5729/1000000 [14:38:12<2843:32:35, 10.30s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%| | 5730/1000000 [14:38:25<3142:40:38, 11.38s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [5730], local_loss=0.04060116410255432, train_loss=0.03912629559636116, time_cost=4.087877988815308
+
Steps: 1%| | 5730/1000000 [14:38:25<3142:40:38, 11.38s/it, lr=1e-5, step_loss=0.0406]
Steps: 1%| | 5731/1000000 [14:38:39<3355:45:21, 12.15s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [5731], local_loss=0.014124215580523014, train_loss=0.027570690959692, time_cost=6.86164665222168
+
Steps: 1%| | 5731/1000000 [14:38:39<3355:45:21, 12.15s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%| | 5732/1000000 [14:38:44<2724:09:31, 9.86s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [5732], local_loss=0.07783901691436768, train_loss=0.04152337834239006, time_cost=1.7422358989715576
+
Steps: 1%| | 5732/1000000 [14:38:44<2724:09:31, 9.86s/it, lr=1e-5, step_loss=0.0778]
Steps: 1%| | 5733/1000000 [14:38:54<2742:06:55, 9.93s/it, lr=1e-5, step_loss=0.0778][RANK-0]: Step: [5733], local_loss=0.03507684916257858, train_loss=0.032063789665699005, time_cost=4.559799671173096
+
Steps: 1%| | 5733/1000000 [14:38:54<2742:06:55, 9.93s/it, lr=1e-5, step_loss=0.0351]
Steps: 1%| | 5734/1000000 [14:38:59<2339:20:38, 8.47s/it, lr=1e-5, step_loss=0.0351][RANK-0]: Step: [5734], local_loss=0.015508094802498817, train_loss=0.053463149815797806, time_cost=1.8836250305175781
+
Steps: 1%| | 5734/1000000 [14:38:59<2339:20:38, 8.47s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%| | 5735/1000000 [14:39:13<2802:10:31, 10.15s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [5735], local_loss=0.039494890719652176, train_loss=0.04874260723590851, time_cost=11.267747640609741
+
Steps: 1%| | 5735/1000000 [14:39:13<2802:10:31, 10.15s/it, lr=1e-5, step_loss=0.0395]
Steps: 1%| | 5736/1000000 [14:39:22<2702:14:38, 9.78s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [5736], local_loss=0.02112995833158493, train_loss=0.05425327271223068, time_cost=2.84837007522583
+
Steps: 1%| | 5736/1000000 [14:39:22<2702:14:38, 9.78s/it, lr=1e-5, step_loss=0.0211]
Steps: 1%| | 5737/1000000 [14:39:29<2505:06:26, 9.07s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [5737], local_loss=0.0422336682677269, train_loss=0.04201003164052963, time_cost=2.870828628540039
+
Steps: 1%| | 5737/1000000 [14:39:29<2505:06:26, 9.07s/it, lr=1e-5, step_loss=0.0422]
Steps: 1%| | 5738/1000000 [14:39:43<2893:42:42, 10.48s/it, lr=1e-5, step_loss=0.0422][RANK-0]: Step: [5738], local_loss=0.01272769458591938, train_loss=0.043366044759750366, time_cost=5.889828681945801
+
Steps: 1%| | 5738/1000000 [14:39:43<2893:42:42, 10.48s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%| | 5739/1000000 [14:39:56<3114:05:08, 11.28s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [5739], local_loss=0.02003171294927597, train_loss=0.02501361444592476, time_cost=8.89571237564087
+
Steps: 1%| | 5739/1000000 [14:39:56<3114:05:08, 11.28s/it, lr=1e-5, step_loss=0.02]
Steps: 1%| | 5740/1000000 [14:40:01<2599:08:59, 9.41s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [5740], local_loss=0.2595033645629883, train_loss=0.053082823753356934, time_cost=2.3095335960388184
+
Steps: 1%| | 5740/1000000 [14:40:01<2599:08:59, 9.41s/it, lr=1e-5, step_loss=0.26]
Steps: 1%| | 5741/1000000 [14:40:10<2556:11:15, 9.26s/it, lr=1e-5, step_loss=0.26][RANK-0]: Step: [5741], local_loss=0.011022520251572132, train_loss=0.020464226603507996, time_cost=1.44232177734375
+
Steps: 1%| | 5741/1000000 [14:40:10<2556:11:15, 9.26s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 5742/1000000 [14:40:20<2613:55:50, 9.46s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [5742], local_loss=0.02886887639760971, train_loss=0.05549360811710358, time_cost=2.3309381008148193
+
Steps: 1%| | 5742/1000000 [14:40:20<2613:55:50, 9.46s/it, lr=1e-5, step_loss=0.0289]
Steps: 1%| | 5743/1000000 [14:40:28<2443:12:02, 8.85s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [5743], local_loss=0.027605853974819183, train_loss=0.05373722314834595, time_cost=1.766174554824829
+
Steps: 1%| | 5743/1000000 [14:40:28<2443:12:02, 8.85s/it, lr=1e-5, step_loss=0.0276]
Steps: 1%| | 5744/1000000 [14:40:35<2351:22:36, 8.51s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [5744], local_loss=0.04397415742278099, train_loss=0.08082576096057892, time_cost=1.3127760887145996
+
Steps: 1%| | 5744/1000000 [14:40:35<2351:22:36, 8.51s/it, lr=1e-5, step_loss=0.044]
Steps: 1%| | 5745/1000000 [14:40:44<2367:36:43, 8.57s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [5745], local_loss=0.03618481010198593, train_loss=0.032270170748233795, time_cost=2.9997622966766357
+
Steps: 1%| | 5745/1000000 [14:40:44<2367:36:43, 8.57s/it, lr=1e-5, step_loss=0.0362]
Steps: 1%| | 5746/1000000 [14:40:52<2276:52:31, 8.24s/it, lr=1e-5, step_loss=0.0362][RANK-0]: Step: [5746], local_loss=0.014410565607249737, train_loss=0.02837519720196724, time_cost=1.49772047996521
+
Steps: 1%| | 5746/1000000 [14:40:52<2276:52:31, 8.24s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 5747/1000000 [14:40:57<2006:49:35, 7.27s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [5747], local_loss=0.1736268550157547, train_loss=0.17990265786647797, time_cost=2.0619213581085205
+
Steps: 1%| | 5747/1000000 [14:40:57<2006:49:35, 7.27s/it, lr=1e-5, step_loss=0.174]
Steps: 1%| | 5748/1000000 [14:41:02<1880:19:03, 6.81s/it, lr=1e-5, step_loss=0.174][RANK-0]: Step: [5748], local_loss=0.014320301823318005, train_loss=0.09652838855981827, time_cost=2.9815471172332764
+
Steps: 1%| | 5748/1000000 [14:41:02<1880:19:03, 6.81s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%| | 5749/1000000 [14:41:09<1847:07:28, 6.69s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [5749], local_loss=0.04655326157808304, train_loss=0.053164042532444, time_cost=2.6008360385894775
+
Steps: 1%| | 5749/1000000 [14:41:09<1847:07:28, 6.69s/it, lr=1e-5, step_loss=0.0466]
Steps: 1%| | 5750/1000000 [14:41:14<1738:28:06, 6.29s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [5750], local_loss=0.14887988567352295, train_loss=0.08030138164758682, time_cost=2.8481481075286865
+
Steps: 1%| | 5750/1000000 [14:41:14<1738:28:06, 6.29s/it, lr=1e-5, step_loss=0.149]
Steps: 1%| | 5751/1000000 [14:41:21<1821:06:52, 6.59s/it, lr=1e-5, step_loss=0.149][RANK-0]: Step: [5751], local_loss=0.015697535127401352, train_loss=0.08615203201770782, time_cost=2.762817621231079
+
Steps: 1%| | 5751/1000000 [14:41:21<1821:06:52, 6.59s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 5752/1000000 [14:41:33<2263:56:06, 8.20s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [5752], local_loss=0.02451833337545395, train_loss=0.1440875381231308, time_cost=6.625093698501587
+
Steps: 1%| | 5752/1000000 [14:41:33<2263:56:06, 8.20s/it, lr=1e-5, step_loss=0.0245]
Steps: 1%| | 5753/1000000 [14:41:40<2160:47:44, 7.82s/it, lr=1e-5, step_loss=0.0245][RANK-0]: Step: [5753], local_loss=0.034102894365787506, train_loss=0.0292797964066267, time_cost=1.855118751525879
+
Steps: 1%| | 5753/1000000 [14:41:40<2160:47:44, 7.82s/it, lr=1e-5, step_loss=0.0341]
Steps: 1%| | 5754/1000000 [14:41:48<2111:21:21, 7.64s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [5754], local_loss=0.01264417078346014, train_loss=0.03695769980549812, time_cost=2.9014618396759033
+
Steps: 1%| | 5754/1000000 [14:41:48<2111:21:21, 7.64s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%| | 5755/1000000 [14:41:58<2373:30:53, 8.59s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [5755], local_loss=0.013068608939647675, train_loss=0.05564511567354202, time_cost=1.2313241958618164
+
Steps: 1%| | 5755/1000000 [14:41:58<2373:30:53, 8.59s/it, lr=1e-5, step_loss=0.0131]
Steps: 1%| | 5756/1000000 [14:42:07<2399:58:34, 8.69s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [5756], local_loss=0.05244649946689606, train_loss=0.03645934909582138, time_cost=2.796199083328247
+
Steps: 1%| | 5756/1000000 [14:42:07<2399:58:34, 8.69s/it, lr=1e-5, step_loss=0.0524]
Steps: 1%| | 5757/1000000 [14:42:12<2111:17:30, 7.64s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [5757], local_loss=0.025660982355475426, train_loss=0.04972062259912491, time_cost=2.080594301223755
+
Steps: 1%| | 5757/1000000 [14:42:12<2111:17:30, 7.64s/it, lr=1e-5, step_loss=0.0257]
Steps: 1%| | 5758/1000000 [14:42:25<2551:44:49, 9.24s/it, lr=1e-5, step_loss=0.0257][RANK-0]: Step: [5758], local_loss=0.03260602802038193, train_loss=0.08577859401702881, time_cost=3.3156578540802
+
Steps: 1%| | 5758/1000000 [14:42:25<2551:44:49, 9.24s/it, lr=1e-5, step_loss=0.0326]
Steps: 1%| | 5759/1000000 [14:42:31<2259:27:57, 8.18s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [5759], local_loss=0.02064860239624977, train_loss=0.020159874111413956, time_cost=2.9391815662384033
+
Steps: 1%| | 5759/1000000 [14:42:31<2259:27:57, 8.18s/it, lr=1e-5, step_loss=0.0206]
Steps: 1%| | 5760/1000000 [14:42:36<1949:05:05, 7.06s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [5760], local_loss=0.03801111876964569, train_loss=0.07003572583198547, time_cost=1.3980872631072998
+
Steps: 1%| | 5760/1000000 [14:42:36<1949:05:05, 7.06s/it, lr=1e-5, step_loss=0.038]
Steps: 1%| | 5761/1000000 [14:42:40<1764:35:27, 6.39s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [5761], local_loss=0.022467684000730515, train_loss=0.0392836257815361, time_cost=1.4769086837768555
+
Steps: 1%| | 5761/1000000 [14:42:40<1764:35:27, 6.39s/it, lr=1e-5, step_loss=0.0225]
Steps: 1%| | 5762/1000000 [14:42:45<1645:30:35, 5.96s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [5762], local_loss=0.05080876871943474, train_loss=0.1255907416343689, time_cost=1.442380428314209
+
Steps: 1%| | 5762/1000000 [14:42:45<1645:30:35, 5.96s/it, lr=1e-5, step_loss=0.0508]
Steps: 1%| | 5763/1000000 [14:42:56<2029:49:14, 7.35s/it, lr=1e-5, step_loss=0.0508][RANK-0]: Step: [5763], local_loss=0.007597525138407946, train_loss=0.03946685791015625, time_cost=1.2660448551177979
+
Steps: 1%| | 5763/1000000 [14:42:56<2029:49:14, 7.35s/it, lr=1e-5, step_loss=0.0076]
Steps: 1%| | 5764/1000000 [14:43:03<1989:28:27, 7.20s/it, lr=1e-5, step_loss=0.0076][RANK-0]: Step: [5764], local_loss=0.017884477972984314, train_loss=0.0406402125954628, time_cost=3.208306312561035
+
Steps: 1%| | 5764/1000000 [14:43:03<1989:28:27, 7.20s/it, lr=1e-5, step_loss=0.0179]
Steps: 1%| | 5765/1000000 [14:43:18<2645:25:16, 9.58s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [5765], local_loss=0.012068327516317368, train_loss=0.03333800286054611, time_cost=4.421836853027344
+
Steps: 1%| | 5765/1000000 [14:43:18<2645:25:16, 9.58s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%| | 5766/1000000 [14:43:32<2989:08:24, 10.82s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [5766], local_loss=0.022811036556959152, train_loss=0.019249945878982544, time_cost=4.208785772323608
+
Steps: 1%| | 5766/1000000 [14:43:32<2989:08:24, 10.82s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%| | 5767/1000000 [14:43:44<3087:08:12, 11.18s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [5767], local_loss=0.012151680886745453, train_loss=0.03335548937320709, time_cost=2.3131837844848633
+
Steps: 1%| | 5767/1000000 [14:43:44<3087:08:12, 11.18s/it, lr=1e-5, step_loss=0.0122]
Steps: 1%| | 5768/1000000 [14:43:54<3055:50:50, 11.06s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [5768], local_loss=0.009626921266317368, train_loss=0.027141019701957703, time_cost=7.328675985336304
+
Steps: 1%| | 5768/1000000 [14:43:54<3055:50:50, 11.06s/it, lr=1e-5, step_loss=0.00963]
Steps: 1%| | 5769/1000000 [14:44:05<3008:32:51, 10.89s/it, lr=1e-5, step_loss=0.00963][RANK-0]: Step: [5769], local_loss=0.07056795805692673, train_loss=0.1847858428955078, time_cost=4.953598737716675
+
Steps: 1%| | 5769/1000000 [14:44:05<3008:32:51, 10.89s/it, lr=1e-5, step_loss=0.0706]
Steps: 1%| | 5770/1000000 [14:44:12<2707:23:45, 9.80s/it, lr=1e-5, step_loss=0.0706][RANK-0]: Step: [5770], local_loss=0.0957011803984642, train_loss=0.04002636298537254, time_cost=1.5442519187927246
+
Steps: 1%| | 5770/1000000 [14:44:12<2707:23:45, 9.80s/it, lr=1e-5, step_loss=0.0957]
Steps: 1%| | 5771/1000000 [14:44:23<2829:32:23, 10.25s/it, lr=1e-5, step_loss=0.0957][RANK-0]: Step: [5771], local_loss=0.015330600552260876, train_loss=0.02797534503042698, time_cost=2.6150567531585693
+
Steps: 1%| | 5771/1000000 [14:44:23<2829:32:23, 10.25s/it, lr=1e-5, step_loss=0.0153]
Steps: 1%| | 5772/1000000 [14:44:35<2973:28:19, 10.77s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [5772], local_loss=0.027415378019213676, train_loss=0.042027547955513, time_cost=1.274033784866333
+
Steps: 1%| | 5772/1000000 [14:44:35<2973:28:19, 10.77s/it, lr=1e-5, step_loss=0.0274]
Steps: 1%| | 5773/1000000 [14:44:45<2884:16:20, 10.44s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [5773], local_loss=0.041238173842430115, train_loss=0.02555873617529869, time_cost=3.6935744285583496
+
Steps: 1%| | 5773/1000000 [14:44:45<2884:16:20, 10.44s/it, lr=1e-5, step_loss=0.0412]
Steps: 1%| | 5774/1000000 [14:44:59<3193:18:56, 11.56s/it, lr=1e-5, step_loss=0.0412][RANK-0]: Step: [5774], local_loss=0.015954269096255302, train_loss=0.09697899222373962, time_cost=11.928563356399536
+
Steps: 1%| | 5774/1000000 [14:44:59<3193:18:56, 11.56s/it, lr=1e-5, step_loss=0.016]
Steps: 1%| | 5775/1000000 [14:45:07<2886:44:32, 10.45s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [5775], local_loss=0.025905147194862366, train_loss=0.029397975653409958, time_cost=2.3215172290802
+
Steps: 1%| | 5775/1000000 [14:45:07<2886:44:32, 10.45s/it, lr=1e-5, step_loss=0.0259]
Steps: 1%| | 5776/1000000 [14:45:16<2773:15:07, 10.04s/it, lr=1e-5, step_loss=0.0259][RANK-0]: Step: [5776], local_loss=0.015437731519341469, train_loss=0.03254598006606102, time_cost=3.378727674484253
+
Steps: 1%| | 5776/1000000 [14:45:16<2773:15:07, 10.04s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%| | 5777/1000000 [14:45:21<2363:06:08, 8.56s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [5777], local_loss=0.010683076456189156, train_loss=0.16493387520313263, time_cost=1.9911561012268066
+
Steps: 1%| | 5777/1000000 [14:45:21<2363:06:08, 8.56s/it, lr=1e-5, step_loss=0.0107]
Steps: 1%| | 5778/1000000 [14:45:26<2077:20:54, 7.52s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [5778], local_loss=0.01851034350693226, train_loss=0.09390632808208466, time_cost=2.019789218902588
+
Steps: 1%| | 5778/1000000 [14:45:26<2077:20:54, 7.52s/it, lr=1e-5, step_loss=0.0185]
Steps: 1%| | 5779/1000000 [14:45:34<2066:36:32, 7.48s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [5779], local_loss=0.009384239092469215, train_loss=0.04410133138298988, time_cost=1.48002028465271
+
Steps: 1%| | 5779/1000000 [14:45:34<2066:36:32, 7.48s/it, lr=1e-5, step_loss=0.00938]
Steps: 1%| | 5780/1000000 [14:45:45<2331:27:21, 8.44s/it, lr=1e-5, step_loss=0.00938][RANK-0]: Step: [5780], local_loss=0.16507850587368011, train_loss=13.407051086425781, time_cost=2.504143238067627
+
Steps: 1%| | 5780/1000000 [14:45:45<2331:27:21, 8.44s/it, lr=1e-5, step_loss=0.165]
Steps: 1%| | 5781/1000000 [14:45:50<2105:30:21, 7.62s/it, lr=1e-5, step_loss=0.165][RANK-0]: Step: [5781], local_loss=0.018028419464826584, train_loss=0.05304207652807236, time_cost=1.5464248657226562
+
Steps: 1%| | 5781/1000000 [14:45:50<2105:30:21, 7.62s/it, lr=1e-5, step_loss=0.018]
Steps: 1%| | 5782/1000000 [14:45:57<2030:40:44, 7.35s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [5782], local_loss=0.03591548651456833, train_loss=0.05680273845791817, time_cost=2.9110052585601807
+
Steps: 1%| | 5782/1000000 [14:45:57<2030:40:44, 7.35s/it, lr=1e-5, step_loss=0.0359]
Steps: 1%| | 5783/1000000 [14:46:01<1776:49:20, 6.43s/it, lr=1e-5, step_loss=0.0359][RANK-0]: Step: [5783], local_loss=0.025495620444417, train_loss=0.03648435324430466, time_cost=1.3474102020263672
+
Steps: 1%| | 5783/1000000 [14:46:01<1776:49:20, 6.43s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%| | 5784/1000000 [14:46:13<2198:35:13, 7.96s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [5784], local_loss=0.04595348238945007, train_loss=0.055457741022109985, time_cost=1.2660157680511475
+
Steps: 1%| | 5784/1000000 [14:46:13<2198:35:13, 7.96s/it, lr=1e-5, step_loss=0.046]
Steps: 1%| | 5785/1000000 [14:46:27<2731:01:04, 9.89s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [5785], local_loss=0.011632914654910564, train_loss=0.032645806670188904, time_cost=5.4285266399383545
+
Steps: 1%| | 5785/1000000 [14:46:27<2731:01:04, 9.89s/it, lr=1e-5, step_loss=0.0116]
Steps: 1%| | 5786/1000000 [14:46:41<3064:17:09, 11.10s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [5786], local_loss=1.0140455961227417, train_loss=0.14549686014652252, time_cost=6.1021857261657715
+
Steps: 1%| | 5786/1000000 [14:46:41<3064:17:09, 11.10s/it, lr=1e-5, step_loss=1.01]
Steps: 1%| | 5787/1000000 [14:46:47<2604:31:39, 9.43s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [5787], local_loss=0.03966980054974556, train_loss=0.02615618333220482, time_cost=1.2446048259735107
+
Steps: 1%| | 5787/1000000 [14:46:47<2604:31:39, 9.43s/it, lr=1e-5, step_loss=0.0397]
Steps: 1%| | 5788/1000000 [14:46:54<2412:15:54, 8.73s/it, lr=1e-5, step_loss=0.0397][RANK-0]: Step: [5788], local_loss=0.013973957858979702, train_loss=0.07335580140352249, time_cost=5.9560699462890625
+
Steps: 1%| | 5788/1000000 [14:46:54<2412:15:54, 8.73s/it, lr=1e-5, step_loss=0.014]
Steps: 1%| | 5789/1000000 [14:47:05<2647:43:02, 9.59s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [5789], local_loss=0.03228452429175377, train_loss=0.039217524230480194, time_cost=4.054330587387085
+
Steps: 1%| | 5789/1000000 [14:47:05<2647:43:02, 9.59s/it, lr=1e-5, step_loss=0.0323]
Steps: 1%| | 5790/1000000 [14:47:14<2578:23:58, 9.34s/it, lr=1e-5, step_loss=0.0323][RANK-0]: Step: [5790], local_loss=0.33461278676986694, train_loss=0.06611327826976776, time_cost=3.4590940475463867
+
Steps: 1%| | 5790/1000000 [14:47:14<2578:23:58, 9.34s/it, lr=1e-5, step_loss=0.335]
Steps: 1%| | 5791/1000000 [14:47:21<2353:52:14, 8.52s/it, lr=1e-5, step_loss=0.335][RANK-0]: Step: [5791], local_loss=0.02940172143280506, train_loss=0.06967282295227051, time_cost=1.9539976119995117
+
Steps: 1%| | 5791/1000000 [14:47:21<2353:52:14, 8.52s/it, lr=1e-5, step_loss=0.0294]
Steps: 1%| | 5792/1000000 [14:47:29<2302:25:00, 8.34s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [5792], local_loss=0.26386716961860657, train_loss=8.073094367980957, time_cost=1.2403504848480225
+
Steps: 1%| | 5792/1000000 [14:47:29<2302:25:00, 8.34s/it, lr=1e-5, step_loss=0.264]
Steps: 1%| | 5793/1000000 [14:47:39<2434:16:53, 8.81s/it, lr=1e-5, step_loss=0.264][RANK-0]: Step: [5793], local_loss=0.00927245058119297, train_loss=0.029080303385853767, time_cost=2.3985040187835693
+
Steps: 1%| | 5793/1000000 [14:47:39<2434:16:53, 8.81s/it, lr=1e-5, step_loss=0.00927]
Steps: 1%| | 5794/1000000 [14:47:44<2118:25:14, 7.67s/it, lr=1e-5, step_loss=0.00927][RANK-0]: Step: [5794], local_loss=0.01105581410229206, train_loss=0.03388465940952301, time_cost=1.2115085124969482
+
Steps: 1%| | 5794/1000000 [14:47:44<2118:25:14, 7.67s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%| | 5795/1000000 [14:47:57<2615:16:09, 9.47s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [5795], local_loss=0.02712303213775158, train_loss=0.16035723686218262, time_cost=5.0474913120269775
+
Steps: 1%| | 5795/1000000 [14:47:57<2615:16:09, 9.47s/it, lr=1e-5, step_loss=0.0271]
Steps: 1%| | 5796/1000000 [14:48:06<2563:58:03, 9.28s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [5796], local_loss=0.02175953797996044, train_loss=0.04099101573228836, time_cost=2.0184545516967773
+
Steps: 1%| | 5796/1000000 [14:48:06<2563:58:03, 9.28s/it, lr=1e-5, step_loss=0.0218]
Steps: 1%| | 5797/1000000 [14:48:11<2213:08:09, 8.01s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [5797], local_loss=0.016844483092427254, train_loss=0.03847118467092514, time_cost=1.8567042350769043
+
Steps: 1%| | 5797/1000000 [14:48:11<2213:08:09, 8.01s/it, lr=1e-5, step_loss=0.0168]
Steps: 1%| | 5798/1000000 [14:48:21<2335:34:14, 8.46s/it, lr=1e-5, step_loss=0.0168][RANK-0]: Step: [5798], local_loss=0.03339116647839546, train_loss=0.05115080997347832, time_cost=7.060610294342041
+
Steps: 1%| | 5798/1000000 [14:48:21<2335:34:14, 8.46s/it, lr=1e-5, step_loss=0.0334]
Steps: 1%| | 5799/1000000 [14:48:33<2691:09:37, 9.74s/it, lr=1e-5, step_loss=0.0334][RANK-0]: Step: [5799], local_loss=0.16000856459140778, train_loss=0.048683393746614456, time_cost=10.534143924713135
+
Steps: 1%| | 5799/1000000 [14:48:33<2691:09:37, 9.74s/it, lr=1e-5, step_loss=0.16]
Steps: 1%| | 5800/1000000 [14:48:39<2330:09:22, 8.44s/it, lr=1e-5, step_loss=0.16][RANK-0]: Step: [5800], local_loss=0.9955514073371887, train_loss=0.1435379981994629, time_cost=1.2078814506530762
+
Steps: 1%| | 5800/1000000 [14:48:39<2330:09:22, 8.44s/it, lr=1e-5, step_loss=0.996]
Steps: 1%| | 5801/1000000 [14:48:49<2455:28:07, 8.89s/it, lr=1e-5, step_loss=0.996][RANK-0]: Step: [5801], local_loss=0.2409629225730896, train_loss=0.08493782579898834, time_cost=1.6247029304504395
+
Steps: 1%| | 5801/1000000 [14:48:49<2455:28:07, 8.89s/it, lr=1e-5, step_loss=0.241]
Steps: 1%| | 5802/1000000 [14:49:05<3093:36:52, 11.20s/it, lr=1e-5, step_loss=0.241][RANK-0]: Step: [5802], local_loss=0.05189066380262375, train_loss=0.03843533992767334, time_cost=6.364986419677734
+
Steps: 1%| | 5802/1000000 [14:49:05<3093:36:52, 11.20s/it, lr=1e-5, step_loss=0.0519]
Steps: 1%| | 5803/1000000 [14:49:10<2589:01:51, 9.37s/it, lr=1e-5, step_loss=0.0519][RANK-0]: Step: [5803], local_loss=0.019157344475388527, train_loss=0.0731448084115982, time_cost=1.9799182415008545
+
Steps: 1%| | 5803/1000000 [14:49:10<2589:01:51, 9.37s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%| | 5804/1000000 [14:49:16<2267:20:08, 8.21s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [5804], local_loss=0.02286003902554512, train_loss=0.07082376629114151, time_cost=3.082082509994507
+
Steps: 1%| | 5804/1000000 [14:49:16<2267:20:08, 8.21s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%| | 5805/1000000 [14:49:23<2143:18:24, 7.76s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [5805], local_loss=0.018013017252087593, train_loss=0.04242235794663429, time_cost=2.229180335998535
+
Steps: 1%| | 5805/1000000 [14:49:23<2143:18:24, 7.76s/it, lr=1e-5, step_loss=0.018]
Steps: 1%| | 5806/1000000 [14:49:30<2094:56:35, 7.59s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [5806], local_loss=0.08869016170501709, train_loss=0.02813715860247612, time_cost=2.940049886703491
+
Steps: 1%| | 5806/1000000 [14:49:30<2094:56:35, 7.59s/it, lr=1e-5, step_loss=0.0887]
Steps: 1%| | 5807/1000000 [14:49:37<2045:12:40, 7.41s/it, lr=1e-5, step_loss=0.0887][RANK-0]: Step: [5807], local_loss=0.027583565562963486, train_loss=0.022636305540800095, time_cost=1.708430528640747
+
Steps: 1%| | 5807/1000000 [14:49:37<2045:12:40, 7.41s/it, lr=1e-5, step_loss=0.0276]
Steps: 1%| | 5808/1000000 [14:49:44<2010:53:13, 7.28s/it, lr=1e-5, step_loss=0.0276][RANK-0]: Step: [5808], local_loss=0.06409500539302826, train_loss=0.09512259066104889, time_cost=2.544834852218628
+
Steps: 1%| | 5808/1000000 [14:49:44<2010:53:13, 7.28s/it, lr=1e-5, step_loss=0.0641]
Steps: 1%| | 5809/1000000 [14:49:55<2319:58:49, 8.40s/it, lr=1e-5, step_loss=0.0641][RANK-0]: Step: [5809], local_loss=0.03273278474807739, train_loss=0.05213073268532753, time_cost=2.221275568008423
+
Steps: 1%| | 5809/1000000 [14:49:55<2319:58:49, 8.40s/it, lr=1e-5, step_loss=0.0327]
Steps: 1%| | 5810/1000000 [14:50:01<2121:34:22, 7.68s/it, lr=1e-5, step_loss=0.0327][RANK-0]: Step: [5810], local_loss=0.010608049109578133, train_loss=0.02545192465186119, time_cost=2.7031917572021484
+
Steps: 1%| | 5810/1000000 [14:50:01<2121:34:22, 7.68s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%| | 5811/1000000 [14:50:07<1967:49:08, 7.13s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [5811], local_loss=0.012498080730438232, train_loss=0.07535205036401749, time_cost=1.22078275680542
+
Steps: 1%| | 5811/1000000 [14:50:07<1967:49:08, 7.13s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 5812/1000000 [14:50:12<1787:58:40, 6.47s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [5812], local_loss=0.03306335210800171, train_loss=0.027372261509299278, time_cost=1.8855855464935303
+
Steps: 1%| | 5812/1000000 [14:50:12<1787:58:40, 6.47s/it, lr=1e-5, step_loss=0.0331]
Steps: 1%| | 5813/1000000 [14:50:19<1841:23:48, 6.67s/it, lr=1e-5, step_loss=0.0331][RANK-0]: Step: [5813], local_loss=0.12746359407901764, train_loss=0.07087963819503784, time_cost=3.342012405395508
+
Steps: 1%| | 5813/1000000 [14:50:19<1841:23:48, 6.67s/it, lr=1e-5, step_loss=0.127]
Steps: 1%| | 5814/1000000 [14:50:31<2290:49:23, 8.30s/it, lr=1e-5, step_loss=0.127][RANK-0]: Step: [5814], local_loss=0.2658073604106903, train_loss=0.11283192038536072, time_cost=2.907461643218994
+
Steps: 1%| | 5814/1000000 [14:50:31<2290:49:23, 8.30s/it, lr=1e-5, step_loss=0.266]
Steps: 1%| | 5815/1000000 [14:50:40<2408:58:57, 8.72s/it, lr=1e-5, step_loss=0.266][RANK-0]: Step: [5815], local_loss=0.013768231496214867, train_loss=0.02894412726163864, time_cost=7.6639134883880615
+
Steps: 1%| | 5815/1000000 [14:50:40<2408:58:57, 8.72s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%| | 5816/1000000 [14:50:56<2983:52:17, 10.80s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [5816], local_loss=0.07567165046930313, train_loss=0.05010692775249481, time_cost=2.159898281097412
+
Steps: 1%| | 5816/1000000 [14:50:56<2983:52:17, 10.80s/it, lr=1e-5, step_loss=0.0757]
Steps: 1%| | 5817/1000000 [14:51:01<2450:29:26, 8.87s/it, lr=1e-5, step_loss=0.0757][RANK-0]: Step: [5817], local_loss=0.020291276276111603, train_loss=0.09559043496847153, time_cost=1.3338432312011719
+
Steps: 1%| | 5817/1000000 [14:51:01<2450:29:26, 8.87s/it, lr=1e-5, step_loss=0.0203]
Steps: 1%| | 5818/1000000 [14:51:12<2658:16:07, 9.63s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [5818], local_loss=0.012608812190592289, train_loss=0.09339769929647446, time_cost=3.146580696105957
+
Steps: 1%| | 5818/1000000 [14:51:12<2658:16:07, 9.63s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%| | 5819/1000000 [14:51:20<2501:55:12, 9.06s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [5819], local_loss=0.00998352188616991, train_loss=0.0295756533741951, time_cost=3.257720947265625
+
Steps: 1%| | 5819/1000000 [14:51:20<2501:55:12, 9.06s/it, lr=1e-5, step_loss=0.00998]
Steps: 1%| | 5820/1000000 [14:51:33<2888:08:10, 10.46s/it, lr=1e-5, step_loss=0.00998][RANK-0]: Step: [5820], local_loss=0.021148651838302612, train_loss=0.0528583899140358, time_cost=1.4337763786315918
+
Steps: 1%| | 5820/1000000 [14:51:33<2888:08:10, 10.46s/it, lr=1e-5, step_loss=0.0211]
Steps: 1%| | 5821/1000000 [14:51:39<2462:07:22, 8.92s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [5821], local_loss=0.05328776687383652, train_loss=0.06314137578010559, time_cost=2.538008451461792
+
Steps: 1%| | 5821/1000000 [14:51:39<2462:07:22, 8.92s/it, lr=1e-5, step_loss=0.0533]
Steps: 1%| | 5822/1000000 [14:51:48<2488:44:24, 9.01s/it, lr=1e-5, step_loss=0.0533][RANK-0]: Step: [5822], local_loss=0.03916532173752785, train_loss=0.20337194204330444, time_cost=3.405263662338257
+
Steps: 1%| | 5822/1000000 [14:51:48<2488:44:24, 9.01s/it, lr=1e-5, step_loss=0.0392]
Steps: 1%| | 5823/1000000 [14:51:57<2458:30:07, 8.90s/it, lr=1e-5, step_loss=0.0392][RANK-0]: Step: [5823], local_loss=0.06383701413869858, train_loss=0.05409345030784607, time_cost=3.2209136486053467
+
Steps: 1%| | 5823/1000000 [14:51:57<2458:30:07, 8.90s/it, lr=1e-5, step_loss=0.0638]
Steps: 1%| | 5824/1000000 [14:52:06<2502:33:58, 9.06s/it, lr=1e-5, step_loss=0.0638][RANK-0]: Step: [5824], local_loss=0.02412770316004753, train_loss=0.0366019606590271, time_cost=3.2889297008514404
+
Steps: 1%| | 5824/1000000 [14:52:06<2502:33:58, 9.06s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%| | 5825/1000000 [14:52:11<2178:19:16, 7.89s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [5825], local_loss=0.011446758173406124, train_loss=0.024740399792790413, time_cost=2.1204309463500977
+
Steps: 1%| | 5825/1000000 [14:52:11<2178:19:16, 7.89s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%| | 5826/1000000 [14:52:17<2028:43:48, 7.35s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [5826], local_loss=0.00865369662642479, train_loss=0.13092264533042908, time_cost=1.650606632232666
+
Steps: 1%| | 5826/1000000 [14:52:17<2028:43:48, 7.35s/it, lr=1e-5, step_loss=0.00865]
Steps: 1%| | 5827/1000000 [14:52:29<2430:45:35, 8.80s/it, lr=1e-5, step_loss=0.00865][RANK-0]: Step: [5827], local_loss=0.017091453075408936, train_loss=0.025143256410956383, time_cost=5.4213175773620605
+
Steps: 1%| | 5827/1000000 [14:52:29<2430:45:35, 8.80s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%| | 5828/1000000 [14:52:42<2780:49:40, 10.07s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [5828], local_loss=0.037166908383369446, train_loss=0.08353963494300842, time_cost=9.535238265991211
+
Steps: 1%| | 5828/1000000 [14:52:42<2780:49:40, 10.07s/it, lr=1e-5, step_loss=0.0372]
Steps: 1%| | 5829/1000000 [14:52:57<3136:51:56, 11.36s/it, lr=1e-5, step_loss=0.0372][RANK-0]: Step: [5829], local_loss=0.04176641255617142, train_loss=0.03147484362125397, time_cost=7.1053993701934814
+
Steps: 1%| | 5829/1000000 [14:52:57<3136:51:56, 11.36s/it, lr=1e-5, step_loss=0.0418]
Steps: 1%| | 5830/1000000 [14:53:01<2546:34:37, 9.22s/it, lr=1e-5, step_loss=0.0418][RANK-0]: Step: [5830], local_loss=0.019798604771494865, train_loss=1.4134553670883179, time_cost=1.4661266803741455
+
Steps: 1%| | 5830/1000000 [14:53:01<2546:34:37, 9.22s/it, lr=1e-5, step_loss=0.0198]
Steps: 1%| | 5831/1000000 [14:53:16<2980:13:49, 10.79s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [5831], local_loss=0.015005898661911488, train_loss=0.05437391623854637, time_cost=5.794190168380737
+
Steps: 1%| | 5831/1000000 [14:53:16<2980:13:49, 10.79s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 5832/1000000 [14:53:27<3027:20:36, 10.96s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [5832], local_loss=0.008747200481593609, train_loss=0.04467780143022537, time_cost=1.4504406452178955
+
Steps: 1%| | 5832/1000000 [14:53:27<3027:20:36, 10.96s/it, lr=1e-5, step_loss=0.00875]
Steps: 1%| | 5833/1000000 [14:53:36<2864:29:43, 10.37s/it, lr=1e-5, step_loss=0.00875][RANK-0]: Step: [5833], local_loss=0.03612852841615677, train_loss=0.03404611349105835, time_cost=2.163856267929077
+
Steps: 1%| | 5833/1000000 [14:53:36<2864:29:43, 10.37s/it, lr=1e-5, step_loss=0.0361]
Steps: 1%| | 5834/1000000 [14:53:43<2613:17:13, 9.46s/it, lr=1e-5, step_loss=0.0361][RANK-0]: Step: [5834], local_loss=0.24133537709712982, train_loss=0.05830332264304161, time_cost=2.5256268978118896
+
Steps: 1%| | 5834/1000000 [14:53:43<2613:17:13, 9.46s/it, lr=1e-5, step_loss=0.241]
Steps: 1%| | 5835/1000000 [14:53:59<3111:55:40, 11.27s/it, lr=1e-5, step_loss=0.241][RANK-0]: Step: [5835], local_loss=0.06557457149028778, train_loss=0.03137456625699997, time_cost=7.099566221237183
+
Steps: 1%| | 5835/1000000 [14:53:59<3111:55:40, 11.27s/it, lr=1e-5, step_loss=0.0656]
Steps: 1%| | 5836/1000000 [14:54:12<3293:13:25, 11.93s/it, lr=1e-5, step_loss=0.0656][RANK-0]: Step: [5836], local_loss=0.0238984152674675, train_loss=0.03937086462974548, time_cost=3.879967451095581
+
Steps: 1%| | 5836/1000000 [14:54:12<3293:13:25, 11.93s/it, lr=1e-5, step_loss=0.0239]
Steps: 1%| | 5837/1000000 [14:54:22<3124:48:20, 11.32s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [5837], local_loss=0.008023018017411232, train_loss=0.029934437945485115, time_cost=1.220595359802246
+
Steps: 1%| | 5837/1000000 [14:54:22<3124:48:20, 11.32s/it, lr=1e-5, step_loss=0.00802]
Steps: 1%| | 5838/1000000 [14:54:29<2762:44:23, 10.00s/it, lr=1e-5, step_loss=0.00802][RANK-0]: Step: [5838], local_loss=0.015006858855485916, train_loss=0.030007075518369675, time_cost=5.844790935516357
+
Steps: 1%| | 5838/1000000 [14:54:29<2762:44:23, 10.00s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 5839/1000000 [14:54:33<2292:55:32, 8.30s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [5839], local_loss=0.0850410982966423, train_loss=0.050491295754909515, time_cost=1.2302882671356201
+
Steps: 1%| | 5839/1000000 [14:54:33<2292:55:32, 8.30s/it, lr=1e-5, step_loss=0.085]
Steps: 1%| | 5840/1000000 [14:54:38<2024:57:36, 7.33s/it, lr=1e-5, step_loss=0.085][RANK-0]: Step: [5840], local_loss=0.0649086982011795, train_loss=0.0821368396282196, time_cost=1.8017141819000244
+
Steps: 1%| | 5840/1000000 [14:54:38<2024:57:36, 7.33s/it, lr=1e-5, step_loss=0.0649]
Steps: 1%| | 5841/1000000 [14:54:49<2330:07:35, 8.44s/it, lr=1e-5, step_loss=0.0649][RANK-0]: Step: [5841], local_loss=0.10824795812368393, train_loss=0.03423834964632988, time_cost=3.6609673500061035
+
Steps: 1%| | 5841/1000000 [14:54:49<2330:07:35, 8.44s/it, lr=1e-5, step_loss=0.108]
Steps: 1%| | 5842/1000000 [14:55:04<2812:21:13, 10.18s/it, lr=1e-5, step_loss=0.108][RANK-0]: Step: [5842], local_loss=0.0038542146794497967, train_loss=0.023242544382810593, time_cost=1.2153685092926025
+
Steps: 1%| | 5842/1000000 [14:55:04<2812:21:13, 10.18s/it, lr=1e-5, step_loss=0.00385]
Steps: 1%| | 5843/1000000 [14:55:15<2931:38:01, 10.62s/it, lr=1e-5, step_loss=0.00385][RANK-0]: Step: [5843], local_loss=0.03500506281852722, train_loss=0.019082456827163696, time_cost=5.196576118469238
+
Steps: 1%| | 5843/1000000 [14:55:15<2931:38:01, 10.62s/it, lr=1e-5, step_loss=0.035]
Steps: 1%| | 5844/1000000 [14:55:20<2456:15:47, 8.89s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [5844], local_loss=0.06810614466667175, train_loss=0.10648328810930252, time_cost=2.3711373805999756
+
Steps: 1%| | 5844/1000000 [14:55:20<2456:15:47, 8.89s/it, lr=1e-5, step_loss=0.0681]
Steps: 1%| | 5845/1000000 [14:55:26<2185:51:49, 7.92s/it, lr=1e-5, step_loss=0.0681][RANK-0]: Step: [5845], local_loss=0.025316059589385986, train_loss=0.07318991422653198, time_cost=1.216618299484253
+
Steps: 1%| | 5845/1000000 [14:55:26<2185:51:49, 7.92s/it, lr=1e-5, step_loss=0.0253]
Steps: 1%| | 5846/1000000 [14:55:30<1871:17:32, 6.78s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [5846], local_loss=0.03936957195401192, train_loss=0.05901075899600983, time_cost=1.3633198738098145
+
Steps: 1%| | 5846/1000000 [14:55:30<1871:17:32, 6.78s/it, lr=1e-5, step_loss=0.0394]
Steps: 1%| | 5847/1000000 [14:55:37<1858:59:13, 6.73s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [5847], local_loss=0.008986097760498524, train_loss=0.030086301267147064, time_cost=2.266810417175293
+
Steps: 1%| | 5847/1000000 [14:55:37<1858:59:13, 6.73s/it, lr=1e-5, step_loss=0.00899]
Steps: 1%| | 5848/1000000 [14:55:44<1886:42:10, 6.83s/it, lr=1e-5, step_loss=0.00899][RANK-0]: Step: [5848], local_loss=0.021172616630792618, train_loss=0.032477714121341705, time_cost=1.2426071166992188
+
Steps: 1%| | 5848/1000000 [14:55:44<1886:42:10, 6.83s/it, lr=1e-5, step_loss=0.0212]
Steps: 1%| | 5849/1000000 [14:55:49<1740:30:49, 6.30s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [5849], local_loss=0.028265677392482758, train_loss=0.038495149463415146, time_cost=2.1283583641052246
+
Steps: 1%| | 5849/1000000 [14:55:49<1740:30:49, 6.30s/it, lr=1e-5, step_loss=0.0283]
Steps: 1%| | 5850/1000000 [14:55:59<2102:02:48, 7.61s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [5850], local_loss=0.00825946033000946, train_loss=0.039927348494529724, time_cost=1.2311086654663086
+
Steps: 1%| | 5850/1000000 [14:55:59<2102:02:48, 7.61s/it, lr=1e-5, step_loss=0.00826]
Steps: 1%| | 5851/1000000 [14:56:14<2657:44:48, 9.62s/it, lr=1e-5, step_loss=0.00826][RANK-0]: Step: [5851], local_loss=0.04905828833580017, train_loss=0.03615930676460266, time_cost=2.2620937824249268
+
Steps: 1%| | 5851/1000000 [14:56:14<2657:44:48, 9.62s/it, lr=1e-5, step_loss=0.0491]
Steps: 1%| | 5852/1000000 [14:56:27<2987:49:10, 10.82s/it, lr=1e-5, step_loss=0.0491][RANK-0]: Step: [5852], local_loss=0.02865326777100563, train_loss=0.07516750693321228, time_cost=5.160831689834595
+
Steps: 1%| | 5852/1000000 [14:56:27<2987:49:10, 10.82s/it, lr=1e-5, step_loss=0.0287]
Steps: 1%| | 5853/1000000 [14:56:36<2846:20:19, 10.31s/it, lr=1e-5, step_loss=0.0287][RANK-0]: Step: [5853], local_loss=0.0082866121083498, train_loss=0.03996727988123894, time_cost=1.2684800624847412
+
Steps: 1%| | 5853/1000000 [14:56:36<2846:20:19, 10.31s/it, lr=1e-5, step_loss=0.00829]
Steps: 1%| | 5854/1000000 [14:56:42<2490:00:05, 9.02s/it, lr=1e-5, step_loss=0.00829][RANK-0]: Step: [5854], local_loss=0.027444135397672653, train_loss=0.08094841241836548, time_cost=1.4862048625946045
+
Steps: 1%| | 5854/1000000 [14:56:42<2490:00:05, 9.02s/it, lr=1e-5, step_loss=0.0274]
Steps: 1%| | 5855/1000000 [14:56:57<2912:17:09, 10.55s/it, lr=1e-5, step_loss=0.0274][RANK-0]: Step: [5855], local_loss=0.017693743109703064, train_loss=0.02592446282505989, time_cost=5.012348890304565
+
Steps: 1%| | 5855/1000000 [14:56:57<2912:17:09, 10.55s/it, lr=1e-5, step_loss=0.0177]
Steps: 1%| | 5856/1000000 [14:57:02<2486:32:06, 9.00s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [5856], local_loss=0.01740090921521187, train_loss=0.04751712083816528, time_cost=2.5218076705932617
+
Steps: 1%| | 5856/1000000 [14:57:02<2486:32:06, 9.00s/it, lr=1e-5, step_loss=0.0174]
Steps: 1%| | 5857/1000000 [14:57:12<2556:10:34, 9.26s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [5857], local_loss=0.02337033301591873, train_loss=0.022159095853567123, time_cost=3.696946620941162
+
Steps: 1%| | 5857/1000000 [14:57:12<2556:10:34, 9.26s/it, lr=1e-5, step_loss=0.0234]
Steps: 1%| | 5858/1000000 [14:57:19<2403:18:19, 8.70s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [5858], local_loss=0.02996591478586197, train_loss=0.019670069217681885, time_cost=5.897353649139404
+
Steps: 1%| | 5858/1000000 [14:57:19<2403:18:19, 8.70s/it, lr=1e-5, step_loss=0.03]
Steps: 1%| | 5859/1000000 [14:57:32<2763:08:51, 10.01s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [5859], local_loss=0.04786594584584236, train_loss=0.021226881071925163, time_cost=3.7603862285614014
+
Steps: 1%| | 5859/1000000 [14:57:32<2763:08:51, 10.01s/it, lr=1e-5, step_loss=0.0479]
Steps: 1%| | 5860/1000000 [14:57:37<2347:20:31, 8.50s/it, lr=1e-5, step_loss=0.0479][RANK-0]: Step: [5860], local_loss=0.010491752065718174, train_loss=0.04214603453874588, time_cost=1.9944040775299072
+
Steps: 1%| | 5860/1000000 [14:57:37<2347:20:31, 8.50s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%| | 5861/1000000 [14:57:48<2573:45:01, 9.32s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [5861], local_loss=0.029986131936311722, train_loss=0.21017076075077057, time_cost=8.157348155975342
+
Steps: 1%| | 5861/1000000 [14:57:48<2573:45:01, 9.32s/it, lr=1e-5, step_loss=0.03]
Steps: 1%| | 5862/1000000 [14:57:58<2625:26:03, 9.51s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [5862], local_loss=0.1349439173936844, train_loss=0.03776833415031433, time_cost=1.2358481884002686
+
Steps: 1%| | 5862/1000000 [14:57:58<2625:26:03, 9.51s/it, lr=1e-5, step_loss=0.135]
Steps: 1%| | 5863/1000000 [14:58:13<3048:53:29, 11.04s/it, lr=1e-5, step_loss=0.135][RANK-0]: Step: [5863], local_loss=0.013779032044112682, train_loss=0.0513962060213089, time_cost=1.7384388446807861
+
Steps: 1%| | 5863/1000000 [14:58:13<3048:53:29, 11.04s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%| | 5864/1000000 [14:58:20<2740:07:29, 9.92s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [5864], local_loss=0.023147130385041237, train_loss=0.022958865389227867, time_cost=1.2660510540008545
+
Steps: 1%| | 5864/1000000 [14:58:20<2740:07:29, 9.92s/it, lr=1e-5, step_loss=0.0231]
Steps: 1%| | 5865/1000000 [14:58:30<2692:43:41, 9.75s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [5865], local_loss=0.06091904640197754, train_loss=0.037289783358573914, time_cost=1.3149263858795166
+
Steps: 1%| | 5865/1000000 [14:58:30<2692:43:41, 9.75s/it, lr=1e-5, step_loss=0.0609]
Steps: 1%| | 5866/1000000 [14:58:37<2467:48:24, 8.94s/it, lr=1e-5, step_loss=0.0609][RANK-0]: Step: [5866], local_loss=0.054936304688453674, train_loss=0.04106822982430458, time_cost=1.2277374267578125
+
Steps: 1%| | 5866/1000000 [14:58:37<2467:48:24, 8.94s/it, lr=1e-5, step_loss=0.0549]
Steps: 1%| | 5867/1000000 [14:58:56<3290:22:43, 11.92s/it, lr=1e-5, step_loss=0.0549][RANK-0]: Step: [5867], local_loss=0.06947940587997437, train_loss=0.029004545882344246, time_cost=11.26216173171997
+
Steps: 1%| | 5867/1000000 [14:58:56<3290:22:43, 11.92s/it, lr=1e-5, step_loss=0.0695]
Steps: 1%| | 5868/1000000 [14:59:08<3350:32:18, 12.13s/it, lr=1e-5, step_loss=0.0695][RANK-0]: Step: [5868], local_loss=0.019219782203435898, train_loss=0.046137817203998566, time_cost=2.2226366996765137
+
Steps: 1%| | 5868/1000000 [14:59:08<3350:32:18, 12.13s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%| | 5869/1000000 [14:59:20<3293:58:45, 11.93s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [5869], local_loss=0.04157910868525505, train_loss=0.1500704139471054, time_cost=2.565509080886841
+
Steps: 1%| | 5869/1000000 [14:59:20<3293:58:45, 11.93s/it, lr=1e-5, step_loss=0.0416]
Steps: 1%| | 5870/1000000 [14:59:35<3569:34:10, 12.93s/it, lr=1e-5, step_loss=0.0416][RANK-0]: Step: [5870], local_loss=0.04422176256775856, train_loss=0.020341353490948677, time_cost=5.933613538742065
+
Steps: 1%| | 5870/1000000 [14:59:35<3569:34:10, 12.93s/it, lr=1e-5, step_loss=0.0442]
Steps: 1%| | 5871/1000000 [14:59:43<3174:27:40, 11.50s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [5871], local_loss=0.02668357640504837, train_loss=0.0337228961288929, time_cost=4.198460340499878
+
Steps: 1%| | 5871/1000000 [14:59:43<3174:27:40, 11.50s/it, lr=1e-5, step_loss=0.0267]
Steps: 1%| | 5872/1000000 [14:59:58<3486:20:53, 12.62s/it, lr=1e-5, step_loss=0.0267][RANK-0]: Step: [5872], local_loss=0.45465385913848877, train_loss=0.10321104526519775, time_cost=6.499813795089722
+
Steps: 1%| | 5872/1000000 [14:59:58<3486:20:53, 12.62s/it, lr=1e-5, step_loss=0.455]
Steps: 1%| | 5873/1000000 [15:00:16<3926:00:54, 14.22s/it, lr=1e-5, step_loss=0.455][RANK-0]: Step: [5873], local_loss=0.013671628199517727, train_loss=0.06413879245519638, time_cost=10.128344535827637
+
Steps: 1%| | 5873/1000000 [15:00:16<3926:00:54, 14.22s/it, lr=1e-5, step_loss=0.0137]
Steps: 1%| | 5874/1000000 [15:00:24<3407:11:58, 12.34s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [5874], local_loss=0.013915936462581158, train_loss=0.062404319643974304, time_cost=6.4970550537109375
+
Steps: 1%| | 5874/1000000 [15:00:24<3407:11:58, 12.34s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%| | 5875/1000000 [15:00:31<2981:25:48, 10.80s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [5875], local_loss=0.12529364228248596, train_loss=0.05434410274028778, time_cost=1.1970582008361816
+
Steps: 1%| | 5875/1000000 [15:00:31<2981:25:48, 10.80s/it, lr=1e-5, step_loss=0.125]
Steps: 1%| | 5876/1000000 [15:00:43<3012:16:36, 10.91s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [5876], local_loss=0.024847282096743584, train_loss=0.03769935667514801, time_cost=3.5761518478393555
+
Steps: 1%| | 5876/1000000 [15:00:43<3012:16:36, 10.91s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%| | 5877/1000000 [15:00:57<3277:06:56, 11.87s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [5877], local_loss=0.03741823881864548, train_loss=0.05547221750020981, time_cost=4.37739109992981
+
Steps: 1%| | 5877/1000000 [15:00:57<3277:06:56, 11.87s/it, lr=1e-5, step_loss=0.0374]
Steps: 1%| | 5878/1000000 [15:01:04<2880:48:03, 10.43s/it, lr=1e-5, step_loss=0.0374][RANK-0]: Step: [5878], local_loss=0.0785052478313446, train_loss=0.03196588158607483, time_cost=1.2088463306427002
+
Steps: 1%| | 5878/1000000 [15:01:04<2880:48:03, 10.43s/it, lr=1e-5, step_loss=0.0785]
Steps: 1%| | 5879/1000000 [15:01:12<2732:17:03, 9.89s/it, lr=1e-5, step_loss=0.0785][RANK-0]: Step: [5879], local_loss=0.013038231059908867, train_loss=0.03161146491765976, time_cost=3.1055538654327393
+
Steps: 1%| | 5879/1000000 [15:01:12<2732:17:03, 9.89s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 5880/1000000 [15:01:18<2349:09:48, 8.51s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [5880], local_loss=0.01804884895682335, train_loss=0.08464620262384415, time_cost=2.5699338912963867
+
Steps: 1%| | 5880/1000000 [15:01:18<2349:09:48, 8.51s/it, lr=1e-5, step_loss=0.018]
[log condensed — steps 5881–5999 of 1000000, lr=1e-5. Each step emits one "[RANK-0]: Step: [N], local_loss=…, train_loss=…, time_cost=…" record followed by a duplicate tqdm bar left over from carriage-return rendering. Across this span the rate varies between ~6.2 and ~12.8 s/it (elapsed 15:01:23→15:19:18) and time_cost between 1.2 and 12.9 s; local_loss stays mostly within 0.004–0.35 apart from isolated spikes (0.467 at step 5910, ~1.01–1.02 at steps 5938 and 5951), and train_loss mostly within 0.018–0.19 with a single outlier of 7.69 at step 5887. A minimal parser for these records follows.]
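Each retained record follows a fixed pattern, so the metrics can be recovered programmatically. Below is a minimal parsing sketch — it assumes exactly the "[RANK-0]: Step: [N], local_loss=…, train_loss=…, time_cost=…" format shown in this log, and the file name is hypothetical:

import re

# Matches e.g. "Step: [5881], local_loss=0.047..., train_loss=0.089..., time_cost=1.23..."
STEP_RE = re.compile(
    r"Step: \[(\d+)\], local_loss=([0-9.eE+-]+), "
    r"train_loss=([0-9.eE+-]+), time_cost=([0-9.eE+-]+)"
)

def parse_log(path):
    """Yield one (step, local_loss, train_loss, time_cost) tuple per step record."""
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = STEP_RE.search(line)
            if m:
                yield (int(m.group(1)), float(m.group(2)),
                       float(m.group(3)), float(m.group(4)))

records = list(parse_log("training_log.txt"))  # hypothetical log file name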
Steps: 1%| | 6000/1000000 [15:19:25<2763:41:41, 10.01s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [6000], local_loss=0.014911560341715813, train_loss=0.017984768375754356, time_cost=2.948049306869507
+09/19/2024 14:29:18 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000
+09/19/2024 14:29:18 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 14:29:21,554] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-19 14:29:21,584] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 14:29:21,585] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 14:29:43,911] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 14:29:43,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 14:30:14,279] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:14,280] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:14,280] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:15,173] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:15,173] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:15,173] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:19,231] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:19,232] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:19,232] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:19,400] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:19,400] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:19,401] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:19,821] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:19,890] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:19,890] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:20,545] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:20,545] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:20,545] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:20,596] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:20,596] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:20,596] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 14:30:20,767] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 14:30:20,768] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 14:30:20,768] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 14:30:20 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/model/diffusion_pytorch_model.safetensors
+09/19/2024 14:31:46 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/scheduler.bin
+09/19/2024 14:31:46 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/sampler.bin
+09/19/2024 14:31:46 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000/random_states_0.pkl
+09/19/2024 14:31:46 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-6000
+
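The block above is the standard Accelerate-plus-DeepSpeed save path: accelerator.save_state hands the model and the ZeRO-sharded optimizer to DeepSpeed's torch checkpoint engine (one bf16_zero_pp_rank_*_optim_states.pt per rank), after which Accelerate writes the scheduler, sampler and RNG states, while the model/ and model_ema/ folders (config.json plus diffusion_pytorch_model.safetensors) come from diffusers-style save_pretrained hooks. A minimal sketch of that call sequence — variable names and hook wiring are illustrative, not taken from this repo:

import os
from accelerate import Accelerator

accelerator = Accelerator()  # DeepSpeed config is supplied via `accelerate launch`

def save_checkpoint(step, output_dir, transformer, ema_model):
    path = os.path.join(output_dir, f"checkpoint-{step}")
    # Writes pytorch_model/mp_rank_00_model_states.pt plus one
    # bf16_zero_pp_rank_<r>_mp_rank_00_optim_states.pt per rank, then
    # scheduler.bin, sampler.bin and random_states_0.pkl, as logged above.
    accelerator.save_state(path)
    if accelerator.is_main_process:
        # Each save_pretrained call emits config.json and
        # diffusion_pytorch_model.safetensors, matching the model/ and
        # model_ema/ entries in the log.
        accelerator.unwrap_model(transformer).save_pretrained(os.path.join(path, "model"))
        ema_model.save_pretrained(os.path.join(path, "model_ema"))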
[log condensed — steps 6001–6075 of 1000000, lr=1e-5, same per-step format as above. The displayed rate jumps to 54.13 s/it at step 6001 because the ~2.5-minute checkpoint write above is folded into tqdm's smoothed average; it decays back through 40.90, 30.74, 24.87 … to the usual ~8–12 s/it within about ten steps. Losses behave as before: local_loss mostly 0.008–0.35 (spikes of 0.390 at step 6013, 1.003 at step 6062 and 0.462 at step 6073), train_loss mostly 0.015–0.19 with outliers of 33.75 at step 6011, 14.16 at step 6023, 14.30 at step 6040 and 5.69 at step 6073; time_cost 1.2–12.3 s, elapsed 15:22:02→15:33:50. A sketch that surfaces such train_loss outliers follows.]
+
Steps: 1%| | 6075/1000000 [15:33:50<2276:00:10, 8.24s/it, lr=1e-5, step_loss=0.0341]
Steps: 1%| | 6076/1000000 [15:33:55<2000:23:17, 7.25s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [6076], local_loss=0.0420183502137661, train_loss=0.04386911168694496, time_cost=2.212737798690796
+
Steps: 1%| | 6076/1000000 [15:33:55<2000:23:17, 7.25s/it, lr=1e-5, step_loss=0.042]
Steps: 1%| | 6077/1000000 [15:34:02<1995:54:50, 7.23s/it, lr=1e-5, step_loss=0.042][RANK-0]: Step: [6077], local_loss=0.008104236796498299, train_loss=0.04667747765779495, time_cost=2.900949001312256
+
Steps: 1%| | 6077/1000000 [15:34:02<1995:54:50, 7.23s/it, lr=1e-5, step_loss=0.0081]
Steps: 1%| | 6078/1000000 [15:34:15<2484:11:49, 9.00s/it, lr=1e-5, step_loss=0.0081][RANK-0]: Step: [6078], local_loss=0.011604483239352703, train_loss=12.91347885131836, time_cost=4.087886810302734
+
Steps: 1%| | 6078/1000000 [15:34:15<2484:11:49, 9.00s/it, lr=1e-5, step_loss=0.0116]
Steps: 1%| | 6079/1000000 [15:34:24<2477:58:13, 8.98s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [6079], local_loss=0.013805790804326534, train_loss=0.15492351353168488, time_cost=2.192721128463745
+
Steps: 1%| | 6079/1000000 [15:34:24<2477:58:13, 8.98s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%| | 6080/1000000 [15:34:34<2591:41:29, 9.39s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [6080], local_loss=0.042018644511699677, train_loss=107.0914306640625, time_cost=1.3783068656921387
+
Steps: 1%| | 6080/1000000 [15:34:34<2591:41:29, 9.39s/it, lr=1e-5, step_loss=0.042]
Steps: 1%| | 6081/1000000 [15:34:40<2321:54:45, 8.41s/it, lr=1e-5, step_loss=0.042][RANK-0]: Step: [6081], local_loss=0.062242291867733, train_loss=0.05536477267742157, time_cost=1.97267746925354
+
Steps: 1%| | 6081/1000000 [15:34:40<2321:54:45, 8.41s/it, lr=1e-5, step_loss=0.0622]
Steps: 1%| | 6082/1000000 [15:34:54<2744:49:50, 9.94s/it, lr=1e-5, step_loss=0.0622][RANK-0]: Step: [6082], local_loss=0.027803458273410797, train_loss=0.03514465317130089, time_cost=5.9400794506073
+
Steps: 1%| | 6082/1000000 [15:34:54<2744:49:50, 9.94s/it, lr=1e-5, step_loss=0.0278]
Steps: 1%| | 6083/1000000 [15:34:58<2306:35:53, 8.35s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [6083], local_loss=0.012072564102709293, train_loss=0.02461843192577362, time_cost=2.004981279373169
+
Steps: 1%| | 6083/1000000 [15:34:58<2306:35:53, 8.35s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%| | 6084/1000000 [15:35:05<2192:26:59, 7.94s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [6084], local_loss=0.01252156961709261, train_loss=0.033390358090400696, time_cost=2.7544593811035156
+
Steps: 1%| | 6084/1000000 [15:35:05<2192:26:59, 7.94s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 6085/1000000 [15:35:11<2018:18:49, 7.31s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [6085], local_loss=0.01886674575507641, train_loss=0.1483495831489563, time_cost=1.3857355117797852
+
Steps: 1%| | 6085/1000000 [15:35:11<2018:18:49, 7.31s/it, lr=1e-5, step_loss=0.0189]
Steps: 1%| | 6086/1000000 [15:35:24<2453:41:03, 8.89s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [6086], local_loss=0.05336763337254524, train_loss=0.04947555810213089, time_cost=4.485278129577637
+
Steps: 1%| | 6086/1000000 [15:35:24<2453:41:03, 8.89s/it, lr=1e-5, step_loss=0.0534]
Steps: 1%| | 6087/1000000 [15:35:34<2541:53:34, 9.21s/it, lr=1e-5, step_loss=0.0534][RANK-0]: Step: [6087], local_loss=0.009672748856246471, train_loss=0.03217067942023277, time_cost=4.805814504623413
+
Steps: 1%| | 6087/1000000 [15:35:34<2541:53:34, 9.21s/it, lr=1e-5, step_loss=0.00967]
Steps: 1%| | 6088/1000000 [15:35:39<2212:55:20, 8.02s/it, lr=1e-5, step_loss=0.00967][RANK-0]: Step: [6088], local_loss=0.011757219210267067, train_loss=0.13356705009937286, time_cost=2.2301526069641113
+
Steps: 1%| | 6088/1000000 [15:35:39<2212:55:20, 8.02s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 6089/1000000 [15:35:47<2185:10:50, 7.91s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [6089], local_loss=0.029257675632834435, train_loss=0.06115810200572014, time_cost=4.598072290420532
+
Steps: 1%| | 6089/1000000 [15:35:47<2185:10:50, 7.91s/it, lr=1e-5, step_loss=0.0293]
Steps: 1%| | 6090/1000000 [15:35:58<2428:41:54, 8.80s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [6090], local_loss=0.12493038922548294, train_loss=2.3882181644439697, time_cost=3.2896194458007812
+
Steps: 1%| | 6090/1000000 [15:35:58<2428:41:54, 8.80s/it, lr=1e-5, step_loss=0.125]
Steps: 1%| | 6091/1000000 [15:36:06<2426:44:46, 8.79s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [6091], local_loss=131.52642822265625, train_loss=16.48273468017578, time_cost=3.2816054821014404
+
Steps: 1%| | 6091/1000000 [15:36:06<2426:44:46, 8.79s/it, lr=1e-5, step_loss=132]
Steps: 1%| | 6092/1000000 [15:36:14<2297:00:17, 8.32s/it, lr=1e-5, step_loss=132][RANK-0]: Step: [6092], local_loss=0.20367474853992462, train_loss=0.16644351184368134, time_cost=3.0494017601013184
+
Steps: 1%| | 6092/1000000 [15:36:14<2297:00:17, 8.32s/it, lr=1e-5, step_loss=0.204]
Steps: 1%| | 6093/1000000 [15:36:20<2176:38:38, 7.88s/it, lr=1e-5, step_loss=0.204][RANK-0]: Step: [6093], local_loss=0.015205499716103077, train_loss=0.05803082510828972, time_cost=1.332108497619629
+
Steps: 1%| | 6093/1000000 [15:36:20<2176:38:38, 7.88s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%| | 6094/1000000 [15:36:25<1889:46:18, 6.84s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [6094], local_loss=0.035449057817459106, train_loss=0.24103708565235138, time_cost=1.3198611736297607
+
Steps: 1%| | 6094/1000000 [15:36:25<1889:46:18, 6.84s/it, lr=1e-5, step_loss=0.0354]
Steps: 1%| | 6095/1000000 [15:36:31<1794:35:32, 6.50s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [6095], local_loss=0.11779028177261353, train_loss=0.16359879076480865, time_cost=4.830090045928955
+
Steps: 1%| | 6095/1000000 [15:36:31<1794:35:32, 6.50s/it, lr=1e-5, step_loss=0.118]
Steps: 1%| | 6096/1000000 [15:36:35<1615:45:18, 5.85s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [6096], local_loss=0.07899744063615799, train_loss=0.13368827104568481, time_cost=1.1980936527252197
+
Steps: 1%| | 6096/1000000 [15:36:35<1615:45:18, 5.85s/it, lr=1e-5, step_loss=0.079]
Steps: 1%| | 6097/1000000 [15:36:47<2134:01:01, 7.73s/it, lr=1e-5, step_loss=0.079][RANK-0]: Step: [6097], local_loss=0.034979548305273056, train_loss=0.04552014544606209, time_cost=5.730809688568115
+
Steps: 1%| | 6097/1000000 [15:36:47<2134:01:01, 7.73s/it, lr=1e-5, step_loss=0.035]
Steps: 1%| | 6098/1000000 [15:37:00<2573:02:21, 9.32s/it, lr=1e-5, step_loss=0.035][RANK-0]: Step: [6098], local_loss=0.05690869688987732, train_loss=0.07374338805675507, time_cost=4.180771589279175
+
Steps: 1%| | 6098/1000000 [15:37:00<2573:02:21, 9.32s/it, lr=1e-5, step_loss=0.0569]
Steps: 1%| | 6099/1000000 [15:37:11<2708:51:49, 9.81s/it, lr=1e-5, step_loss=0.0569][RANK-0]: Step: [6099], local_loss=0.02685033343732357, train_loss=0.14576520025730133, time_cost=2.8932766914367676
+
Steps: 1%| | 6099/1000000 [15:37:11<2708:51:49, 9.81s/it, lr=1e-5, step_loss=0.0269]
Steps: 1%| | 6100/1000000 [15:37:25<3044:43:57, 11.03s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [6100], local_loss=0.07215402275323868, train_loss=0.030346151441335678, time_cost=4.8198065757751465
+
Steps: 1%| | 6100/1000000 [15:37:25<3044:43:57, 11.03s/it, lr=1e-5, step_loss=0.0722]
Steps: 1%| | 6101/1000000 [15:37:29<2475:46:45, 8.97s/it, lr=1e-5, step_loss=0.0722][RANK-0]: Step: [6101], local_loss=0.05082276090979576, train_loss=21.12428855895996, time_cost=1.2374513149261475
+
Steps: 1%| | 6101/1000000 [15:37:29<2475:46:45, 8.97s/it, lr=1e-5, step_loss=0.0508]
Steps: 1%| | 6102/1000000 [15:37:41<2691:06:27, 9.75s/it, lr=1e-5, step_loss=0.0508][RANK-0]: Step: [6102], local_loss=0.011382794007658958, train_loss=0.055729761719703674, time_cost=2.069030523300171
+
Steps: 1%| | 6102/1000000 [15:37:41<2691:06:27, 9.75s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%| | 6103/1000000 [15:37:48<2484:09:10, 9.00s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [6103], local_loss=0.02771080657839775, train_loss=0.07492153346538544, time_cost=1.542560338973999
+
Steps: 1%| | 6103/1000000 [15:37:48<2484:09:10, 9.00s/it, lr=1e-5, step_loss=0.0277]
Steps: 1%| | 6104/1000000 [15:37:53<2155:07:50, 7.81s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [6104], local_loss=0.01185703370720148, train_loss=0.03019038401544094, time_cost=1.294811725616455
+
Steps: 1%| | 6104/1000000 [15:37:53<2155:07:50, 7.81s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 6105/1000000 [15:38:04<2439:44:51, 8.84s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [6105], local_loss=0.03252590820193291, train_loss=1.72049081325531, time_cost=2.151801586151123
+
Steps: 1%| | 6105/1000000 [15:38:04<2439:44:51, 8.84s/it, lr=1e-5, step_loss=0.0325]
Steps: 1%| | 6106/1000000 [15:38:15<2640:37:43, 9.56s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [6106], local_loss=0.012455690652132034, train_loss=0.02210448682308197, time_cost=1.8358731269836426
+
Steps: 1%| | 6106/1000000 [15:38:15<2640:37:43, 9.56s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 6107/1000000 [15:38:20<2241:31:19, 8.12s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [6107], local_loss=0.017636602744460106, train_loss=0.02813829481601715, time_cost=3.571145534515381
+
Steps: 1%| | 6107/1000000 [15:38:20<2241:31:19, 8.12s/it, lr=1e-5, step_loss=0.0176]
Steps: 1%| | 6108/1000000 [15:38:26<2045:12:47, 7.41s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [6108], local_loss=0.023209502920508385, train_loss=0.032637886703014374, time_cost=1.3132169246673584
+
Steps: 1%| | 6108/1000000 [15:38:26<2045:12:47, 7.41s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%| | 6109/1000000 [15:38:36<2233:28:00, 8.09s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [6109], local_loss=0.04972817376255989, train_loss=0.02960575744509697, time_cost=3.61862850189209
+
Steps: 1%| | 6109/1000000 [15:38:36<2233:28:00, 8.09s/it, lr=1e-5, step_loss=0.0497]
Steps: 1%| | 6110/1000000 [15:38:41<2027:18:09, 7.34s/it, lr=1e-5, step_loss=0.0497][RANK-0]: Step: [6110], local_loss=0.025457940995693207, train_loss=0.09472023695707321, time_cost=2.927370309829712
+
Steps: 1%| | 6110/1000000 [15:38:41<2027:18:09, 7.34s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%| | 6111/1000000 [15:38:53<2379:20:52, 8.62s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [6111], local_loss=0.11037692427635193, train_loss=0.2008047103881836, time_cost=3.8252973556518555
+
Steps: 1%| | 6111/1000000 [15:38:53<2379:20:52, 8.62s/it, lr=1e-5, step_loss=0.11]
Steps: 1%| | 6112/1000000 [15:38:59<2141:13:16, 7.76s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [6112], local_loss=0.06521313637495041, train_loss=0.05386361479759216, time_cost=1.4400105476379395
+
Steps: 1%| | 6112/1000000 [15:38:59<2141:13:16, 7.76s/it, lr=1e-5, step_loss=0.0652]
Steps: 1%| | 6113/1000000 [15:39:10<2454:37:09, 8.89s/it, lr=1e-5, step_loss=0.0652][RANK-0]: Step: [6113], local_loss=0.0482051707804203, train_loss=0.06016208976507187, time_cost=4.830028057098389
+
Steps: 1%| | 6113/1000000 [15:39:10<2454:37:09, 8.89s/it, lr=1e-5, step_loss=0.0482]
Steps: 1%| | 6114/1000000 [15:39:21<2604:40:15, 9.43s/it, lr=1e-5, step_loss=0.0482][RANK-0]: Step: [6114], local_loss=0.025332007557153702, train_loss=0.02247917652130127, time_cost=1.2540929317474365
+
Steps: 1%| | 6114/1000000 [15:39:21<2604:40:15, 9.43s/it, lr=1e-5, step_loss=0.0253]
Steps: 1%| | 6115/1000000 [15:39:37<3131:29:33, 11.34s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [6115], local_loss=0.019763145595788956, train_loss=0.03549175336956978, time_cost=8.147032260894775
+
Steps: 1%| | 6115/1000000 [15:39:37<3131:29:33, 11.34s/it, lr=1e-5, step_loss=0.0198]
Steps: 1%| | 6116/1000000 [15:39:49<3188:56:29, 11.55s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [6116], local_loss=0.08484333753585815, train_loss=0.02437232993543148, time_cost=3.6584222316741943
+
Steps: 1%| | 6116/1000000 [15:39:49<3188:56:29, 11.55s/it, lr=1e-5, step_loss=0.0848]
Steps: 1%| | 6117/1000000 [15:39:56<2806:50:33, 10.17s/it, lr=1e-5, step_loss=0.0848][RANK-0]: Step: [6117], local_loss=0.014433128759264946, train_loss=0.02587289735674858, time_cost=2.3954243659973145
+
Steps: 1%| | 6117/1000000 [15:39:56<2806:50:33, 10.17s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 6118/1000000 [15:40:11<3213:02:08, 11.64s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [6118], local_loss=0.029852859675884247, train_loss=0.16187739372253418, time_cost=7.3300111293792725
+
Steps: 1%| | 6118/1000000 [15:40:11<3213:02:08, 11.64s/it, lr=1e-5, step_loss=0.0299]
Steps: 1%| | 6119/1000000 [15:40:24<3366:09:53, 12.19s/it, lr=1e-5, step_loss=0.0299][RANK-0]: Step: [6119], local_loss=0.021203387528657913, train_loss=0.020686451345682144, time_cost=1.2706689834594727
+
Steps: 1%| | 6119/1000000 [15:40:24<3366:09:53, 12.19s/it, lr=1e-5, step_loss=0.0212]
Steps: 1%| | 6120/1000000 [15:40:37<3406:12:36, 12.34s/it, lr=1e-5, step_loss=0.0212][RANK-0]: Step: [6120], local_loss=0.011463379487395287, train_loss=0.017891211435198784, time_cost=4.195315837860107
+
Steps: 1%| | 6120/1000000 [15:40:37<3406:12:36, 12.34s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%| | 6121/1000000 [15:40:46<3148:57:00, 11.41s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [6121], local_loss=0.02458573319017887, train_loss=0.04020044207572937, time_cost=3.5409936904907227
+
Steps: 1%| | 6121/1000000 [15:40:46<3148:57:00, 11.41s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%| | 6122/1000000 [15:40:53<2809:25:02, 10.18s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [6122], local_loss=0.10654646903276443, train_loss=0.04595167934894562, time_cost=3.1594386100769043
+
Steps: 1%| | 6122/1000000 [15:40:53<2809:25:02, 10.18s/it, lr=1e-5, step_loss=0.107]
Steps: 1%| | 6123/1000000 [15:40:58<2327:43:06, 8.43s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [6123], local_loss=0.011932115070521832, train_loss=0.04627855494618416, time_cost=1.3519868850708008
+
Steps: 1%| | 6123/1000000 [15:40:58<2327:43:06, 8.43s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 6124/1000000 [15:41:11<2753:20:59, 9.97s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [6124], local_loss=0.0903279259800911, train_loss=0.06563635170459747, time_cost=3.727195978164673
+
Steps: 1%| | 6124/1000000 [15:41:11<2753:20:59, 9.97s/it, lr=1e-5, step_loss=0.0903]
Steps: 1%| | 6125/1000000 [15:41:20<2669:46:51, 9.67s/it, lr=1e-5, step_loss=0.0903][RANK-0]: Step: [6125], local_loss=0.015773266553878784, train_loss=0.04210320860147476, time_cost=2.9715847969055176
+
Steps: 1%| | 6125/1000000 [15:41:20<2669:46:51, 9.67s/it, lr=1e-5, step_loss=0.0158]
Steps: 1%| | 6126/1000000 [15:41:28<2489:41:16, 9.02s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [6126], local_loss=0.009099218063056469, train_loss=0.023356663063168526, time_cost=1.6524784564971924
+
Steps: 1%| | 6126/1000000 [15:41:28<2489:41:16, 9.02s/it, lr=1e-5, step_loss=0.0091]
Steps: 1%| | 6127/1000000 [15:41:39<2686:38:21, 9.73s/it, lr=1e-5, step_loss=0.0091][RANK-0]: Step: [6127], local_loss=0.025032665580511093, train_loss=0.07664375007152557, time_cost=1.5345964431762695
+
Steps: 1%| | 6127/1000000 [15:41:39<2686:38:21, 9.73s/it, lr=1e-5, step_loss=0.025]
Steps: 1%| | 6128/1000000 [15:41:52<2981:52:36, 10.80s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [6128], local_loss=0.021521063521504402, train_loss=0.045027848333120346, time_cost=5.064691543579102
+
Steps: 1%| | 6128/1000000 [15:41:52<2981:52:36, 10.80s/it, lr=1e-5, step_loss=0.0215]
Steps: 1%| | 6129/1000000 [15:41:57<2508:08:40, 9.09s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [6129], local_loss=0.009511656127870083, train_loss=0.020860159769654274, time_cost=4.231943607330322
+
Steps: 1%| | 6129/1000000 [15:41:57<2508:08:40, 9.09s/it, lr=1e-5, step_loss=0.00951]
Steps: 1%| | 6130/1000000 [15:42:11<2861:53:23, 10.37s/it, lr=1e-5, step_loss=0.00951][RANK-0]: Step: [6130], local_loss=0.03797248750925064, train_loss=0.058351632207632065, time_cost=5.363449335098267
+
Steps: 1%| | 6130/1000000 [15:42:11<2861:53:23, 10.37s/it, lr=1e-5, step_loss=0.038]
Steps: 1%| | 6131/1000000 [15:42:17<2506:46:43, 9.08s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [6131], local_loss=0.023361247032880783, train_loss=0.01674213446676731, time_cost=1.7053821086883545
+
Steps: 1%| | 6131/1000000 [15:42:17<2506:46:43, 9.08s/it, lr=1e-5, step_loss=0.0234]
Steps: 1%| | 6132/1000000 [15:42:28<2669:47:46, 9.67s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [6132], local_loss=0.07748536020517349, train_loss=0.04550785571336746, time_cost=1.2198765277862549
+
Steps: 1%| | 6132/1000000 [15:42:28<2669:47:46, 9.67s/it, lr=1e-5, step_loss=0.0775]
Steps: 1%| | 6133/1000000 [15:42:35<2441:58:05, 8.85s/it, lr=1e-5, step_loss=0.0775][RANK-0]: Step: [6133], local_loss=0.011255789548158646, train_loss=0.021320633590221405, time_cost=3.347801446914673
+
Steps: 1%| | 6133/1000000 [15:42:35<2441:58:05, 8.85s/it, lr=1e-5, step_loss=0.0113]
Steps: 1%| | 6134/1000000 [15:42:41<2188:02:20, 7.93s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [6134], local_loss=0.5624924302101135, train_loss=0.21108078956604004, time_cost=1.4700937271118164
+
Steps: 1%| | 6134/1000000 [15:42:41<2188:02:20, 7.93s/it, lr=1e-5, step_loss=0.562]
Steps: 1%| | 6135/1000000 [15:42:48<2118:52:52, 7.68s/it, lr=1e-5, step_loss=0.562][RANK-0]: Step: [6135], local_loss=0.011362847872078419, train_loss=0.026210449635982513, time_cost=2.576571226119995
+
Steps: 1%| | 6135/1000000 [15:42:48<2118:52:52, 7.68s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%| | 6136/1000000 [15:42:53<1928:47:34, 6.99s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [6136], local_loss=0.02385392226278782, train_loss=0.025619173422455788, time_cost=1.2292983531951904
+
Steps: 1%| | 6136/1000000 [15:42:53<1928:47:34, 6.99s/it, lr=1e-5, step_loss=0.0239]
Steps: 1%| | 6137/1000000 [15:43:02<2066:57:53, 7.49s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [6137], local_loss=0.03907260298728943, train_loss=0.03607413172721863, time_cost=5.176236867904663
+
Steps: 1%| | 6137/1000000 [15:43:02<2066:57:53, 7.49s/it, lr=1e-5, step_loss=0.0391]
Steps: 1%| | 6138/1000000 [15:43:11<2188:27:40, 7.93s/it, lr=1e-5, step_loss=0.0391][RANK-0]: Step: [6138], local_loss=0.04596780985593796, train_loss=0.04431254416704178, time_cost=1.738128423690796
+
Steps: 1%| | 6138/1000000 [15:43:11<2188:27:40, 7.93s/it, lr=1e-5, step_loss=0.046]
Steps: 1%| | 6139/1000000 [15:43:18<2142:56:42, 7.76s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [6139], local_loss=0.044579893350601196, train_loss=0.04630658030509949, time_cost=3.743612289428711
+
Steps: 1%| | 6139/1000000 [15:43:18<2142:56:42, 7.76s/it, lr=1e-5, step_loss=0.0446]
Steps: 1%| | 6140/1000000 [15:43:26<2147:44:16, 7.78s/it, lr=1e-5, step_loss=0.0446][RANK-0]: Step: [6140], local_loss=0.12544165551662445, train_loss=0.04078377038240433, time_cost=4.4224607944488525
+
Steps: 1%| | 6140/1000000 [15:43:26<2147:44:16, 7.78s/it, lr=1e-5, step_loss=0.125]
Steps: 1%| | 6141/1000000 [15:43:31<1927:14:57, 6.98s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [6141], local_loss=0.024693535640835762, train_loss=0.026316668838262558, time_cost=1.8399622440338135
+
Steps: 1%| | 6141/1000000 [15:43:31<1927:14:57, 6.98s/it, lr=1e-5, step_loss=0.0247]
Steps: 1%| | 6142/1000000 [15:43:40<2109:05:43, 7.64s/it, lr=1e-5, step_loss=0.0247][RANK-0]: Step: [6142], local_loss=0.015646807849407196, train_loss=0.07641617953777313, time_cost=1.2324626445770264
+
Steps: 1%| | 6142/1000000 [15:43:40<2109:05:43, 7.64s/it, lr=1e-5, step_loss=0.0156]
Steps: 1%| | 6143/1000000 [15:43:53<2554:30:07, 9.25s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [6143], local_loss=0.019415883347392082, train_loss=0.14668746292591095, time_cost=1.2278008460998535
+
Steps: 1%| | 6143/1000000 [15:43:53<2554:30:07, 9.25s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%| | 6144/1000000 [15:44:03<2617:15:42, 9.48s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [6144], local_loss=0.021734915673732758, train_loss=0.03250053524971008, time_cost=1.2200968265533447
+
Steps: 1%| | 6144/1000000 [15:44:03<2617:15:42, 9.48s/it, lr=1e-5, step_loss=0.0217]
Steps: 1%| | 6145/1000000 [15:44:13<2664:55:11, 9.65s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [6145], local_loss=0.14160415530204773, train_loss=0.04068002104759216, time_cost=5.147562503814697
+
Steps: 1%| | 6145/1000000 [15:44:13<2664:55:11, 9.65s/it, lr=1e-5, step_loss=0.142]
Steps: 1%| | 6146/1000000 [15:44:19<2305:21:20, 8.35s/it, lr=1e-5, step_loss=0.142][RANK-0]: Step: [6146], local_loss=0.00857968907803297, train_loss=0.047443144023418427, time_cost=1.2254602909088135
+
Steps: 1%| | 6146/1000000 [15:44:19<2305:21:20, 8.35s/it, lr=1e-5, step_loss=0.00858]
Steps: 1%| | 6147/1000000 [15:44:34<2879:37:27, 10.43s/it, lr=1e-5, step_loss=0.00858][RANK-0]: Step: [6147], local_loss=0.038365550339221954, train_loss=0.22560861706733704, time_cost=12.607298374176025
+
Steps: 1%| | 6147/1000000 [15:44:34<2879:37:27, 10.43s/it, lr=1e-5, step_loss=0.0384]
Steps: 1%| | 6148/1000000 [15:44:39<2424:10:55, 8.78s/it, lr=1e-5, step_loss=0.0384][RANK-0]: Step: [6148], local_loss=0.022454991936683655, train_loss=0.14818604290485382, time_cost=2.036691665649414
+
Steps: 1%| | 6148/1000000 [15:44:39<2424:10:55, 8.78s/it, lr=1e-5, step_loss=0.0225]
Steps: 1%| | 6149/1000000 [15:44:46<2266:10:14, 8.21s/it, lr=1e-5, step_loss=0.0225][RANK-0]: Step: [6149], local_loss=0.017274515703320503, train_loss=0.03702088072896004, time_cost=2.7638237476348877
+
Steps: 1%| | 6149/1000000 [15:44:46<2266:10:14, 8.21s/it, lr=1e-5, step_loss=0.0173]
Steps: 1%| | 6150/1000000 [15:44:53<2166:48:47, 7.85s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [6150], local_loss=0.058449625968933105, train_loss=0.03602736443281174, time_cost=1.2315924167633057
+
Steps: 1%| | 6150/1000000 [15:44:53<2166:48:47, 7.85s/it, lr=1e-5, step_loss=0.0584]
Steps: 1%| | 6151/1000000 [15:44:58<1993:26:01, 7.22s/it, lr=1e-5, step_loss=0.0584][RANK-0]: Step: [6151], local_loss=0.015176905319094658, train_loss=0.03481154888868332, time_cost=1.2241578102111816
+
Steps: 1%| | 6151/1000000 [15:44:58<1993:26:01, 7.22s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%| | 6152/1000000 [15:45:06<1986:17:04, 7.19s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [6152], local_loss=0.04856012761592865, train_loss=0.17374220490455627, time_cost=1.672072172164917
+
Steps: 1%| | 6152/1000000 [15:45:06<1986:17:04, 7.19s/it, lr=1e-5, step_loss=0.0486]
Steps: 1%| | 6153/1000000 [15:45:14<2123:07:26, 7.69s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [6153], local_loss=0.01388128288090229, train_loss=0.019581789150834084, time_cost=3.034839153289795
+
Steps: 1%| | 6153/1000000 [15:45:14<2123:07:26, 7.69s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%| | 6154/1000000 [15:45:28<2626:27:27, 9.51s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [6154], local_loss=0.016598956659436226, train_loss=4.848939895629883, time_cost=1.231287956237793
+
Steps: 1%| | 6154/1000000 [15:45:28<2626:27:27, 9.51s/it, lr=1e-5, step_loss=0.0166]
Steps: 1%| | 6155/1000000 [15:45:33<2246:22:50, 8.14s/it, lr=1e-5, step_loss=0.0166][RANK-0]: Step: [6155], local_loss=0.012921761721372604, train_loss=0.021770263090729713, time_cost=2.001669406890869
+
Steps: 1%| | 6155/1000000 [15:45:33<2246:22:50, 8.14s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%| | 6156/1000000 [15:45:46<2616:17:24, 9.48s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [6156], local_loss=0.01458597369492054, train_loss=0.04768610745668411, time_cost=2.680436372756958
+
Steps: 1%| | 6156/1000000 [15:45:46<2616:17:24, 9.48s/it, lr=1e-5, step_loss=0.0146]
Steps: 1%| | 6157/1000000 [15:45:57<2747:08:26, 9.95s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [6157], local_loss=0.012239329516887665, train_loss=0.020446617156267166, time_cost=7.326512098312378
+
Steps: 1%| | 6157/1000000 [15:45:57<2747:08:26, 9.95s/it, lr=1e-5, step_loss=0.0122]
Steps: 1%| | 6158/1000000 [15:46:04<2535:17:17, 9.18s/it, lr=1e-5, step_loss=0.0122][RANK-0]: Step: [6158], local_loss=0.06982512772083282, train_loss=0.04342515021562576, time_cost=3.148300886154175
+
Steps: 1%| | 6158/1000000 [15:46:04<2535:17:17, 9.18s/it, lr=1e-5, step_loss=0.0698]
Steps: 1%| | 6159/1000000 [15:46:17<2870:17:23, 10.40s/it, lr=1e-5, step_loss=0.0698][RANK-0]: Step: [6159], local_loss=0.010519972071051598, train_loss=0.022662080824375153, time_cost=2.691995143890381
+
Steps: 1%| | 6159/1000000 [15:46:17<2870:17:23, 10.40s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%| | 6160/1000000 [15:46:28<2893:52:21, 10.48s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [6160], local_loss=0.33507415652275085, train_loss=0.08922895789146423, time_cost=3.071800947189331
+
Steps: 1%| | 6160/1000000 [15:46:28<2893:52:21, 10.48s/it, lr=1e-5, step_loss=0.335]
Steps: 1%| | 6161/1000000 [15:46:40<3017:27:53, 10.93s/it, lr=1e-5, step_loss=0.335][RANK-0]: Step: [6161], local_loss=0.01699107512831688, train_loss=0.1487632393836975, time_cost=5.096943378448486
+
Steps: 1%| | 6161/1000000 [15:46:40<3017:27:53, 10.93s/it, lr=1e-5, step_loss=0.017]
Steps: 1%| | 6162/1000000 [15:46:52<3062:08:50, 11.09s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [6162], local_loss=0.013385158963501453, train_loss=0.0875808447599411, time_cost=1.22328782081604
+
Steps: 1%| | 6162/1000000 [15:46:52<3062:08:50, 11.09s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%| | 6163/1000000 [15:46:58<2713:31:06, 9.83s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [6163], local_loss=0.26641732454299927, train_loss=0.09116871654987335, time_cost=2.0995748043060303
+
Steps: 1%| | 6163/1000000 [15:46:58<2713:31:06, 9.83s/it, lr=1e-5, step_loss=0.266]
Steps: 1%| | 6164/1000000 [15:47:15<3262:15:25, 11.82s/it, lr=1e-5, step_loss=0.266][RANK-0]: Step: [6164], local_loss=0.2181406319141388, train_loss=0.06217989698052406, time_cost=6.454950332641602
+
Steps: 1%| | 6164/1000000 [15:47:15<3262:15:25, 11.82s/it, lr=1e-5, step_loss=0.218]
Steps: 1%| | 6165/1000000 [15:47:24<3011:58:21, 10.91s/it, lr=1e-5, step_loss=0.218][RANK-0]: Step: [6165], local_loss=0.12608866393566132, train_loss=0.03570377826690674, time_cost=2.7214837074279785
+
Steps: 1%| | 6165/1000000 [15:47:24<3011:58:21, 10.91s/it, lr=1e-5, step_loss=0.126]
Steps: 1%| | 6166/1000000 [15:47:29<2530:32:04, 9.17s/it, lr=1e-5, step_loss=0.126][RANK-0]: Step: [6166], local_loss=0.046579714864492416, train_loss=0.029649341478943825, time_cost=2.375042200088501
+
Steps: 1%| | 6166/1000000 [15:47:29<2530:32:04, 9.17s/it, lr=1e-5, step_loss=0.0466]
Steps: 1%| | 6167/1000000 [15:47:34<2193:33:16, 7.95s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [6167], local_loss=0.18022935092449188, train_loss=0.05583313852548599, time_cost=2.6278908252716064
+
Steps: 1%| | 6167/1000000 [15:47:34<2193:33:16, 7.95s/it, lr=1e-5, step_loss=0.18]
Steps: 1%| | 6168/1000000 [15:47:44<2335:53:13, 8.46s/it, lr=1e-5, step_loss=0.18][RANK-0]: Step: [6168], local_loss=0.03779250755906105, train_loss=0.023258540779352188, time_cost=1.2401790618896484
+
Steps: 1%| | 6168/1000000 [15:47:44<2335:53:13, 8.46s/it, lr=1e-5, step_loss=0.0378]
Steps: 1%| | 6169/1000000 [15:47:54<2498:57:21, 9.05s/it, lr=1e-5, step_loss=0.0378][RANK-0]: Step: [6169], local_loss=0.012625946663320065, train_loss=0.056998297572135925, time_cost=1.8746397495269775
+
Steps: 1%| | 6169/1000000 [15:47:54<2498:57:21, 9.05s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%| | 6170/1000000 [15:48:03<2507:25:52, 9.08s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [6170], local_loss=0.04337817430496216, train_loss=0.05900529399514198, time_cost=3.077728271484375
+
Steps: 1%| | 6170/1000000 [15:48:03<2507:25:52, 9.08s/it, lr=1e-5, step_loss=0.0434]
Steps: 1%| | 6171/1000000 [15:48:12<2522:05:11, 9.14s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [6171], local_loss=0.0346345528960228, train_loss=0.03187655657529831, time_cost=2.16430926322937
+
Steps: 1%| | 6171/1000000 [15:48:12<2522:05:11, 9.14s/it, lr=1e-5, step_loss=0.0346]
Steps: 1%| | 6172/1000000 [15:48:24<2689:31:00, 9.74s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [6172], local_loss=0.009512961842119694, train_loss=0.0625937283039093, time_cost=2.9106462001800537
+
Steps: 1%| | 6172/1000000 [15:48:24<2689:31:00, 9.74s/it, lr=1e-5, step_loss=0.00951]
Steps: 1%| | 6173/1000000 [15:48:32<2605:12:13, 9.44s/it, lr=1e-5, step_loss=0.00951][RANK-0]: Step: [6173], local_loss=0.011862044222652912, train_loss=0.024176478385925293, time_cost=3.049394130706787
+
Steps: 1%| | 6173/1000000 [15:48:32<2605:12:13, 9.44s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 6174/1000000 [15:48:40<2451:20:42, 8.88s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [6174], local_loss=0.023114824667572975, train_loss=0.0891295075416565, time_cost=1.3604645729064941
+
Steps: 1%| | 6174/1000000 [15:48:40<2451:20:42, 8.88s/it, lr=1e-5, step_loss=0.0231]
Steps: 1%| | 6175/1000000 [15:48:46<2230:33:09, 8.08s/it, lr=1e-5, step_loss=0.0231][RANK-0]: Step: [6175], local_loss=0.4955768883228302, train_loss=0.08251233398914337, time_cost=1.8023574352264404
+
Steps: 1%| | 6175/1000000 [15:48:46<2230:33:09, 8.08s/it, lr=1e-5, step_loss=0.496]
Steps: 1%| | 6176/1000000 [15:49:00<2677:59:34, 9.70s/it, lr=1e-5, step_loss=0.496][RANK-0]: Step: [6176], local_loss=0.053733941167593, train_loss=0.04318118840456009, time_cost=3.851078510284424
+
Steps: 1%| | 6176/1000000 [15:49:00<2677:59:34, 9.70s/it, lr=1e-5, step_loss=0.0537]
Steps: 1%| | 6177/1000000 [15:49:06<2425:25:35, 8.79s/it, lr=1e-5, step_loss=0.0537][RANK-0]: Step: [6177], local_loss=0.01596468687057495, train_loss=0.02837979793548584, time_cost=4.824133634567261
+
Steps: 1%| | 6177/1000000 [15:49:06<2425:25:35, 8.79s/it, lr=1e-5, step_loss=0.016]
Steps: 1%| | 6178/1000000 [15:49:11<2106:35:17, 7.63s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [6178], local_loss=0.039978884160518646, train_loss=0.032782480120658875, time_cost=1.6500751972198486
+
Steps: 1%| | 6178/1000000 [15:49:11<2106:35:17, 7.63s/it, lr=1e-5, step_loss=0.04]
Steps: 1%| | 6179/1000000 [15:49:23<2434:45:26, 8.82s/it, lr=1e-5, step_loss=0.04][RANK-0]: Step: [6179], local_loss=0.022780992090702057, train_loss=0.026396552100777626, time_cost=2.610846757888794
+
Steps: 1%| | 6179/1000000 [15:49:23<2434:45:26, 8.82s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%| | 6180/1000000 [15:49:37<2866:07:07, 10.38s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [6180], local_loss=0.012775368988513947, train_loss=0.021984003484249115, time_cost=9.441588878631592
+
Steps: 1%| | 6180/1000000 [15:49:37<2866:07:07, 10.38s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%| | 6181/1000000 [15:49:51<3186:56:25, 11.54s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [6181], local_loss=0.04040798172354698, train_loss=0.03468549996614456, time_cost=1.2888567447662354
+
Steps: 1%| | 6181/1000000 [15:49:51<3186:56:25, 11.54s/it, lr=1e-5, step_loss=0.0404]
Steps: 1%| | 6182/1000000 [15:49:56<2672:38:29, 9.68s/it, lr=1e-5, step_loss=0.0404][RANK-0]: Step: [6182], local_loss=0.022449180483818054, train_loss=0.06625616550445557, time_cost=2.965038776397705
+
Steps: 1%| | 6182/1000000 [15:49:56<2672:38:29, 9.68s/it, lr=1e-5, step_loss=0.0224]
Steps: 1%| | 6183/1000000 [15:50:06<2676:58:18, 9.70s/it, lr=1e-5, step_loss=0.0224][RANK-0]: Step: [6183], local_loss=0.05869313329458237, train_loss=0.02187630906701088, time_cost=4.168205261230469
+
Steps: 1%| | 6183/1000000 [15:50:06<2676:58:18, 9.70s/it, lr=1e-5, step_loss=0.0587]
Steps: 1%| | 6184/1000000 [15:50:20<3047:48:57, 11.04s/it, lr=1e-5, step_loss=0.0587][RANK-0]: Step: [6184], local_loss=0.0565858818590641, train_loss=0.07121175527572632, time_cost=5.511945486068726
+
Steps: 1%| | 6184/1000000 [15:50:20<3047:48:57, 11.04s/it, lr=1e-5, step_loss=0.0566]
Steps: 1%| | 6185/1000000 [15:50:25<2498:30:48, 9.05s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [6185], local_loss=0.017576737329363823, train_loss=0.02196703478693962, time_cost=1.4616689682006836
+
Steps: 1%| | 6185/1000000 [15:50:25<2498:30:48, 9.05s/it, lr=1e-5, step_loss=0.0176]
Steps: 1%| | 6186/1000000 [15:50:39<2974:34:54, 10.78s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [6186], local_loss=0.00934350211173296, train_loss=0.05179128795862198, time_cost=5.385273456573486
+
Steps: 1%| | 6186/1000000 [15:50:39<2974:34:54, 10.78s/it, lr=1e-5, step_loss=0.00934]
Steps: 1%| | 6187/1000000 [15:50:45<2519:12:10, 9.13s/it, lr=1e-5, step_loss=0.00934][RANK-0]: Step: [6187], local_loss=0.006747216917574406, train_loss=0.04777418076992035, time_cost=1.237123727798462
+
Steps: 1%| | 6187/1000000 [15:50:45<2519:12:10, 9.13s/it, lr=1e-5, step_loss=0.00675]
Steps: 1%| | 6188/1000000 [15:50:55<2646:13:02, 9.59s/it, lr=1e-5, step_loss=0.00675][RANK-0]: Step: [6188], local_loss=0.01369841955602169, train_loss=0.023771196603775024, time_cost=7.948832988739014
+
Steps: 1%| | 6188/1000000 [15:50:55<2646:13:02, 9.59s/it, lr=1e-5, step_loss=0.0137]
Steps: 1%| | 6189/1000000 [15:51:07<2834:15:51, 10.27s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [6189], local_loss=0.012545224279165268, train_loss=0.039702676236629486, time_cost=2.74727201461792
+
Steps: 1%| | 6189/1000000 [15:51:07<2834:15:51, 10.27s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 6190/1000000 [15:51:13<2484:41:30, 9.00s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [6190], local_loss=0.04016604274511337, train_loss=0.05448853224515915, time_cost=1.4624459743499756
+
Steps: 1%| | 6190/1000000 [15:51:13<2484:41:30, 9.00s/it, lr=1e-5, step_loss=0.0402]
Steps: 1%| | 6191/1000000 [15:51:20<2295:46:42, 8.32s/it, lr=1e-5, step_loss=0.0402][RANK-0]: Step: [6191], local_loss=0.01091768592596054, train_loss=0.0641138106584549, time_cost=1.9416465759277344
+
Steps: 1%| | 6191/1000000 [15:51:20<2295:46:42, 8.32s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%| | 6192/1000000 [15:51:26<2087:29:25, 7.56s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [6192], local_loss=0.016958137974143028, train_loss=0.03768542781472206, time_cost=3.464940309524536
+
Steps: 1%| | 6192/1000000 [15:51:26<2087:29:25, 7.56s/it, lr=1e-5, step_loss=0.017]
Steps: 1%| | 6193/1000000 [15:51:31<1899:23:20, 6.88s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [6193], local_loss=0.038134507834911346, train_loss=0.08295729011297226, time_cost=2.667393207550049
+
Steps: 1%| | 6193/1000000 [15:51:31<1899:23:20, 6.88s/it, lr=1e-5, step_loss=0.0381]
Steps: 1%| | 6194/1000000 [15:51:40<2093:34:31, 7.58s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [6194], local_loss=0.022815432399511337, train_loss=0.024560410529375076, time_cost=1.573153018951416
+
Steps: 1%| | 6194/1000000 [15:51:40<2093:34:31, 7.58s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%| | 6195/1000000 [15:51:49<2158:42:51, 7.82s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [6195], local_loss=0.011566468514502048, train_loss=0.029482346028089523, time_cost=1.9340605735778809
+
Steps: 1%| | 6195/1000000 [15:51:49<2158:42:51, 7.82s/it, lr=1e-5, step_loss=0.0116]
Steps: 1%| | 6196/1000000 [15:52:01<2537:08:24, 9.19s/it, lr=1e-5, step_loss=0.0116][RANK-0]: Step: [6196], local_loss=0.02127796970307827, train_loss=0.0296311117708683, time_cost=6.158169507980347
+
Steps: 1%| | 6196/1000000 [15:52:01<2537:08:24, 9.19s/it, lr=1e-5, step_loss=0.0213]
Steps: 1%| | 6197/1000000 [15:52:06<2186:55:40, 7.92s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [6197], local_loss=0.021323490887880325, train_loss=0.020923476666212082, time_cost=1.9876971244812012
+
Steps: 1%| | 6197/1000000 [15:52:06<2186:55:40, 7.92s/it, lr=1e-5, step_loss=0.0213]
Steps: 1%| | 6198/1000000 [15:52:11<1974:20:34, 7.15s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [6198], local_loss=0.02386476658284664, train_loss=0.019154954701662064, time_cost=2.467580556869507
+
Steps: 1%| | 6198/1000000 [15:52:11<1974:20:34, 7.15s/it, lr=1e-5, step_loss=0.0239]
Steps: 1%| | 6199/1000000 [15:52:22<2236:37:43, 8.10s/it, lr=1e-5, step_loss=0.0239][RANK-0]: Step: [6199], local_loss=0.01319881621748209, train_loss=0.1893337070941925, time_cost=1.2373254299163818
+
Steps: 1%| | 6199/1000000 [15:52:22<2236:37:43, 8.10s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%| | 6200/1000000 [15:52:28<2098:22:54, 7.60s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [6200], local_loss=0.00971345417201519, train_loss=0.04448460042476654, time_cost=2.230799913406372
+
Steps: 1%| | 6200/1000000 [15:52:28<2098:22:54, 7.60s/it, lr=1e-5, step_loss=0.00971]
Steps: 1%| | 6201/1000000 [15:52:40<2421:48:52, 8.77s/it, lr=1e-5, step_loss=0.00971][RANK-0]: Step: [6201], local_loss=0.028423413634300232, train_loss=0.047323077917099, time_cost=1.9875211715698242
+
Steps: 1%| | 6201/1000000 [15:52:40<2421:48:52, 8.77s/it, lr=1e-5, step_loss=0.0284]
Steps: 1%| | 6202/1000000 [15:52:47<2334:53:12, 8.46s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [6202], local_loss=0.011011859402060509, train_loss=0.15956124663352966, time_cost=1.2173235416412354
+
Steps: 1%| | 6202/1000000 [15:52:47<2334:53:12, 8.46s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 6203/1000000 [15:52:57<2441:01:35, 8.84s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [6203], local_loss=0.03139955550432205, train_loss=0.08122961968183517, time_cost=1.7100319862365723
+
Steps: 1%| | 6203/1000000 [15:52:57<2441:01:35, 8.84s/it, lr=1e-5, step_loss=0.0314]
Steps: 1%| | 6204/1000000 [15:53:10<2754:49:39, 9.98s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [6204], local_loss=1.0031028985977173, train_loss=0.15371690690517426, time_cost=5.087621450424194
+
Steps: 1%| | 6204/1000000 [15:53:10<2754:49:39, 9.98s/it, lr=1e-5, step_loss=1]
Steps: 1%| | 6205/1000000 [15:53:16<2410:01:07, 8.73s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [6205], local_loss=0.009962737560272217, train_loss=0.024319030344486237, time_cost=1.6928253173828125
+
Steps: 1%| | 6205/1000000 [15:53:16<2410:01:07, 8.73s/it, lr=1e-5, step_loss=0.00996]
Steps: 1%| | 6206/1000000 [15:53:23<2304:03:03, 8.35s/it, lr=1e-5, step_loss=0.00996][RANK-0]: Step: [6206], local_loss=0.01889915205538273, train_loss=0.17132064700126648, time_cost=5.032391548156738
+
Steps: 1%| | 6206/1000000 [15:53:23<2304:03:03, 8.35s/it, lr=1e-5, step_loss=0.0189]
Steps: 1%| | 6207/1000000 [15:53:28<1998:15:53, 7.24s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [6207], local_loss=0.03397872671484947, train_loss=0.04341709613800049, time_cost=1.8691165447235107
+
Steps: 1%| | 6207/1000000 [15:53:28<1998:15:53, 7.24s/it, lr=1e-5, step_loss=0.034]
Steps: 1%| | 6208/1000000 [15:53:42<2556:15:54, 9.26s/it, lr=1e-5, step_loss=0.034][RANK-0]: Step: [6208], local_loss=0.0277768075466156, train_loss=0.0938553735613823, time_cost=3.504389762878418
+
Steps: 1%| | 6208/1000000 [15:53:42<2556:15:54, 9.26s/it, lr=1e-5, step_loss=0.0278]
Steps: 1%| | 6209/1000000 [15:53:52<2638:25:11, 9.56s/it, lr=1e-5, step_loss=0.0278][RANK-0]: Step: [6209], local_loss=0.011782122775912285, train_loss=0.017390664666891098, time_cost=4.07059645652771
+
Steps: 1%| | 6209/1000000 [15:53:52<2638:25:11, 9.56s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 6210/1000000 [15:54:03<2756:32:36, 9.99s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [6210], local_loss=0.0136983972042799, train_loss=0.041786760091781616, time_cost=6.994438886642456
+
Steps: 1%| | 6210/1000000 [15:54:03<2756:32:36, 9.99s/it, lr=1e-5, step_loss=0.0137]
Steps: 1%| | 6211/1000000 [15:54:08<2386:41:09, 8.65s/it, lr=1e-5, step_loss=0.0137][RANK-0]: Step: [6211], local_loss=0.062180209904909134, train_loss=0.02917421981692314, time_cost=2.727292537689209
+
Steps: 1%| | 6211/1000000 [15:54:08<2386:41:09, 8.65s/it, lr=1e-5, step_loss=0.0622]
Steps: 1%| | 6212/1000000 [15:54:18<2425:38:39, 8.79s/it, lr=1e-5, step_loss=0.0622][RANK-0]: Step: [6212], local_loss=0.07391157746315002, train_loss=0.051238249987363815, time_cost=1.9272961616516113
+
Steps: 1%| | 6212/1000000 [15:54:18<2425:38:39, 8.79s/it, lr=1e-5, step_loss=0.0739]
Steps: 1%| | 6213/1000000 [15:54:29<2640:48:46, 9.57s/it, lr=1e-5, step_loss=0.0739][RANK-0]: Step: [6213], local_loss=0.01331180240958929, train_loss=0.020455438643693924, time_cost=3.812493324279785
+
Steps: 1%| | 6213/1000000 [15:54:29<2640:48:46, 9.57s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%| | 6214/1000000 [15:54:36<2459:05:32, 8.91s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [6214], local_loss=0.012646307237446308, train_loss=0.04281226173043251, time_cost=2.7924697399139404
+
Steps: 1%| | 6214/1000000 [15:54:36<2459:05:32, 8.91s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%| | 6215/1000000 [15:54:44<2331:18:08, 8.45s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [6215], local_loss=0.046600669622421265, train_loss=0.19826430082321167, time_cost=1.4985315799713135
+
Steps: 1%| | 6215/1000000 [15:54:44<2331:18:08, 8.45s/it, lr=1e-5, step_loss=0.0466]
Steps: 1%| | 6216/1000000 [15:54:55<2570:32:59, 9.31s/it, lr=1e-5, step_loss=0.0466][RANK-0]: Step: [6216], local_loss=0.04029450565576553, train_loss=0.03155767545104027, time_cost=2.0299479961395264
+
Steps: 1%| | 6216/1000000 [15:54:55<2570:32:59, 9.31s/it, lr=1e-5, step_loss=0.0403]
Steps: 1%| | 6217/1000000 [15:55:00<2226:22:39, 8.07s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [6217], local_loss=0.029992109164595604, train_loss=0.052976444363594055, time_cost=2.3439271450042725
+
Steps: 1%| | 6217/1000000 [15:55:00<2226:22:39, 8.07s/it, lr=1e-5, step_loss=0.03]
Steps: 1%| | 6218/1000000 [15:55:06<2058:21:16, 7.46s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [6218], local_loss=0.09705892205238342, train_loss=0.036034539341926575, time_cost=1.3751401901245117
+
Steps: 1%| | 6218/1000000 [15:55:06<2058:21:16, 7.46s/it, lr=1e-5, step_loss=0.0971]
Steps: 1%| | 6219/1000000 [15:55:19<2538:54:54, 9.20s/it, lr=1e-5, step_loss=0.0971][RANK-0]: Step: [6219], local_loss=0.04590596258640289, train_loss=0.19398066401481628, time_cost=1.326059103012085
+
Steps: 1%| | 6219/1000000 [15:55:19<2538:54:54, 9.20s/it, lr=1e-5, step_loss=0.0459]
Steps: 1%| | 6220/1000000 [15:55:24<2140:02:15, 7.75s/it, lr=1e-5, step_loss=0.0459][RANK-0]: Step: [6220], local_loss=0.023970454931259155, train_loss=0.08401672542095184, time_cost=1.373429775238037
+
Steps: 1%| | 6220/1000000 [15:55:24<2140:02:15, 7.75s/it, lr=1e-5, step_loss=0.024]
Steps: 1%| | 6221/1000000 [15:55:28<1827:41:37, 6.62s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [6221], local_loss=0.034900564700365067, train_loss=0.03705771267414093, time_cost=1.3562421798706055
+
Steps: 1%| | 6221/1000000 [15:55:28<1827:41:37, 6.62s/it, lr=1e-5, step_loss=0.0349]
Steps: 1%| | 6222/1000000 [15:55:38<2097:34:25, 7.60s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [6222], local_loss=0.10496199131011963, train_loss=0.04265334829688072, time_cost=3.9623188972473145
+
Steps: 1%| | 6222/1000000 [15:55:38<2097:34:25, 7.60s/it, lr=1e-5, step_loss=0.105]
Steps: 1%| | 6223/1000000 [15:55:53<2702:03:39, 9.79s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [6223], local_loss=0.04863257706165314, train_loss=0.04841259494423866, time_cost=1.3340411186218262
+
Steps: 1%| | 6223/1000000 [15:55:53<2702:03:39, 9.79s/it, lr=1e-5, step_loss=0.0486]
Steps: 1%| | 6224/1000000 [15:55:58<2314:12:02, 8.38s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [6224], local_loss=0.018741386011242867, train_loss=0.027157988399267197, time_cost=3.825453996658325
+
Steps: 1%| | 6224/1000000 [15:55:58<2314:12:02, 8.38s/it, lr=1e-5, step_loss=0.0187]
Steps: 1%| | 6225/1000000 [15:56:06<2308:06:44, 8.36s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [6225], local_loss=0.05391906574368477, train_loss=0.06317730993032455, time_cost=1.386760950088501
+
Steps: 1%| | 6225/1000000 [15:56:06<2308:06:44, 8.36s/it, lr=1e-5, step_loss=0.0539]
Steps: 1%| | 6226/1000000 [15:56:11<2043:51:46, 7.40s/it, lr=1e-5, step_loss=0.0539][RANK-0]: Step: [6226], local_loss=0.1870332807302475, train_loss=0.0801096260547638, time_cost=1.2738254070281982
+
Steps: 1%| | 6226/1000000 [15:56:11<2043:51:46, 7.40s/it, lr=1e-5, step_loss=0.187]
Steps: 1%| | 6227/1000000 [15:56:17<1906:25:35, 6.91s/it, lr=1e-5, step_loss=0.187][RANK-0]: Step: [6227], local_loss=0.01076207309961319, train_loss=0.035196010023355484, time_cost=1.3396031856536865
+
Steps: 1%| | 6227/1000000 [15:56:17<1906:25:35, 6.91s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%| | 6228/1000000 [15:56:28<2225:35:21, 8.06s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [6228], local_loss=0.05090948939323425, train_loss=0.03253880888223648, time_cost=1.214775562286377
+
Steps: 1%| | 6228/1000000 [15:56:28<2225:35:21, 8.06s/it, lr=1e-5, step_loss=0.0509]
Steps: 1%| | 6229/1000000 [15:56:34<2106:00:43, 7.63s/it, lr=1e-5, step_loss=0.0509][RANK-0]: Step: [6229], local_loss=0.024059828370809555, train_loss=0.08889684081077576, time_cost=1.7390244007110596
+
Steps: 1%| | 6229/1000000 [15:56:34<2106:00:43, 7.63s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%| | 6230/1000000 [15:56:50<2782:38:35, 10.08s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [6230], local_loss=0.24712730944156647, train_loss=0.07570235431194305, time_cost=7.597960710525513
+
Steps: 1%| | 6230/1000000 [15:56:50<2782:38:35, 10.08s/it, lr=1e-5, step_loss=0.247]
Steps: 1%| | 6231/1000000 [15:57:01<2820:47:31, 10.22s/it, lr=1e-5, step_loss=0.247][RANK-0]: Step: [6231], local_loss=0.02288230136036873, train_loss=0.18447411060333252, time_cost=7.76559591293335
+
Steps: 1%| | 6231/1000000 [15:57:01<2820:47:31, 10.22s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%| | 6232/1000000 [15:57:10<2749:08:25, 9.96s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [6232], local_loss=0.013483189046382904, train_loss=0.02375413104891777, time_cost=1.598604440689087
+
Steps: 1%| | 6232/1000000 [15:57:10<2749:08:25, 9.96s/it, lr=1e-5, step_loss=0.0135]
Steps: 1%| | 6233/1000000 [15:57:21<2862:29:37, 10.37s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [6233], local_loss=0.012927422299981117, train_loss=0.026535578072071075, time_cost=2.166616201400757
+
Steps: 1%| | 6233/1000000 [15:57:21<2862:29:37, 10.37s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%| | 6234/1000000 [15:57:32<2849:41:43, 10.32s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [6234], local_loss=0.03939299285411835, train_loss=0.032396771013736725, time_cost=2.2630584239959717
+
Steps: 1%| | 6234/1000000 [15:57:32<2849:41:43, 10.32s/it, lr=1e-5, step_loss=0.0394]
Steps: 1%| | 6235/1000000 [15:57:42<2879:56:58, 10.43s/it, lr=1e-5, step_loss=0.0394][RANK-0]: Step: [6235], local_loss=0.03287767618894577, train_loss=0.071529820561409, time_cost=1.2222871780395508
+
Steps: 1%| | 6235/1000000 [15:57:42<2879:56:58, 10.43s/it, lr=1e-5, step_loss=0.0329]
Steps: 1%| | 6236/1000000 [15:57:49<2547:57:09, 9.23s/it, lr=1e-5, step_loss=0.0329][RANK-0]: Step: [6236], local_loss=0.031270045787096024, train_loss=0.07133100926876068, time_cost=1.2053911685943604
+
Steps: 1%| | 6236/1000000 [15:57:49<2547:57:09, 9.23s/it, lr=1e-5, step_loss=0.0313]
Steps: 1%| | 6237/1000000 [15:57:56<2358:13:41, 8.54s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [6237], local_loss=0.4234057366847992, train_loss=0.08152356743812561, time_cost=2.5378692150115967
+
Steps: 1%| | 6237/1000000 [15:57:56<2358:13:41, 8.54s/it, lr=1e-5, step_loss=0.423]
Steps: 1%| | 6238/1000000 [15:58:01<2067:48:30, 7.49s/it, lr=1e-5, step_loss=0.423][RANK-0]: Step: [6238], local_loss=0.0069152722135186195, train_loss=0.02906803786754608, time_cost=1.5778822898864746
+
Steps: 1%| | 6238/1000000 [15:58:01<2067:48:30, 7.49s/it, lr=1e-5, step_loss=0.00692]
Steps: 1%| | 6239/1000000 [15:58:05<1787:23:57, 6.48s/it, lr=1e-5, step_loss=0.00692][RANK-0]: Step: [6239], local_loss=0.010604925453662872, train_loss=0.1570291668176651, time_cost=1.235393762588501
+
Steps: 1%| | 6239/1000000 [15:58:05<1787:23:57, 6.48s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%| | 6240/1000000 [15:58:10<1668:04:44, 6.04s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [6240], local_loss=0.015110666863620281, train_loss=0.03015449084341526, time_cost=1.2291481494903564
+
Steps: 1%| | 6240/1000000 [15:58:10<1668:04:44, 6.04s/it, lr=1e-5, step_loss=0.0151]
Steps: 1%| | 6241/1000000 [15:58:18<1884:45:06, 6.83s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [6241], local_loss=0.9875564575195312, train_loss=0.23957929015159607, time_cost=2.9055593013763428
+
Steps: 1%| | 6241/1000000 [15:58:18<1884:45:06, 6.83s/it, lr=1e-5, step_loss=0.988]
Steps: 1%| | 6242/1000000 [15:58:23<1728:43:18, 6.26s/it, lr=1e-5, step_loss=0.988][RANK-0]: Step: [6242], local_loss=0.010141566395759583, train_loss=0.019185349345207214, time_cost=1.2283880710601807
+
Steps: 1%| | 6242/1000000 [15:58:23<1728:43:18, 6.26s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 6243/1000000 [15:58:33<1970:35:40, 7.14s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [6243], local_loss=0.015392590314149857, train_loss=0.030655845999717712, time_cost=1.2397375106811523
+
Steps: 1%| | 6243/1000000 [15:58:33<1970:35:40, 7.14s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%| | 6244/1000000 [15:58:38<1847:58:10, 6.69s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [6244], local_loss=0.01716836355626583, train_loss=0.043523065745830536, time_cost=1.7942912578582764
+
Steps: 1%| | 6244/1000000 [15:58:38<1847:58:10, 6.69s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%| | 6245/1000000 [15:58:47<2039:01:25, 7.39s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [6245], local_loss=0.04862258955836296, train_loss=0.051498010754585266, time_cost=1.2302987575531006
+
Steps: 1%| | 6245/1000000 [15:58:47<2039:01:25, 7.39s/it, lr=1e-5, step_loss=0.0486]
Steps: 1%| | 6246/1000000 [15:58:59<2375:55:36, 8.61s/it, lr=1e-5, step_loss=0.0486][RANK-0]: Step: [6246], local_loss=0.008627263829112053, train_loss=0.14417746663093567, time_cost=2.0679900646209717
+
Steps: 1%| | 6246/1000000 [15:58:59<2375:55:36, 8.61s/it, lr=1e-5, step_loss=0.00863]
Steps: 1%| | 6247/1000000 [15:59:13<2889:51:55, 10.47s/it, lr=1e-5, step_loss=0.00863][RANK-0]: Step: [6247], local_loss=0.014404816552996635, train_loss=0.022714998573064804, time_cost=1.274975299835205
+
Steps: 1%| | 6247/1000000 [15:59:13<2889:51:55, 10.47s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 6248/1000000 [15:59:19<2494:24:29, 9.04s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [6248], local_loss=0.011844482272863388, train_loss=0.025137636810541153, time_cost=3.0154342651367188
[... steps 6249-6469 follow the same format: lr=1e-5 throughout, roughly 6-13 s/it; local_loss mostly in the 0.002-0.42 range with isolated spikes at steps 6260 (0.97), 6382 (43.3), and 6467 (137); train_loss shows intermittent outliers, e.g. 14.3 (step 6264), 54.9 (6354), 9.1 (6435), 28.9 (6444), and 17.2 (6467) ...]
Steps: 1%| | 6470/1000000 [16:33:02<2508:15:09, 9.09s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [6470], local_loss=0.006184256169945002, train_loss=0.031860966235399246, time_cost=1.872506856918335
+
Steps: 1%| | 6470/1000000 [16:33:02<2508:15:09, 9.09s/it, lr=1e-5, step_loss=0.00618]
Steps: 1%| | 6471/1000000 [16:33:14<2702:59:15, 9.79s/it, lr=1e-5, step_loss=0.00618][RANK-0]: Step: [6471], local_loss=0.017374621704220772, train_loss=0.03497755527496338, time_cost=3.9540700912475586
+
Steps: 1%| | 6471/1000000 [16:33:14<2702:59:15, 9.79s/it, lr=1e-5, step_loss=0.0174]
Steps: 1%| | 6472/1000000 [16:33:28<3056:46:58, 11.08s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [6472], local_loss=0.034588705748319626, train_loss=0.061099134385585785, time_cost=4.625816583633423
+
Steps: 1%| | 6472/1000000 [16:33:28<3056:46:58, 11.08s/it, lr=1e-5, step_loss=0.0346]
Steps: 1%| | 6473/1000000 [16:33:40<3139:38:38, 11.38s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [6473], local_loss=0.042367447167634964, train_loss=0.02840398997068405, time_cost=1.815814733505249
+
Steps: 1%| | 6473/1000000 [16:33:40<3139:38:38, 11.38s/it, lr=1e-5, step_loss=0.0424]
Steps: 1%| | 6474/1000000 [16:33:50<3003:25:58, 10.88s/it, lr=1e-5, step_loss=0.0424][RANK-0]: Step: [6474], local_loss=0.045808833092451096, train_loss=0.03678906708955765, time_cost=3.11662220954895
+
Steps: 1%| | 6474/1000000 [16:33:50<3003:25:58, 10.88s/it, lr=1e-5, step_loss=0.0458]
Steps: 1%| | 6475/1000000 [16:34:05<3360:46:33, 12.18s/it, lr=1e-5, step_loss=0.0458][RANK-0]: Step: [6475], local_loss=0.008079162798821926, train_loss=0.1555635631084442, time_cost=11.872567892074585
+
Steps: 1%| | 6475/1000000 [16:34:05<3360:46:33, 12.18s/it, lr=1e-5, step_loss=0.00808]
Steps: 1%| | 6476/1000000 [16:34:16<3296:20:54, 11.94s/it, lr=1e-5, step_loss=0.00808][RANK-0]: Step: [6476], local_loss=0.0635259672999382, train_loss=0.02230791375041008, time_cost=1.902470350265503
+
Steps: 1%| | 6476/1000000 [16:34:16<3296:20:54, 11.94s/it, lr=1e-5, step_loss=0.0635]
Steps: 1%| | 6477/1000000 [16:34:24<2946:17:29, 10.68s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [6477], local_loss=0.011747825890779495, train_loss=34.3613166809082, time_cost=1.8309192657470703
+
Steps: 1%| | 6477/1000000 [16:34:24<2946:17:29, 10.68s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%| | 6478/1000000 [16:34:30<2562:02:51, 9.28s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [6478], local_loss=0.0465497262775898, train_loss=0.05210450664162636, time_cost=1.6191918849945068
+
Steps: 1%| | 6478/1000000 [16:34:30<2562:02:51, 9.28s/it, lr=1e-5, step_loss=0.0465]
Steps: 1%| | 6479/1000000 [16:34:45<3028:04:24, 10.97s/it, lr=1e-5, step_loss=0.0465][RANK-0]: Step: [6479], local_loss=0.03892507031559944, train_loss=0.034221351146698, time_cost=6.3712005615234375
+
Steps: 1%| | 6479/1000000 [16:34:45<3028:04:24, 10.97s/it, lr=1e-5, step_loss=0.0389]
Steps: 1%| | 6480/1000000 [16:34:51<2612:02:11, 9.46s/it, lr=1e-5, step_loss=0.0389][RANK-0]: Step: [6480], local_loss=0.028298523277044296, train_loss=0.04223636910319328, time_cost=2.1993091106414795
+
Steps: 1%| | 6480/1000000 [16:34:51<2612:02:11, 9.46s/it, lr=1e-5, step_loss=0.0283]
Steps: 1%| | 6481/1000000 [16:34:57<2349:54:54, 8.51s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [6481], local_loss=0.01132412999868393, train_loss=0.026770152151584625, time_cost=1.4986708164215088
+
Steps: 1%| | 6481/1000000 [16:34:57<2349:54:54, 8.51s/it, lr=1e-5, step_loss=0.0113]
Steps: 1%| | 6482/1000000 [16:35:02<2059:50:29, 7.46s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [6482], local_loss=0.05319947004318237, train_loss=0.07621758431196213, time_cost=2.5679967403411865
+
Steps: 1%| | 6482/1000000 [16:35:02<2059:50:29, 7.46s/it, lr=1e-5, step_loss=0.0532]
Steps: 1%| | 6483/1000000 [16:35:09<2048:17:02, 7.42s/it, lr=1e-5, step_loss=0.0532][RANK-0]: Step: [6483], local_loss=0.04688827320933342, train_loss=27.3829345703125, time_cost=4.985115051269531
+
Steps: 1%| | 6483/1000000 [16:35:09<2048:17:02, 7.42s/it, lr=1e-5, step_loss=0.0469]
Steps: 1%| | 6484/1000000 [16:35:23<2549:40:01, 9.24s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [6484], local_loss=0.019438667222857475, train_loss=0.025960281491279602, time_cost=2.0635759830474854
+
Steps: 1%| | 6484/1000000 [16:35:23<2549:40:01, 9.24s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%| | 6485/1000000 [16:35:30<2356:51:29, 8.54s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [6485], local_loss=0.04652562364935875, train_loss=0.06460864096879959, time_cost=1.402902364730835
+
Steps: 1%| | 6485/1000000 [16:35:30<2356:51:29, 8.54s/it, lr=1e-5, step_loss=0.0465]
Steps: 1%| | 6486/1000000 [16:35:39<2402:33:47, 8.71s/it, lr=1e-5, step_loss=0.0465][RANK-0]: Step: [6486], local_loss=0.04269620403647423, train_loss=0.0827617421746254, time_cost=2.869410276412964
+
Steps: 1%| | 6486/1000000 [16:35:39<2402:33:47, 8.71s/it, lr=1e-5, step_loss=0.0427]
Steps: 1%| | 6487/1000000 [16:35:54<2890:37:08, 10.47s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [6487], local_loss=0.016299359500408173, train_loss=0.022116990759968758, time_cost=9.945539951324463
+
Steps: 1%| | 6487/1000000 [16:35:54<2890:37:08, 10.47s/it, lr=1e-5, step_loss=0.0163]
Steps: 1%| | 6488/1000000 [16:36:08<3250:50:24, 11.78s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [6488], local_loss=0.03475022688508034, train_loss=0.03140430152416229, time_cost=5.030506610870361
+
Steps: 1%| | 6488/1000000 [16:36:08<3250:50:24, 11.78s/it, lr=1e-5, step_loss=0.0348]
Steps: 1%| | 6489/1000000 [16:36:14<2704:26:06, 9.80s/it, lr=1e-5, step_loss=0.0348][RANK-0]: Step: [6489], local_loss=0.025217529386281967, train_loss=0.08574756979942322, time_cost=1.2615203857421875
+
Steps: 1%| | 6489/1000000 [16:36:14<2704:26:06, 9.80s/it, lr=1e-5, step_loss=0.0252]
Steps: 1%| | 6490/1000000 [16:36:23<2638:09:56, 9.56s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [6490], local_loss=0.019518304616212845, train_loss=9.854024887084961, time_cost=4.95301079750061
+
Steps: 1%| | 6490/1000000 [16:36:23<2638:09:56, 9.56s/it, lr=1e-5, step_loss=0.0195]
Steps: 1%| | 6491/1000000 [16:36:33<2711:30:58, 9.83s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [6491], local_loss=0.013107968494296074, train_loss=0.0673520490527153, time_cost=2.861680269241333
+
Steps: 1%| | 6491/1000000 [16:36:33<2711:30:58, 9.83s/it, lr=1e-5, step_loss=0.0131]
Steps: 1%| | 6492/1000000 [16:36:47<3049:03:14, 11.05s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [6492], local_loss=0.010909991338849068, train_loss=0.02318510226905346, time_cost=2.250795364379883
+
Steps: 1%| | 6492/1000000 [16:36:47<3049:03:14, 11.05s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%| | 6493/1000000 [16:36:55<2767:12:26, 10.03s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [6493], local_loss=0.013564398512244225, train_loss=0.13775263726711273, time_cost=3.252521514892578
+
Steps: 1%| | 6493/1000000 [16:36:55<2767:12:26, 10.03s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 6494/1000000 [16:37:01<2506:03:10, 9.08s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [6494], local_loss=0.020216263830661774, train_loss=0.042998190969228745, time_cost=5.0883214473724365
+
Steps: 1%| | 6494/1000000 [16:37:01<2506:03:10, 9.08s/it, lr=1e-5, step_loss=0.0202]
Steps: 1%| | 6495/1000000 [16:37:07<2182:51:36, 7.91s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [6495], local_loss=0.06298963725566864, train_loss=0.1919381320476532, time_cost=1.4617552757263184
+
Steps: 1%| | 6495/1000000 [16:37:07<2182:51:36, 7.91s/it, lr=1e-5, step_loss=0.063]
Steps: 1%| | 6496/1000000 [16:37:14<2141:38:07, 7.76s/it, lr=1e-5, step_loss=0.063][RANK-0]: Step: [6496], local_loss=0.015016885474324226, train_loss=0.08200076222419739, time_cost=3.3209054470062256
+
Steps: 1%| | 6496/1000000 [16:37:14<2141:38:07, 7.76s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 6497/1000000 [16:37:25<2408:01:52, 8.73s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [6497], local_loss=0.026893379166722298, train_loss=32.74668884277344, time_cost=1.5556066036224365
+
Steps: 1%| | 6497/1000000 [16:37:25<2408:01:52, 8.73s/it, lr=1e-5, step_loss=0.0269]
Steps: 1%| | 6498/1000000 [16:37:30<2117:44:43, 7.67s/it, lr=1e-5, step_loss=0.0269][RANK-0]: Step: [6498], local_loss=0.020244356244802475, train_loss=0.04318951070308685, time_cost=2.149029493331909
+
Steps: 1%| | 6498/1000000 [16:37:30<2117:44:43, 7.67s/it, lr=1e-5, step_loss=0.0202]
Steps: 1%| | 6499/1000000 [16:37:43<2561:48:11, 9.28s/it, lr=1e-5, step_loss=0.0202][RANK-0]: Step: [6499], local_loss=0.0171541478484869, train_loss=19.815994262695312, time_cost=5.056016206741333
+
Steps: 1%| | 6499/1000000 [16:37:43<2561:48:11, 9.28s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%| | 6500/1000000 [16:37:48<2195:27:01, 7.96s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [6500], local_loss=0.01716298609972, train_loss=0.14615680277347565, time_cost=1.74562406539917
+
Steps: 1%| | 6500/1000000 [16:37:48<2195:27:01, 7.96s/it, lr=1e-5, step_loss=0.0172]
Steps: 1%| | 6501/1000000 [16:37:53<1943:53:46, 7.04s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [6501], local_loss=0.11819818615913391, train_loss=0.03645985573530197, time_cost=2.038672685623169
+
Steps: 1%| | 6501/1000000 [16:37:53<1943:53:46, 7.04s/it, lr=1e-5, step_loss=0.118]
Steps: 1%| | 6502/1000000 [16:38:09<2680:26:53, 9.71s/it, lr=1e-5, step_loss=0.118][RANK-0]: Step: [6502], local_loss=0.007239287253469229, train_loss=0.04690541699528694, time_cost=9.084233283996582
+
Steps: 1%| | 6502/1000000 [16:38:09<2680:26:53, 9.71s/it, lr=1e-5, step_loss=0.00724]
Steps: 1%| | 6503/1000000 [16:38:21<2846:14:33, 10.31s/it, lr=1e-5, step_loss=0.00724][RANK-0]: Step: [6503], local_loss=0.07785830646753311, train_loss=0.030078276991844177, time_cost=1.758178949356079
+
Steps: 1%| | 6503/1000000 [16:38:21<2846:14:33, 10.31s/it, lr=1e-5, step_loss=0.0779]
Steps: 1%| | 6504/1000000 [16:38:26<2469:54:53, 8.95s/it, lr=1e-5, step_loss=0.0779][RANK-0]: Step: [6504], local_loss=0.16484948992729187, train_loss=0.042222149670124054, time_cost=1.6085145473480225
+
Steps: 1%| | 6504/1000000 [16:38:26<2469:54:53, 8.95s/it, lr=1e-5, step_loss=0.165]
Steps: 1%| | 6505/1000000 [16:38:35<2419:28:17, 8.77s/it, lr=1e-5, step_loss=0.165][RANK-0]: Step: [6505], local_loss=0.008667257614433765, train_loss=0.08599376678466797, time_cost=1.2421531677246094
+
Steps: 1%| | 6505/1000000 [16:38:35<2419:28:17, 8.77s/it, lr=1e-5, step_loss=0.00867]
Steps: 1%| | 6506/1000000 [16:38:39<2029:35:31, 7.35s/it, lr=1e-5, step_loss=0.00867][RANK-0]: Step: [6506], local_loss=0.03071148693561554, train_loss=0.022877048701047897, time_cost=1.3412446975708008
+
Steps: 1%| | 6506/1000000 [16:38:39<2029:35:31, 7.35s/it, lr=1e-5, step_loss=0.0307]
Steps: 1%| | 6507/1000000 [16:38:50<2378:54:17, 8.62s/it, lr=1e-5, step_loss=0.0307][RANK-0]: Step: [6507], local_loss=0.16813045740127563, train_loss=0.041401877999305725, time_cost=2.8744966983795166
+
Steps: 1%| | 6507/1000000 [16:38:50<2378:54:17, 8.62s/it, lr=1e-5, step_loss=0.168]
Steps: 1%| | 6508/1000000 [16:38:57<2255:00:45, 8.17s/it, lr=1e-5, step_loss=0.168][RANK-0]: Step: [6508], local_loss=0.21569213271141052, train_loss=0.06517721712589264, time_cost=5.673115491867065
+
Steps: 1%| | 6508/1000000 [16:38:57<2255:00:45, 8.17s/it, lr=1e-5, step_loss=0.216]
Steps: 1%| | 6509/1000000 [16:39:05<2200:23:38, 7.97s/it, lr=1e-5, step_loss=0.216][RANK-0]: Step: [6509], local_loss=0.11958448588848114, train_loss=0.033764034509658813, time_cost=2.8750572204589844
+
Steps: 1%| | 6509/1000000 [16:39:05<2200:23:38, 7.97s/it, lr=1e-5, step_loss=0.12]
Steps: 1%| | 6510/1000000 [16:39:14<2299:33:28, 8.33s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [6510], local_loss=0.01187284104526043, train_loss=0.13033287227153778, time_cost=3.0165443420410156
+
Steps: 1%| | 6510/1000000 [16:39:14<2299:33:28, 8.33s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 6511/1000000 [16:39:23<2355:16:18, 8.53s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [6511], local_loss=0.042063020169734955, train_loss=0.04521007835865021, time_cost=1.2413339614868164
+
Steps: 1%| | 6511/1000000 [16:39:23<2355:16:18, 8.53s/it, lr=1e-5, step_loss=0.0421]
Steps: 1%| | 6512/1000000 [16:39:34<2511:28:55, 9.10s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [6512], local_loss=0.03000997006893158, train_loss=0.030328471213579178, time_cost=9.23877215385437
+
Steps: 1%| | 6512/1000000 [16:39:34<2511:28:55, 9.10s/it, lr=1e-5, step_loss=0.03]
Steps: 1%| | 6513/1000000 [16:39:39<2204:33:43, 7.99s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [6513], local_loss=0.03217572346329689, train_loss=0.14684753119945526, time_cost=2.659634828567505
+
Steps: 1%| | 6513/1000000 [16:39:39<2204:33:43, 7.99s/it, lr=1e-5, step_loss=0.0322]
Steps: 1%| | 6514/1000000 [16:39:48<2296:13:34, 8.32s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [6514], local_loss=0.01358998566865921, train_loss=0.11054565757513046, time_cost=1.2611277103424072
+
Steps: 1%| | 6514/1000000 [16:39:48<2296:13:34, 8.32s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 6515/1000000 [16:39:57<2368:45:21, 8.58s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [6515], local_loss=0.012947622686624527, train_loss=0.022386308759450912, time_cost=3.2606630325317383
+
Steps: 1%| | 6515/1000000 [16:39:57<2368:45:21, 8.58s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%| | 6516/1000000 [16:40:09<2590:30:44, 9.39s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [6516], local_loss=0.05259428545832634, train_loss=0.04992096126079559, time_cost=1.3926310539245605
+
Steps: 1%| | 6516/1000000 [16:40:09<2590:30:44, 9.39s/it, lr=1e-5, step_loss=0.0526]
Steps: 1%| | 6517/1000000 [16:40:15<2320:33:23, 8.41s/it, lr=1e-5, step_loss=0.0526][RANK-0]: Step: [6517], local_loss=0.026504315435886383, train_loss=0.024521706625819206, time_cost=1.5555644035339355
+
Steps: 1%| | 6517/1000000 [16:40:15<2320:33:23, 8.41s/it, lr=1e-5, step_loss=0.0265]
Steps: 1%| | 6518/1000000 [16:40:20<2047:26:15, 7.42s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [6518], local_loss=0.017036404460668564, train_loss=0.05099007487297058, time_cost=2.6067724227905273
+
Steps: 1%| | 6518/1000000 [16:40:20<2047:26:15, 7.42s/it, lr=1e-5, step_loss=0.017]
Steps: 1%| | 6519/1000000 [16:40:34<2617:02:40, 9.48s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [6519], local_loss=0.006679730024188757, train_loss=0.020725112408399582, time_cost=6.3783485889434814
+
Steps: 1%| | 6519/1000000 [16:40:34<2617:02:40, 9.48s/it, lr=1e-5, step_loss=0.00668]
Steps: 1%| | 6520/1000000 [16:40:44<2637:27:52, 9.56s/it, lr=1e-5, step_loss=0.00668][RANK-0]: Step: [6520], local_loss=0.010059371590614319, train_loss=0.026924261823296547, time_cost=4.172487497329712
+
Steps: 1%| | 6520/1000000 [16:40:44<2637:27:52, 9.56s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 6521/1000000 [16:40:51<2451:06:54, 8.88s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [6521], local_loss=0.04448242112994194, train_loss=0.017251163721084595, time_cost=1.2099573612213135
+
Steps: 1%| | 6521/1000000 [16:40:51<2451:06:54, 8.88s/it, lr=1e-5, step_loss=0.0445]
Steps: 1%| | 6522/1000000 [16:41:08<3096:55:40, 11.22s/it, lr=1e-5, step_loss=0.0445][RANK-0]: Step: [6522], local_loss=0.03478636220097542, train_loss=0.03734062239527702, time_cost=7.246049642562866
+
Steps: 1%| | 6522/1000000 [16:41:08<3096:55:40, 11.22s/it, lr=1e-5, step_loss=0.0348]
Steps: 1%| | 6523/1000000 [16:41:15<2755:04:41, 9.98s/it, lr=1e-5, step_loss=0.0348][RANK-0]: Step: [6523], local_loss=0.015312612988054752, train_loss=0.048990555107593536, time_cost=1.2466914653778076
+
Steps: 1%| | 6523/1000000 [16:41:15<2755:04:41, 9.98s/it, lr=1e-5, step_loss=0.0153]
Steps: 1%| | 6524/1000000 [16:41:27<2923:38:57, 10.59s/it, lr=1e-5, step_loss=0.0153][RANK-0]: Step: [6524], local_loss=0.07821184396743774, train_loss=0.15179981291294098, time_cost=4.615896463394165
+
Steps: 1%| | 6524/1000000 [16:41:27<2923:38:57, 10.59s/it, lr=1e-5, step_loss=0.0782]
Steps: 1%| | 6525/1000000 [16:41:38<2951:24:03, 10.69s/it, lr=1e-5, step_loss=0.0782][RANK-0]: Step: [6525], local_loss=0.0580800399184227, train_loss=0.03425491601228714, time_cost=2.8487660884857178
+
Steps: 1%| | 6525/1000000 [16:41:38<2951:24:03, 10.69s/it, lr=1e-5, step_loss=0.0581]
Steps: 1%| | 6526/1000000 [16:41:43<2504:04:32, 9.07s/it, lr=1e-5, step_loss=0.0581][RANK-0]: Step: [6526], local_loss=0.016089312732219696, train_loss=0.02897867187857628, time_cost=1.6694278717041016
+
Steps: 1%| | 6526/1000000 [16:41:43<2504:04:32, 9.07s/it, lr=1e-5, step_loss=0.0161]
Steps: 1%| | 6527/1000000 [16:41:57<2895:33:52, 10.49s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [6527], local_loss=0.9968878626823425, train_loss=0.16800756752490997, time_cost=4.829838037490845
+
Steps: 1%| | 6527/1000000 [16:41:57<2895:33:52, 10.49s/it, lr=1e-5, step_loss=0.997]
Steps: 1%| | 6528/1000000 [16:42:05<2682:09:07, 9.72s/it, lr=1e-5, step_loss=0.997][RANK-0]: Step: [6528], local_loss=0.010648617520928383, train_loss=0.06893840432167053, time_cost=3.860412836074829
+
Steps: 1%| | 6528/1000000 [16:42:05<2682:09:07, 9.72s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%| | 6529/1000000 [16:42:11<2363:49:55, 8.57s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [6529], local_loss=0.03156612068414688, train_loss=0.062248595058918, time_cost=1.4930024147033691
+
Steps: 1%| | 6529/1000000 [16:42:11<2363:49:55, 8.57s/it, lr=1e-5, step_loss=0.0316]
Steps: 1%| | 6530/1000000 [16:42:15<1987:21:31, 7.20s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [6530], local_loss=0.054573941975831985, train_loss=22.498327255249023, time_cost=3.317509174346924
+
Steps: 1%| | 6530/1000000 [16:42:15<1987:21:31, 7.20s/it, lr=1e-5, step_loss=0.0546]
Steps: 1%| | 6531/1000000 [16:42:25<2268:24:42, 8.22s/it, lr=1e-5, step_loss=0.0546][RANK-0]: Step: [6531], local_loss=0.01579749584197998, train_loss=0.02143597975373268, time_cost=1.3066465854644775
+
Steps: 1%| | 6531/1000000 [16:42:25<2268:24:42, 8.22s/it, lr=1e-5, step_loss=0.0158]
Steps: 1%| | 6532/1000000 [16:42:33<2204:44:08, 7.99s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [6532], local_loss=0.007522947154939175, train_loss=0.019923508167266846, time_cost=1.2355806827545166
+
Steps: 1%| | 6532/1000000 [16:42:33<2204:44:08, 7.99s/it, lr=1e-5, step_loss=0.00752]
Steps: 1%| | 6533/1000000 [16:42:40<2135:43:59, 7.74s/it, lr=1e-5, step_loss=0.00752][RANK-0]: Step: [6533], local_loss=0.01075083576142788, train_loss=0.1834549903869629, time_cost=1.232198715209961
+
Steps: 1%| | 6533/1000000 [16:42:40<2135:43:59, 7.74s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%| | 6534/1000000 [16:42:51<2443:12:09, 8.85s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [6534], local_loss=0.04685854911804199, train_loss=20.19144630432129, time_cost=1.2256262302398682
+
Steps: 1%| | 6534/1000000 [16:42:51<2443:12:09, 8.85s/it, lr=1e-5, step_loss=0.0469]
Steps: 1%| | 6535/1000000 [16:42:58<2234:53:20, 8.10s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [6535], local_loss=0.015437503345310688, train_loss=0.13110895454883575, time_cost=4.537118196487427
+
Steps: 1%| | 6535/1000000 [16:42:58<2234:53:20, 8.10s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%| | 6536/1000000 [16:43:05<2165:32:33, 7.85s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [6536], local_loss=0.019467052072286606, train_loss=0.049612630158662796, time_cost=2.589348316192627
+
Steps: 1%| | 6536/1000000 [16:43:05<2165:32:33, 7.85s/it, lr=1e-5, step_loss=0.0195]
Steps: 1%| | 6537/1000000 [16:43:16<2412:55:02, 8.74s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [6537], local_loss=0.01730741746723652, train_loss=0.038046080619096756, time_cost=8.08266305923462
+
Steps: 1%| | 6537/1000000 [16:43:16<2412:55:02, 8.74s/it, lr=1e-5, step_loss=0.0173]
Steps: 1%| | 6538/1000000 [16:43:27<2603:38:07, 9.43s/it, lr=1e-5, step_loss=0.0173][RANK-0]: Step: [6538], local_loss=0.01351630873978138, train_loss=0.01931571587920189, time_cost=3.8106932640075684
+
Steps: 1%| | 6538/1000000 [16:43:27<2603:38:07, 9.43s/it, lr=1e-5, step_loss=0.0135]
Steps: 1%| | 6539/1000000 [16:43:38<2731:50:12, 9.90s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [6539], local_loss=0.03457977622747421, train_loss=0.02272738516330719, time_cost=2.1678035259246826
+
Steps: 1%| | 6539/1000000 [16:43:38<2731:50:12, 9.90s/it, lr=1e-5, step_loss=0.0346]
Steps: 1%| | 6540/1000000 [16:43:46<2597:55:00, 9.41s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [6540], local_loss=0.03485031798481941, train_loss=0.022068439051508904, time_cost=2.8271920680999756
+
Steps: 1%| | 6540/1000000 [16:43:46<2597:55:00, 9.41s/it, lr=1e-5, step_loss=0.0349]
Steps: 1%| | 6541/1000000 [16:43:51<2231:44:18, 8.09s/it, lr=1e-5, step_loss=0.0349][RANK-0]: Step: [6541], local_loss=0.04033583775162697, train_loss=6.469747066497803, time_cost=3.7807040214538574
+
Steps: 1%| | 6541/1000000 [16:43:51<2231:44:18, 8.09s/it, lr=1e-5, step_loss=0.0403]
Steps: 1%| | 6542/1000000 [16:43:57<2048:13:45, 7.42s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [6542], local_loss=0.02942844107747078, train_loss=0.08089660108089447, time_cost=2.7318878173828125
+
Steps: 1%| | 6542/1000000 [16:43:57<2048:13:45, 7.42s/it, lr=1e-5, step_loss=0.0294]
Steps: 1%| | 6543/1000000 [16:44:07<2272:28:17, 8.23s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [6543], local_loss=0.02479412406682968, train_loss=0.0666041448712349, time_cost=1.2402355670928955
+
Steps: 1%| | 6543/1000000 [16:44:07<2272:28:17, 8.23s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%| | 6544/1000000 [16:44:22<2825:59:21, 10.24s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [6544], local_loss=0.02057076059281826, train_loss=0.14555242657661438, time_cost=1.238044261932373
+
Steps: 1%| | 6544/1000000 [16:44:22<2825:59:21, 10.24s/it, lr=1e-5, step_loss=0.0206]
Steps: 1%| | 6545/1000000 [16:44:35<3045:04:55, 11.03s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [6545], local_loss=0.017755046486854553, train_loss=0.02974613755941391, time_cost=5.329632043838501
+
Steps: 1%| | 6545/1000000 [16:44:35<3045:04:55, 11.03s/it, lr=1e-5, step_loss=0.0178]
Steps: 1%| | 6546/1000000 [16:44:44<2893:04:59, 10.48s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [6546], local_loss=0.008596794679760933, train_loss=0.026117246598005295, time_cost=7.926831483840942
+
Steps: 1%| | 6546/1000000 [16:44:44<2893:04:59, 10.48s/it, lr=1e-5, step_loss=0.0086]
Steps: 1%| | 6547/1000000 [16:44:49<2423:20:27, 8.78s/it, lr=1e-5, step_loss=0.0086][RANK-0]: Step: [6547], local_loss=0.09782595187425613, train_loss=0.17310938239097595, time_cost=3.40616512298584
+
Steps: 1%| | 6547/1000000 [16:44:49<2423:20:27, 8.78s/it, lr=1e-5, step_loss=0.0978]
Steps: 1%| | 6548/1000000 [16:44:58<2459:01:45, 8.91s/it, lr=1e-5, step_loss=0.0978][RANK-0]: Step: [6548], local_loss=0.023323869332671165, train_loss=0.08766123652458191, time_cost=1.2256247997283936
+
Steps: 1%| | 6548/1000000 [16:44:58<2459:01:45, 8.91s/it, lr=1e-5, step_loss=0.0233]
Steps: 1%| | 6549/1000000 [16:45:06<2367:59:43, 8.58s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [6549], local_loss=0.03766271471977234, train_loss=0.0344155915081501, time_cost=2.359234094619751
+
Steps: 1%| | 6549/1000000 [16:45:06<2367:59:43, 8.58s/it, lr=1e-5, step_loss=0.0377]
Steps: 1%| | 6550/1000000 [16:45:14<2287:22:31, 8.29s/it, lr=1e-5, step_loss=0.0377][RANK-0]: Step: [6550], local_loss=1.0008914470672607, train_loss=0.1491503119468689, time_cost=3.381709575653076
+
Steps: 1%| | 6550/1000000 [16:45:14<2287:22:31, 8.29s/it, lr=1e-5, step_loss=1]
Steps: 1%| | 6551/1000000 [16:45:24<2450:43:33, 8.88s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [6551], local_loss=0.007229556795209646, train_loss=0.06843353807926178, time_cost=2.6410069465637207
+
Steps: 1%| | 6551/1000000 [16:45:24<2450:43:33, 8.88s/it, lr=1e-5, step_loss=0.00723]
Steps: 1%| | 6552/1000000 [16:45:31<2280:51:34, 8.27s/it, lr=1e-5, step_loss=0.00723][RANK-0]: Step: [6552], local_loss=0.03454452380537987, train_loss=0.047667283564805984, time_cost=2.6878061294555664
+
Steps: 1%| | 6552/1000000 [16:45:31<2280:51:34, 8.27s/it, lr=1e-5, step_loss=0.0345]
Steps: 1%| | 6553/1000000 [16:45:36<2034:31:08, 7.37s/it, lr=1e-5, step_loss=0.0345][RANK-0]: Step: [6553], local_loss=0.013586670160293579, train_loss=0.06959524750709534, time_cost=2.3667702674865723
+
Steps: 1%| | 6553/1000000 [16:45:36<2034:31:08, 7.37s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 6554/1000000 [16:45:47<2306:23:25, 8.36s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [6554], local_loss=0.04399571567773819, train_loss=0.0392531156539917, time_cost=7.972821235656738
+
Steps: 1%| | 6554/1000000 [16:45:47<2306:23:25, 8.36s/it, lr=1e-5, step_loss=0.044]
Steps: 1%| | 6555/1000000 [16:46:02<2854:01:26, 10.34s/it, lr=1e-5, step_loss=0.044][RANK-0]: Step: [6555], local_loss=0.10319684445858002, train_loss=0.042487941682338715, time_cost=1.765181303024292
+
Steps: 1%| | 6555/1000000 [16:46:02<2854:01:26, 10.34s/it, lr=1e-5, step_loss=0.103]
Steps: 1%| | 6556/1000000 [16:46:11<2792:03:46, 10.12s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [6556], local_loss=0.013628557324409485, train_loss=0.05570008605718613, time_cost=1.3096632957458496
+
Steps: 1%| | 6556/1000000 [16:46:11<2792:03:46, 10.12s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 6557/1000000 [16:46:19<2577:52:26, 9.34s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [6557], local_loss=0.18729211390018463, train_loss=0.16994963586330414, time_cost=3.591808557510376
+
Steps: 1%| | 6557/1000000 [16:46:19<2577:52:26, 9.34s/it, lr=1e-5, step_loss=0.187]
Steps: 1%| | 6558/1000000 [16:46:24<2232:42:00, 8.09s/it, lr=1e-5, step_loss=0.187][RANK-0]: Step: [6558], local_loss=0.014229075983166695, train_loss=43.28852462768555, time_cost=1.9060523509979248
+
Steps: 1%| | 6558/1000000 [16:46:24<2232:42:00, 8.09s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%| | 6559/1000000 [16:46:30<2066:54:56, 7.49s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [6559], local_loss=1.050909399986267, train_loss=0.20749521255493164, time_cost=1.4068479537963867
+
Steps: 1%| | 6559/1000000 [16:46:30<2066:54:56, 7.49s/it, lr=1e-5, step_loss=1.05]
Steps: 1%| | 6560/1000000 [16:46:35<1874:27:06, 6.79s/it, lr=1e-5, step_loss=1.05][RANK-0]: Step: [6560], local_loss=0.013802044093608856, train_loss=0.03719513118267059, time_cost=2.11069655418396
+
Steps: 1%| | 6560/1000000 [16:46:35<1874:27:06, 6.79s/it, lr=1e-5, step_loss=0.0138]
Steps: 1%| | 6561/1000000 [16:46:46<2184:47:31, 7.92s/it, lr=1e-5, step_loss=0.0138][RANK-0]: Step: [6561], local_loss=0.04139140993356705, train_loss=0.034814346581697464, time_cost=8.780011177062988
+
Steps: 1%| | 6561/1000000 [16:46:46<2184:47:31, 7.92s/it, lr=1e-5, step_loss=0.0414]
Steps: 1%| | 6562/1000000 [16:46:52<2019:12:08, 7.32s/it, lr=1e-5, step_loss=0.0414][RANK-0]: Step: [6562], local_loss=0.046344026923179626, train_loss=0.03486159071326256, time_cost=1.720092535018921
+
Steps: 1%| | 6562/1000000 [16:46:52<2019:12:08, 7.32s/it, lr=1e-5, step_loss=0.0463]
Steps: 1%| | 6563/1000000 [16:46:56<1768:52:35, 6.41s/it, lr=1e-5, step_loss=0.0463][RANK-0]: Step: [6563], local_loss=0.015187670476734638, train_loss=0.061845049262046814, time_cost=1.2248668670654297
+
Steps: 1%| | 6563/1000000 [16:46:56<1768:52:35, 6.41s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%| | 6564/1000000 [16:47:07<2161:31:52, 7.83s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [6564], local_loss=0.023657504469156265, train_loss=8.83393669128418, time_cost=3.4428768157958984
+
Steps: 1%| | 6564/1000000 [16:47:07<2161:31:52, 7.83s/it, lr=1e-5, step_loss=0.0237]
Steps: 1%| | 6565/1000000 [16:47:12<1930:35:04, 7.00s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [6565], local_loss=0.021573252975940704, train_loss=0.0668020099401474, time_cost=2.0515151023864746
+
Steps: 1%| | 6565/1000000 [16:47:12<1930:35:04, 7.00s/it, lr=1e-5, step_loss=0.0216]
Steps: 1%| | 6566/1000000 [16:47:25<2400:58:05, 8.70s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [6566], local_loss=0.013253959827125072, train_loss=0.024661045521497726, time_cost=1.3984642028808594
+
Steps: 1%| | 6566/1000000 [16:47:25<2400:58:05, 8.70s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%| | 6567/1000000 [16:47:34<2416:12:21, 8.76s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [6567], local_loss=0.01939653418958187, train_loss=0.030963286757469177, time_cost=2.8502371311187744
+
Steps: 1%| | 6567/1000000 [16:47:34<2416:12:21, 8.76s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%| | 6568/1000000 [16:47:44<2549:29:17, 9.24s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [6568], local_loss=0.022336779162287712, train_loss=0.020090637728571892, time_cost=2.23230242729187
+
Steps: 1%| | 6568/1000000 [16:47:44<2549:29:17, 9.24s/it, lr=1e-5, step_loss=0.0223]
Steps: 1%| | 6569/1000000 [16:47:53<2528:54:53, 9.16s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [6569], local_loss=0.009039775468409061, train_loss=0.031791746616363525, time_cost=1.2238807678222656
+
Steps: 1%| | 6569/1000000 [16:47:53<2528:54:53, 9.16s/it, lr=1e-5, step_loss=0.00904]
Steps: 1%| | 6570/1000000 [16:47:57<2106:29:52, 7.63s/it, lr=1e-5, step_loss=0.00904][RANK-0]: Step: [6570], local_loss=0.03202975168824196, train_loss=0.05148264765739441, time_cost=1.2296431064605713
+
Steps: 1%| | 6570/1000000 [16:47:57<2106:29:52, 7.63s/it, lr=1e-5, step_loss=0.032]
Steps: 1%| | 6571/1000000 [16:48:08<2380:03:20, 8.62s/it, lr=1e-5, step_loss=0.032][RANK-0]: Step: [6571], local_loss=0.36928999423980713, train_loss=0.10025292634963989, time_cost=1.1968910694122314
+
Steps: 1%| | 6571/1000000 [16:48:08<2380:03:20, 8.62s/it, lr=1e-5, step_loss=0.369]
Steps: 1%| | 6572/1000000 [16:48:16<2365:52:40, 8.57s/it, lr=1e-5, step_loss=0.369][RANK-0]: Step: [6572], local_loss=0.010313480161130428, train_loss=0.03381118178367615, time_cost=1.9861929416656494
+
Steps: 1%| | 6572/1000000 [16:48:16<2365:52:40, 8.57s/it, lr=1e-5, step_loss=0.0103]
Steps: 1%| | 6573/1000000 [16:48:28<2606:30:38, 9.45s/it, lr=1e-5, step_loss=0.0103][RANK-0]: Step: [6573], local_loss=0.019152916967868805, train_loss=0.026607414707541466, time_cost=3.5403363704681396
+
Steps: 1%| | 6573/1000000 [16:48:28<2606:30:38, 9.45s/it, lr=1e-5, step_loss=0.0192]
Steps: 1%| | 6574/1000000 [16:48:39<2735:17:39, 9.91s/it, lr=1e-5, step_loss=0.0192][RANK-0]: Step: [6574], local_loss=0.014209795743227005, train_loss=0.026822950690984726, time_cost=1.3286900520324707
+
Steps: 1%| | 6574/1000000 [16:48:39<2735:17:39, 9.91s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%| | 6575/1000000 [16:48:45<2408:27:18, 8.73s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [6575], local_loss=0.00839502178132534, train_loss=0.014817965216934681, time_cost=2.242556095123291
+
Steps: 1%| | 6575/1000000 [16:48:45<2408:27:18, 8.73s/it, lr=1e-5, step_loss=0.0084]
Steps: 1%| | 6576/1000000 [16:48:51<2189:34:06, 7.93s/it, lr=1e-5, step_loss=0.0084][RANK-0]: Step: [6576], local_loss=1.0082941055297852, train_loss=0.27255579829216003, time_cost=1.7854037284851074
+
Steps: 1%| | 6576/1000000 [16:48:51<2189:34:06, 7.93s/it, lr=1e-5, step_loss=1.01]
Steps: 1%| | 6577/1000000 [16:49:00<2273:47:01, 8.24s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [6577], local_loss=0.0065167006105184555, train_loss=0.03460872918367386, time_cost=1.327277660369873
+
Steps: 1%| | 6577/1000000 [16:49:00<2273:47:01, 8.24s/it, lr=1e-5, step_loss=0.00652]
Steps: 1%| | 6578/1000000 [16:49:09<2338:24:18, 8.47s/it, lr=1e-5, step_loss=0.00652][RANK-0]: Step: [6578], local_loss=0.00950213149189949, train_loss=0.027163993567228317, time_cost=1.4646217823028564
+
Steps: 1%| | 6578/1000000 [16:49:09<2338:24:18, 8.47s/it, lr=1e-5, step_loss=0.0095]
Steps: 1%| | 6579/1000000 [16:49:23<2782:00:57, 10.08s/it, lr=1e-5, step_loss=0.0095][RANK-0]: Step: [6579], local_loss=0.046350378543138504, train_loss=0.05069825053215027, time_cost=1.3709020614624023
+
Steps: 1%| | 6579/1000000 [16:49:23<2782:00:57, 10.08s/it, lr=1e-5, step_loss=0.0464]
Steps: 1%| | 6580/1000000 [16:49:32<2689:03:51, 9.74s/it, lr=1e-5, step_loss=0.0464][RANK-0]: Step: [6580], local_loss=0.007737970910966396, train_loss=0.022231247276067734, time_cost=7.3825154304504395
+
Steps: 1%| | 6580/1000000 [16:49:32<2689:03:51, 9.74s/it, lr=1e-5, step_loss=0.00774]
Steps: 1%| | 6581/1000000 [16:49:43<2827:35:35, 10.25s/it, lr=1e-5, step_loss=0.00774][RANK-0]: Step: [6581], local_loss=0.011494211852550507, train_loss=0.05341566354036331, time_cost=1.2998642921447754
+
Steps: 1%| | 6581/1000000 [16:49:43<2827:35:35, 10.25s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%| | 6582/1000000 [16:49:51<2605:03:27, 9.44s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [6582], local_loss=0.2689388394355774, train_loss=0.06680873036384583, time_cost=2.0657565593719482
+
Steps: 1%| | 6582/1000000 [16:49:51<2605:03:27, 9.44s/it, lr=1e-5, step_loss=0.269]
Steps: 1%| | 6583/1000000 [16:49:56<2272:16:20, 8.23s/it, lr=1e-5, step_loss=0.269][RANK-0]: Step: [6583], local_loss=0.008913839235901833, train_loss=0.0340387187898159, time_cost=2.514523983001709
+
Steps: 1%| | 6583/1000000 [16:49:56<2272:16:20, 8.23s/it, lr=1e-5, step_loss=0.00891]
Steps: 1%| | 6584/1000000 [16:50:10<2775:56:03, 10.06s/it, lr=1e-5, step_loss=0.00891][RANK-0]: Step: [6584], local_loss=0.009247077628970146, train_loss=0.04387561231851578, time_cost=4.931076765060425
+
Steps: 1%| | 6584/1000000 [16:50:10<2775:56:03, 10.06s/it, lr=1e-5, step_loss=0.00925]
Steps: 1%| | 6585/1000000 [16:50:27<3286:06:23, 11.91s/it, lr=1e-5, step_loss=0.00925][RANK-0]: Step: [6585], local_loss=0.01134166494011879, train_loss=0.03950680047273636, time_cost=5.233215093612671
+
Steps: 1%| | 6585/1000000 [16:50:27<3286:06:23, 11.91s/it, lr=1e-5, step_loss=0.0113]
Steps: 1%| | 6586/1000000 [16:50:32<2719:23:02, 9.85s/it, lr=1e-5, step_loss=0.0113][RANK-0]: Step: [6586], local_loss=0.013048229739069939, train_loss=0.013560500927269459, time_cost=1.3159270286560059
+
Steps: 1%| | 6586/1000000 [16:50:32<2719:23:02, 9.85s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 6587/1000000 [16:50:44<2878:04:35, 10.43s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [6587], local_loss=0.00851011835038662, train_loss=0.021534139290452003, time_cost=4.1808507442474365
+
Steps: 1%| | 6587/1000000 [16:50:44<2878:04:35, 10.43s/it, lr=1e-5, step_loss=0.00851]
Steps: 1%| | 6588/1000000 [16:50:51<2607:42:27, 9.45s/it, lr=1e-5, step_loss=0.00851][RANK-0]: Step: [6588], local_loss=0.020108507946133614, train_loss=0.16921605169773102, time_cost=2.823594808578491
+
Steps: 1%| | 6588/1000000 [16:50:51<2607:42:27, 9.45s/it, lr=1e-5, step_loss=0.0201]
Steps: 1%| | 6589/1000000 [16:50:56<2277:14:20, 8.25s/it, lr=1e-5, step_loss=0.0201][RANK-0]: Step: [6589], local_loss=0.031023653224110603, train_loss=0.04917314648628235, time_cost=4.1173179149627686
+
Steps: 1%| | 6589/1000000 [16:50:56<2277:14:20, 8.25s/it, lr=1e-5, step_loss=0.031]
Steps: 1%| | 6590/1000000 [16:51:03<2145:04:09, 7.77s/it, lr=1e-5, step_loss=0.031][RANK-0]: Step: [6590], local_loss=0.03525429219007492, train_loss=0.14522720873355865, time_cost=2.1994574069976807
+
Steps: 1%| | 6590/1000000 [16:51:03<2145:04:09, 7.77s/it, lr=1e-5, step_loss=0.0353]
Steps: 1%| | 6591/1000000 [16:51:14<2399:33:21, 8.70s/it, lr=1e-5, step_loss=0.0353][RANK-0]: Step: [6591], local_loss=0.11741309612989426, train_loss=0.17296209931373596, time_cost=3.2952628135681152
+
Steps: 1%| | 6591/1000000 [16:51:14<2399:33:21, 8.70s/it, lr=1e-5, step_loss=0.117]
Steps: 1%| | 6592/1000000 [16:51:19<2101:00:47, 7.61s/it, lr=1e-5, step_loss=0.117][RANK-0]: Step: [6592], local_loss=0.008851013146340847, train_loss=0.06090544909238815, time_cost=1.2474591732025146
+
Steps: 1%| | 6592/1000000 [16:51:19<2101:00:47, 7.61s/it, lr=1e-5, step_loss=0.00885]
Steps: 1%| | 6593/1000000 [16:51:24<1944:59:41, 7.05s/it, lr=1e-5, step_loss=0.00885][RANK-0]: Step: [6593], local_loss=0.05030442401766777, train_loss=0.06319660693407059, time_cost=1.3203167915344238
+
Steps: 1%| | 6593/1000000 [16:51:24<1944:59:41, 7.05s/it, lr=1e-5, step_loss=0.0503]
Steps: 1%| | 6594/1000000 [16:51:29<1711:38:44, 6.20s/it, lr=1e-5, step_loss=0.0503][RANK-0]: Step: [6594], local_loss=0.019876781851053238, train_loss=0.0190243162214756, time_cost=1.2185513973236084
+
Steps: 1%| | 6594/1000000 [16:51:29<1711:38:44, 6.20s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%| | 6595/1000000 [16:51:34<1617:03:04, 5.86s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [6595], local_loss=0.02334090881049633, train_loss=0.1682521104812622, time_cost=2.2784552574157715
+
Steps: 1%| | 6595/1000000 [16:51:34<1617:03:04, 5.86s/it, lr=1e-5, step_loss=0.0233]
Steps: 1%| | 6596/1000000 [16:51:44<2019:55:34, 7.32s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [6596], local_loss=0.014116277918219566, train_loss=0.03767213970422745, time_cost=1.3670380115509033
+
Steps: 1%| | 6596/1000000 [16:51:44<2019:55:34, 7.32s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%| | 6597/1000000 [16:51:55<2289:10:16, 8.30s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [6597], local_loss=0.014130217023193836, train_loss=0.06198367848992348, time_cost=4.159462928771973
+
Steps: 1%| | 6597/1000000 [16:51:55<2289:10:16, 8.30s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%| | 6598/1000000 [16:52:08<2704:06:04, 9.80s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [6598], local_loss=0.05271865054965019, train_loss=0.07703301310539246, time_cost=1.4939253330230713
+
Steps: 1%| | 6598/1000000 [16:52:08<2704:06:04, 9.80s/it, lr=1e-5, step_loss=0.0527]
Steps: 1%| | 6599/1000000 [16:52:19<2793:00:13, 10.12s/it, lr=1e-5, step_loss=0.0527][RANK-0]: Step: [6599], local_loss=0.014017250388860703, train_loss=0.027373094111680984, time_cost=2.9330360889434814
+
Steps: 1%| | 6599/1000000 [16:52:19<2793:00:13, 10.12s/it, lr=1e-5, step_loss=0.014]
Steps: 1%| | 6600/1000000 [16:52:26<2535:51:05, 9.19s/it, lr=1e-5, step_loss=0.014][RANK-0]: Step: [6600], local_loss=0.013267314061522484, train_loss=0.02829257771372795, time_cost=1.2315707206726074
+
Steps: 1%| | 6600/1000000 [16:52:26<2535:51:05, 9.19s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%| | 6601/1000000 [16:52:38<2731:55:31, 9.90s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [6601], local_loss=0.0222586989402771, train_loss=0.060865581035614014, time_cost=1.9392077922821045
+
Steps: 1%| | 6601/1000000 [16:52:38<2731:55:31, 9.90s/it, lr=1e-5, step_loss=0.0223]
Steps: 1%| | 6602/1000000 [16:52:44<2416:20:10, 8.76s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [6602], local_loss=0.009563710540533066, train_loss=0.019452502951025963, time_cost=1.7457656860351562
+
Steps: 1%| | 6602/1000000 [16:52:44<2416:20:10, 8.76s/it, lr=1e-5, step_loss=0.00956]
Steps: 1%| | 6603/1000000 [16:52:58<2882:36:22, 10.45s/it, lr=1e-5, step_loss=0.00956][RANK-0]: Step: [6603], local_loss=0.024197179824113846, train_loss=0.05956655740737915, time_cost=1.5374979972839355
+
Steps: 1%| | 6603/1000000 [16:52:58<2882:36:22, 10.45s/it, lr=1e-5, step_loss=0.0242]
Steps: 1%| | 6604/1000000 [16:53:10<2947:14:08, 10.68s/it, lr=1e-5, step_loss=0.0242][RANK-0]: Step: [6604], local_loss=0.019425731152296066, train_loss=18.85315704345703, time_cost=3.8734450340270996
+
Steps: 1%| | 6604/1000000 [16:53:10<2947:14:08, 10.68s/it, lr=1e-5, step_loss=0.0194]
Steps: 1%| | 6605/1000000 [16:53:23<3212:56:23, 11.64s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [6605], local_loss=0.010496186092495918, train_loss=0.08672627061605453, time_cost=5.39688777923584
+
Steps: 1%| | 6605/1000000 [16:53:23<3212:56:23, 11.64s/it, lr=1e-5, step_loss=0.0105]
Steps: 1%| | 6606/1000000 [16:53:38<3447:04:23, 12.49s/it, lr=1e-5, step_loss=0.0105][RANK-0]: Step: [6606], local_loss=1.0018082857131958, train_loss=0.18845036625862122, time_cost=1.22648024559021
+
Steps: 1%| | 6606/1000000 [16:53:38<3447:04:23, 12.49s/it, lr=1e-5, step_loss=1]
Steps: 1%| | 6607/1000000 [16:53:47<3205:01:50, 11.61s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [6607], local_loss=0.012076109647750854, train_loss=0.030466711148619652, time_cost=2.2693936824798584
+
Steps: 1%| | 6607/1000000 [16:53:47<3205:01:50, 11.61s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%| | 6608/1000000 [16:53:54<2800:14:03, 10.15s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [6608], local_loss=0.01540365256369114, train_loss=0.02210535854101181, time_cost=2.1998603343963623
+
Steps: 1%| | 6608/1000000 [16:53:54<2800:14:03, 10.15s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%| | 6609/1000000 [16:54:10<3264:31:55, 11.83s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [6609], local_loss=0.04325777292251587, train_loss=0.030314411967992783, time_cost=7.414634943008423
+
Steps: 1%| | 6609/1000000 [16:54:10<3264:31:55, 11.83s/it, lr=1e-5, step_loss=0.0433]
Steps: 1%| | 6610/1000000 [16:54:16<2768:52:55, 10.03s/it, lr=1e-5, step_loss=0.0433][RANK-0]: Step: [6610], local_loss=0.023740254342556, train_loss=0.021273499354720116, time_cost=1.6960275173187256
+
Steps: 1%| | 6610/1000000 [16:54:16<2768:52:55, 10.03s/it, lr=1e-5, step_loss=0.0237]
Steps: 1%| | 6611/1000000 [16:54:30<3123:52:55, 11.32s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [6611], local_loss=0.015703514218330383, train_loss=0.16876105964183807, time_cost=4.24433159828186
+
Steps: 1%| | 6611/1000000 [16:54:30<3123:52:55, 11.32s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 6612/1000000 [16:54:34<2544:41:42, 9.22s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [6612], local_loss=0.007983668707311153, train_loss=0.06235393136739731, time_cost=1.5623443126678467
+
Steps: 1%| | 6612/1000000 [16:54:34<2544:41:42, 9.22s/it, lr=1e-5, step_loss=0.00798]
Steps: 1%| | 6613/1000000 [16:54:44<2553:36:19, 9.25s/it, lr=1e-5, step_loss=0.00798][RANK-0]: Step: [6613], local_loss=0.05743764340877533, train_loss=0.08406022191047668, time_cost=3.6833343505859375
+
Steps: 1%| | 6613/1000000 [16:54:44<2553:36:19, 9.25s/it, lr=1e-5, step_loss=0.0574]
Steps: 1%| | 6614/1000000 [16:54:53<2520:19:49, 9.13s/it, lr=1e-5, step_loss=0.0574][RANK-0]: Step: [6614], local_loss=1.0139721632003784, train_loss=0.16323064267635345, time_cost=1.2808301448822021
+
Steps: 1%| | 6614/1000000 [16:54:53<2520:19:49, 9.13s/it, lr=1e-5, step_loss=1.01]
Steps: 1%| | 6615/1000000 [16:55:01<2473:41:22, 8.96s/it, lr=1e-5, step_loss=1.01][RANK-0]: Step: [6615], local_loss=0.09887013584375381, train_loss=0.032568253576755524, time_cost=2.5617997646331787
+
Steps: 1%| | 6615/1000000 [16:55:01<2473:41:22, 8.96s/it, lr=1e-5, step_loss=0.0989]
Steps: 1%| | 6616/1000000 [16:55:09<2385:54:02, 8.65s/it, lr=1e-5, step_loss=0.0989][RANK-0]: Step: [6616], local_loss=0.07857124507427216, train_loss=15.103605270385742, time_cost=6.553258657455444
+
Steps: 1%| | 6616/1000000 [16:55:09<2385:54:02, 8.65s/it, lr=1e-5, step_loss=0.0786]
Steps: 1%| | 6617/1000000 [16:55:14<2075:47:57, 7.52s/it, lr=1e-5, step_loss=0.0786][RANK-0]: Step: [6617], local_loss=0.023511173203587532, train_loss=0.23894181847572327, time_cost=1.8359215259552002
+
Steps: 1%| | 6617/1000000 [16:55:14<2075:47:57, 7.52s/it, lr=1e-5, step_loss=0.0235]
Steps: 1%| | 6618/1000000 [16:55:22<2108:54:30, 7.64s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [6618], local_loss=0.046952325850725174, train_loss=0.12128743529319763, time_cost=1.2795510292053223
+
Steps: 1%| | 6618/1000000 [16:55:22<2108:54:30, 7.64s/it, lr=1e-5, step_loss=0.047]
Steps: 1%| | 6619/1000000 [16:55:33<2393:02:14, 8.67s/it, lr=1e-5, step_loss=0.047][RANK-0]: Step: [6619], local_loss=0.03160494193434715, train_loss=0.07794231921434402, time_cost=3.515199661254883
+
Steps: 1%| | 6619/1000000 [16:55:33<2393:02:14, 8.67s/it, lr=1e-5, step_loss=0.0316]
Steps: 1%| | 6620/1000000 [16:55:50<3057:04:25, 11.08s/it, lr=1e-5, step_loss=0.0316][RANK-0]: Step: [6620], local_loss=0.011790611781179905, train_loss=0.02126253768801689, time_cost=9.331677436828613
+
Steps: 1%| | 6620/1000000 [16:55:50<3057:04:25, 11.08s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 6621/1000000 [16:55:56<2684:20:08, 9.73s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [6621], local_loss=0.012712422758340836, train_loss=23.925918579101562, time_cost=2.686826467514038
+
Steps: 1%| | 6621/1000000 [16:55:56<2684:20:08, 9.73s/it, lr=1e-5, step_loss=0.0127]
Steps: 1%| | 6622/1000000 [16:56:08<2840:36:29, 10.29s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [6622], local_loss=0.011122321709990501, train_loss=0.038863178342580795, time_cost=3.94480037689209
+
Steps: 1%| | 6622/1000000 [16:56:08<2840:36:29, 10.29s/it, lr=1e-5, step_loss=0.0111]
Steps: 1%| | 6623/1000000 [16:56:19<2901:18:13, 10.51s/it, lr=1e-5, step_loss=0.0111][RANK-0]: Step: [6623], local_loss=0.008350900374352932, train_loss=0.14431564509868622, time_cost=1.5133399963378906
+
Steps: 1%| | 6623/1000000 [16:56:19<2901:18:13, 10.51s/it, lr=1e-5, step_loss=0.00835]
Steps: 1%| | 6624/1000000 [16:56:24<2444:44:12, 8.86s/it, lr=1e-5, step_loss=0.00835][RANK-0]: Step: [6624], local_loss=0.04547544941306114, train_loss=0.03193322569131851, time_cost=2.0480077266693115
+
Steps: 1%| | 6624/1000000 [16:56:24<2444:44:12, 8.86s/it, lr=1e-5, step_loss=0.0455]
Steps: 1%| | 6625/1000000 [16:56:35<2627:39:21, 9.52s/it, lr=1e-5, step_loss=0.0455][RANK-0]: Step: [6625], local_loss=0.01390928216278553, train_loss=0.05916875600814819, time_cost=3.717273473739624
+
Steps: 1%| | 6625/1000000 [16:56:35<2627:39:21, 9.52s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%| | 6626/1000000 [16:56:42<2442:41:21, 8.85s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [6626], local_loss=0.04441307857632637, train_loss=0.07318141311407089, time_cost=2.609823226928711
+
Steps: 1%| | 6626/1000000 [16:56:42<2442:41:21, 8.85s/it, lr=1e-5, step_loss=0.0444]
Steps: 1%| | 6627/1000000 [16:56:56<2809:03:23, 10.18s/it, lr=1e-5, step_loss=0.0444][RANK-0]: Step: [6627], local_loss=0.019893120974302292, train_loss=0.038091909140348434, time_cost=1.2352001667022705
+
Steps: 1%| | 6627/1000000 [16:56:56<2809:03:23, 10.18s/it, lr=1e-5, step_loss=0.0199]
Steps: 1%| | 6628/1000000 [16:57:09<3072:19:10, 11.13s/it, lr=1e-5, step_loss=0.0199][RANK-0]: Step: [6628], local_loss=0.017794225364923477, train_loss=0.14358513057231903, time_cost=4.860412120819092
+
Steps: 1%| | 6628/1000000 [16:57:09<3072:19:10, 11.13s/it, lr=1e-5, step_loss=0.0178]
Steps: 1%| | 6629/1000000 [16:57:22<3213:37:18, 11.65s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [6629], local_loss=0.01822804845869541, train_loss=0.03191080689430237, time_cost=2.712538719177246
+
Steps: 1%| | 6629/1000000 [16:57:22<3213:37:18, 11.65s/it, lr=1e-5, step_loss=0.0182]
Steps: 1%| | 6630/1000000 [16:57:35<3380:44:59, 12.25s/it, lr=1e-5, step_loss=0.0182][RANK-0]: Step: [6630], local_loss=0.00571295153349638, train_loss=0.043037306517362595, time_cost=5.8415846824646
+
Steps: 1%| | 6630/1000000 [16:57:35<3380:44:59, 12.25s/it, lr=1e-5, step_loss=0.00571]
Steps: 1%| | 6631/1000000 [16:57:47<3299:28:42, 11.96s/it, lr=1e-5, step_loss=0.00571][RANK-0]: Step: [6631], local_loss=0.1422063410282135, train_loss=0.054277896881103516, time_cost=1.8572947978973389
+
Steps: 1%| | 6631/1000000 [16:57:47<3299:28:42, 11.96s/it, lr=1e-5, step_loss=0.142]
Steps: 1%| | 6632/1000000 [16:58:01<3465:49:36, 12.56s/it, lr=1e-5, step_loss=0.142][RANK-0]: Step: [6632], local_loss=0.02227143757045269, train_loss=0.03360578417778015, time_cost=5.727664470672607
+
Steps: 1%| | 6632/1000000 [16:58:01<3465:49:36, 12.56s/it, lr=1e-5, step_loss=0.0223]
Steps: 1%| | 6633/1000000 [16:58:06<2855:42:11, 10.35s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [6633], local_loss=0.007028248161077499, train_loss=0.08391441404819489, time_cost=1.2237918376922607
+
Steps: 1%| | 6633/1000000 [16:58:06<2855:42:11, 10.35s/it, lr=1e-5, step_loss=0.00703]
Steps: 1%| | 6634/1000000 [16:58:13<2596:34:46, 9.41s/it, lr=1e-5, step_loss=0.00703][RANK-0]: Step: [6634], local_loss=0.03352458029985428, train_loss=0.08678629994392395, time_cost=3.962498664855957
+
Steps: 1%| | 6634/1000000 [16:58:13<2596:34:46, 9.41s/it, lr=1e-5, step_loss=0.0335]
Steps: 1%| | 6635/1000000 [16:58:20<2392:57:22, 8.67s/it, lr=1e-5, step_loss=0.0335][RANK-0]: Step: [6635], local_loss=0.013142356649041176, train_loss=0.0395536869764328, time_cost=2.122070550918579
+
Steps: 1%| | 6635/1000000 [16:58:20<2392:57:22, 8.67s/it, lr=1e-5, step_loss=0.0131]
Steps: 1%| | 6636/1000000 [16:58:27<2252:03:36, 8.16s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [6636], local_loss=1.002172827720642, train_loss=0.18472960591316223, time_cost=2.88983416557312
+
Steps: 1%| | 6636/1000000 [16:58:27<2252:03:36, 8.16s/it, lr=1e-5, step_loss=1]
Steps: 1%| | 6637/1000000 [16:58:33<2039:23:21, 7.39s/it, lr=1e-5, step_loss=1][RANK-0]: Step: [6637], local_loss=0.008188582956790924, train_loss=0.042206309735774994, time_cost=1.5545589923858643
+
Steps: 1%| | 6637/1000000 [16:58:33<2039:23:21, 7.39s/it, lr=1e-5, step_loss=0.00819]
Steps: 1%| | 6638/1000000 [16:58:47<2628:08:32, 9.52s/it, lr=1e-5, step_loss=0.00819][RANK-0]: Step: [6638], local_loss=0.011701860465109348, train_loss=0.020169351249933243, time_cost=2.6641793251037598
+
Steps: 1%| | 6638/1000000 [16:58:47<2628:08:32, 9.52s/it, lr=1e-5, step_loss=0.0117]
Steps: 1%| | 6639/1000000 [16:59:02<3085:33:50, 11.18s/it, lr=1e-5, step_loss=0.0117][RANK-0]: Step: [6639], local_loss=0.010881747119128704, train_loss=0.014848717488348484, time_cost=8.444073915481567
+
Steps: 1%| | 6639/1000000 [16:59:02<3085:33:50, 11.18s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%| | 6640/1000000 [16:59:08<2622:15:39, 9.50s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [6640], local_loss=0.05560408532619476, train_loss=0.054458390921354294, time_cost=1.231574535369873
+
Steps: 1%| | 6640/1000000 [16:59:08<2622:15:39, 9.50s/it, lr=1e-5, step_loss=0.0556]
Steps: 1%| | 6641/1000000 [16:59:19<2767:55:05, 10.03s/it, lr=1e-5, step_loss=0.0556][RANK-0]: Step: [6641], local_loss=0.0802425891160965, train_loss=0.07580288499593735, time_cost=3.5077614784240723
+
Steps: 1%| | 6641/1000000 [16:59:19<2767:55:05, 10.03s/it, lr=1e-5, step_loss=0.0802]
Steps: 1%| | 6642/1000000 [16:59:30<2860:41:31, 10.37s/it, lr=1e-5, step_loss=0.0802][RANK-0]: Step: [6642], local_loss=0.21063262224197388, train_loss=0.07538727670907974, time_cost=2.4610788822174072
+
Steps: 1%| | 6642/1000000 [16:59:30<2860:41:31, 10.37s/it, lr=1e-5, step_loss=0.211]
[Training log, RANK-0: steps 6643–6865 of 1000000 (~1% complete), elapsed 16:59:47–17:33:10, lr=1e-5, ~6–13 s/it. local_loss mostly 0.007–0.10 with occasional spikes (0.67 at step 6647, ~1.0 at steps 6737, 6757, and 6786); train_loss mostly 0.02–0.25 with rare outliers (10.45 at 6656, 11.28 at 6741, 8.68 at 6786, 20.87 at 6787, 23.59 at 6796); per-step time_cost 1.2–14.1 s.]
Steps: 1%| | 6866/1000000 [17:33:17<2721:07:14, 9.86s/it, lr=1e-5, step_loss=0.00647][RANK-0]: Step: [6866], local_loss=0.02159130573272705, train_loss=0.030038882046937943, time_cost=2.0030853748321533
+
Steps: 1%| | 6866/1000000 [17:33:17<2721:07:14, 9.86s/it, lr=1e-5, step_loss=0.0216]
Steps: 1%| | 6867/1000000 [17:33:22<2320:23:29, 8.41s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [6867], local_loss=0.01768806204199791, train_loss=0.16985082626342773, time_cost=2.227703809738159
+
Steps: 1%| | 6867/1000000 [17:33:22<2320:23:29, 8.41s/it, lr=1e-5, step_loss=0.0177]
Steps: 1%| | 6868/1000000 [17:33:27<1985:42:36, 7.20s/it, lr=1e-5, step_loss=0.0177][RANK-0]: Step: [6868], local_loss=0.04889053851366043, train_loss=0.035193994641304016, time_cost=1.2423980236053467
+
Steps: 1%| | 6868/1000000 [17:33:27<1985:42:36, 7.20s/it, lr=1e-5, step_loss=0.0489]
Steps: 1%| | 6869/1000000 [17:33:33<1908:26:00, 6.92s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [6869], local_loss=0.019337717443704605, train_loss=0.10118125379085541, time_cost=2.201634645462036
+
Steps: 1%| | 6869/1000000 [17:33:33<1908:26:00, 6.92s/it, lr=1e-5, step_loss=0.0193]
Steps: 1%| | 6870/1000000 [17:33:47<2548:29:16, 9.24s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [6870], local_loss=0.015088644810020924, train_loss=0.03725186362862587, time_cost=3.784148931503296
+
Steps: 1%| | 6870/1000000 [17:33:47<2548:29:16, 9.24s/it, lr=1e-5, step_loss=0.0151]
Steps: 1%| | 6871/1000000 [17:33:53<2230:52:18, 8.09s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [6871], local_loss=0.17412583529949188, train_loss=0.039312947541475296, time_cost=1.482316493988037
+
Steps: 1%| | 6871/1000000 [17:33:53<2230:52:18, 8.09s/it, lr=1e-5, step_loss=0.174]
Steps: 1%| | 6872/1000000 [17:33:57<1915:31:18, 6.94s/it, lr=1e-5, step_loss=0.174][RANK-0]: Step: [6872], local_loss=0.06549019366502762, train_loss=0.03877493739128113, time_cost=1.3624389171600342
+
Steps: 1%| | 6872/1000000 [17:33:57<1915:31:18, 6.94s/it, lr=1e-5, step_loss=0.0655]
Steps: 1%| | 6873/1000000 [17:34:03<1830:04:08, 6.63s/it, lr=1e-5, step_loss=0.0655][RANK-0]: Step: [6873], local_loss=0.029977289959788322, train_loss=0.050249166786670685, time_cost=3.0089170932769775
+
Steps: 1%| | 6873/1000000 [17:34:03<1830:04:08, 6.63s/it, lr=1e-5, step_loss=0.03]
Steps: 1%| | 6874/1000000 [17:34:16<2337:33:58, 8.47s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [6874], local_loss=0.007489436771720648, train_loss=0.026374008506536484, time_cost=5.611858367919922
+
Steps: 1%| | 6874/1000000 [17:34:16<2337:33:58, 8.47s/it, lr=1e-5, step_loss=0.00749]
Steps: 1%| | 6875/1000000 [17:34:21<2044:59:14, 7.41s/it, lr=1e-5, step_loss=0.00749][RANK-0]: Step: [6875], local_loss=0.013347245752811432, train_loss=0.05035748705267906, time_cost=2.325599193572998
+
Steps: 1%| | 6875/1000000 [17:34:21<2044:59:14, 7.41s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%| | 6876/1000000 [17:34:25<1781:15:48, 6.46s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [6876], local_loss=0.027142899110913277, train_loss=0.06507591903209686, time_cost=1.7090880870819092
+
Steps: 1%| | 6876/1000000 [17:34:25<1781:15:48, 6.46s/it, lr=1e-5, step_loss=0.0271]
Steps: 1%| | 6877/1000000 [17:34:37<2217:21:47, 8.04s/it, lr=1e-5, step_loss=0.0271][RANK-0]: Step: [6877], local_loss=0.09827346354722977, train_loss=0.04229169338941574, time_cost=2.6133856773376465
+
Steps: 1%| | 6877/1000000 [17:34:37<2217:21:47, 8.04s/it, lr=1e-5, step_loss=0.0983]
Steps: 1%| | 6878/1000000 [17:34:42<2024:03:40, 7.34s/it, lr=1e-5, step_loss=0.0983][RANK-0]: Step: [6878], local_loss=0.010718250647187233, train_loss=0.0203854963183403, time_cost=1.6767539978027344
+
Steps: 1%| | 6878/1000000 [17:34:42<2024:03:40, 7.34s/it, lr=1e-5, step_loss=0.0107]
Steps: 1%| | 6879/1000000 [17:34:48<1844:08:29, 6.68s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [6879], local_loss=0.010188471525907516, train_loss=0.0633741244673729, time_cost=2.161259412765503
+
Steps: 1%| | 6879/1000000 [17:34:48<1844:08:29, 6.68s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 6880/1000000 [17:35:02<2471:04:52, 8.96s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [6880], local_loss=0.010148732922971249, train_loss=0.05612099915742874, time_cost=1.7651209831237793
+
Steps: 1%| | 6880/1000000 [17:35:02<2471:04:52, 8.96s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 6881/1000000 [17:35:11<2461:33:51, 8.92s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [6881], local_loss=0.022775599732995033, train_loss=0.015091029927134514, time_cost=2.476609706878662
+
Steps: 1%| | 6881/1000000 [17:35:11<2461:33:51, 8.92s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%| | 6882/1000000 [17:35:17<2202:59:27, 7.99s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [6882], local_loss=0.014372729696333408, train_loss=0.02753928303718567, time_cost=1.2939434051513672
+
Steps: 1%| | 6882/1000000 [17:35:17<2202:59:27, 7.99s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 6883/1000000 [17:35:24<2161:06:13, 7.83s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [6883], local_loss=0.10996665805578232, train_loss=0.05805816501379013, time_cost=3.0823793411254883
+
Steps: 1%| | 6883/1000000 [17:35:24<2161:06:13, 7.83s/it, lr=1e-5, step_loss=0.11]
Steps: 1%| | 6884/1000000 [17:35:37<2595:33:28, 9.41s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [6884], local_loss=0.08696107566356659, train_loss=0.11527099460363388, time_cost=5.283339977264404
+
Steps: 1%| | 6884/1000000 [17:35:37<2595:33:28, 9.41s/it, lr=1e-5, step_loss=0.087]
Steps: 1%| | 6885/1000000 [17:35:54<3180:45:30, 11.53s/it, lr=1e-5, step_loss=0.087][RANK-0]: Step: [6885], local_loss=0.0229045283049345, train_loss=0.03132934123277664, time_cost=6.889409780502319
+
Steps: 1%| | 6885/1000000 [17:35:54<3180:45:30, 11.53s/it, lr=1e-5, step_loss=0.0229]
Steps: 1%| | 6886/1000000 [17:36:07<3348:09:44, 12.14s/it, lr=1e-5, step_loss=0.0229][RANK-0]: Step: [6886], local_loss=0.029276352375745773, train_loss=0.019981693476438522, time_cost=3.5900299549102783
+
Steps: 1%| | 6886/1000000 [17:36:07<3348:09:44, 12.14s/it, lr=1e-5, step_loss=0.0293]
Steps: 1%| | 6887/1000000 [17:36:22<3609:01:05, 13.08s/it, lr=1e-5, step_loss=0.0293][RANK-0]: Step: [6887], local_loss=0.046016495674848557, train_loss=0.024925928562879562, time_cost=6.746020555496216
+
Steps: 1%| | 6887/1000000 [17:36:22<3609:01:05, 13.08s/it, lr=1e-5, step_loss=0.046]
Steps: 1%| | 6888/1000000 [17:36:31<3256:15:02, 11.80s/it, lr=1e-5, step_loss=0.046][RANK-0]: Step: [6888], local_loss=0.008823011070489883, train_loss=0.025878559798002243, time_cost=6.610310316085815
+
Steps: 1%| | 6888/1000000 [17:36:31<3256:15:02, 11.80s/it, lr=1e-5, step_loss=0.00882]
Steps: 1%| | 6889/1000000 [17:36:36<2695:30:24, 9.77s/it, lr=1e-5, step_loss=0.00882][RANK-0]: Step: [6889], local_loss=0.06912209093570709, train_loss=0.051689550280570984, time_cost=1.354400873184204
+
Steps: 1%| | 6889/1000000 [17:36:36<2695:30:24, 9.77s/it, lr=1e-5, step_loss=0.0691]
Steps: 1%| | 6890/1000000 [17:36:43<2458:00:50, 8.91s/it, lr=1e-5, step_loss=0.0691][RANK-0]: Step: [6890], local_loss=0.018719477578997612, train_loss=0.04487622156739235, time_cost=2.2958455085754395
+
Steps: 1%| | 6890/1000000 [17:36:43<2458:00:50, 8.91s/it, lr=1e-5, step_loss=0.0187]
Steps: 1%| | 6891/1000000 [17:36:52<2472:36:38, 8.96s/it, lr=1e-5, step_loss=0.0187][RANK-0]: Step: [6891], local_loss=0.021923387423157692, train_loss=0.03770630806684494, time_cost=1.8412377834320068
+
Steps: 1%| | 6891/1000000 [17:36:52<2472:36:38, 8.96s/it, lr=1e-5, step_loss=0.0219]
Steps: 1%| | 6892/1000000 [17:36:57<2120:42:33, 7.69s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [6892], local_loss=0.25252005457878113, train_loss=0.056621477007865906, time_cost=1.874178171157837
+
Steps: 1%| | 6892/1000000 [17:36:57<2120:42:33, 7.69s/it, lr=1e-5, step_loss=0.253]
Steps: 1%| | 6893/1000000 [17:37:06<2239:10:22, 8.12s/it, lr=1e-5, step_loss=0.253][RANK-0]: Step: [6893], local_loss=0.10748365521430969, train_loss=0.12155114114284515, time_cost=4.245368480682373
+
Steps: 1%| | 6893/1000000 [17:37:06<2239:10:22, 8.12s/it, lr=1e-5, step_loss=0.107]
Steps: 1%| | 6894/1000000 [17:37:15<2322:04:22, 8.42s/it, lr=1e-5, step_loss=0.107][RANK-0]: Step: [6894], local_loss=0.009274497628211975, train_loss=0.03212995082139969, time_cost=7.434499502182007
+
Steps: 1%| | 6894/1000000 [17:37:15<2322:04:22, 8.42s/it, lr=1e-5, step_loss=0.00927]
Steps: 1%| | 6895/1000000 [17:37:21<2088:07:39, 7.57s/it, lr=1e-5, step_loss=0.00927][RANK-0]: Step: [6895], local_loss=0.0484330914914608, train_loss=0.04751589149236679, time_cost=3.2339494228363037
+
Steps: 1%| | 6895/1000000 [17:37:21<2088:07:39, 7.57s/it, lr=1e-5, step_loss=0.0484]
Steps: 1%| | 6896/1000000 [17:37:28<2097:26:54, 7.60s/it, lr=1e-5, step_loss=0.0484][RANK-0]: Step: [6896], local_loss=0.008102868683636189, train_loss=0.09098480641841888, time_cost=4.3980393409729
+
Steps: 1%| | 6896/1000000 [17:37:28<2097:26:54, 7.60s/it, lr=1e-5, step_loss=0.0081]
Steps: 1%| | 6897/1000000 [17:37:36<2060:56:01, 7.47s/it, lr=1e-5, step_loss=0.0081][RANK-0]: Step: [6897], local_loss=0.05440080910921097, train_loss=0.08833396434783936, time_cost=2.961383104324341
+
Steps: 1%| | 6897/1000000 [17:37:36<2060:56:01, 7.47s/it, lr=1e-5, step_loss=0.0544]
Steps: 1%| | 6898/1000000 [17:37:40<1846:59:45, 6.70s/it, lr=1e-5, step_loss=0.0544][RANK-0]: Step: [6898], local_loss=0.013499466702342033, train_loss=0.10498876869678497, time_cost=1.9813902378082275
+
Steps: 1%| | 6898/1000000 [17:37:40<1846:59:45, 6.70s/it, lr=1e-5, step_loss=0.0135]
Steps: 1%| | 6899/1000000 [17:37:55<2508:00:45, 9.09s/it, lr=1e-5, step_loss=0.0135][RANK-0]: Step: [6899], local_loss=0.05573277547955513, train_loss=0.01930832490324974, time_cost=1.2208280563354492
+
Steps: 1%| | 6899/1000000 [17:37:55<2508:00:45, 9.09s/it, lr=1e-5, step_loss=0.0557]
Steps: 1%| | 6900/1000000 [17:38:07<2697:23:53, 9.78s/it, lr=1e-5, step_loss=0.0557][RANK-0]: Step: [6900], local_loss=0.010121345520019531, train_loss=0.03372061997652054, time_cost=1.2171077728271484
+
Steps: 1%| | 6900/1000000 [17:38:07<2697:23:53, 9.78s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 6901/1000000 [17:38:12<2323:05:40, 8.42s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [6901], local_loss=0.15614312887191772, train_loss=0.03751520812511444, time_cost=2.324216842651367
+
Steps: 1%| | 6901/1000000 [17:38:12<2323:05:40, 8.42s/it, lr=1e-5, step_loss=0.156]
Steps: 1%| | 6902/1000000 [17:38:22<2500:16:04, 9.06s/it, lr=1e-5, step_loss=0.156][RANK-0]: Step: [6902], local_loss=0.09436365216970444, train_loss=0.22341400384902954, time_cost=1.7314543724060059
+
Steps: 1%| | 6902/1000000 [17:38:22<2500:16:04, 9.06s/it, lr=1e-5, step_loss=0.0944]
Steps: 1%| | 6903/1000000 [17:38:34<2739:43:55, 9.93s/it, lr=1e-5, step_loss=0.0944][RANK-0]: Step: [6903], local_loss=0.018923457711935043, train_loss=0.02709321863949299, time_cost=4.424259185791016
+
Steps: 1%| | 6903/1000000 [17:38:34<2739:43:55, 9.93s/it, lr=1e-5, step_loss=0.0189]
Steps: 1%| | 6904/1000000 [17:38:39<2344:07:03, 8.50s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [6904], local_loss=0.027936188504099846, train_loss=0.038613490760326385, time_cost=1.3190350532531738
+
Steps: 1%| | 6904/1000000 [17:38:39<2344:07:03, 8.50s/it, lr=1e-5, step_loss=0.0279]
Steps: 1%| | 6905/1000000 [17:38:44<2054:32:32, 7.45s/it, lr=1e-5, step_loss=0.0279][RANK-0]: Step: [6905], local_loss=0.034227702766656876, train_loss=0.05413922294974327, time_cost=2.241088390350342
+
Steps: 1%| | 6905/1000000 [17:38:44<2054:32:32, 7.45s/it, lr=1e-5, step_loss=0.0342]
Steps: 1%| | 6906/1000000 [17:38:57<2508:49:33, 9.09s/it, lr=1e-5, step_loss=0.0342][RANK-0]: Step: [6906], local_loss=0.020762119442224503, train_loss=0.0637424886226654, time_cost=3.475804090499878
+
Steps: 1%| | 6906/1000000 [17:38:57<2508:49:33, 9.09s/it, lr=1e-5, step_loss=0.0208]
Steps: 1%| | 6907/1000000 [17:39:06<2482:50:25, 9.00s/it, lr=1e-5, step_loss=0.0208][RANK-0]: Step: [6907], local_loss=0.03949311748147011, train_loss=0.028407059609889984, time_cost=2.7083144187927246
+
Steps: 1%| | 6907/1000000 [17:39:06<2482:50:25, 9.00s/it, lr=1e-5, step_loss=0.0395]
Steps: 1%| | 6908/1000000 [17:39:20<2864:01:17, 10.38s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [6908], local_loss=0.09514814615249634, train_loss=0.03994220495223999, time_cost=4.304005861282349
+
Steps: 1%| | 6908/1000000 [17:39:20<2864:01:17, 10.38s/it, lr=1e-5, step_loss=0.0951]
Steps: 1%| | 6909/1000000 [17:39:28<2672:52:59, 9.69s/it, lr=1e-5, step_loss=0.0951][RANK-0]: Step: [6909], local_loss=0.01180045586079359, train_loss=0.020150993019342422, time_cost=1.2235612869262695
+
Steps: 1%| | 6909/1000000 [17:39:28<2672:52:59, 9.69s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 6910/1000000 [17:39:33<2302:20:48, 8.35s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [6910], local_loss=0.021960025653243065, train_loss=0.048936329782009125, time_cost=1.5428712368011475
+
Steps: 1%| | 6910/1000000 [17:39:33<2302:20:48, 8.35s/it, lr=1e-5, step_loss=0.022]
Steps: 1%| | 6911/1000000 [17:39:48<2835:39:07, 10.28s/it, lr=1e-5, step_loss=0.022][RANK-0]: Step: [6911], local_loss=0.007255044765770435, train_loss=0.029398122802376747, time_cost=4.842298269271851
+
Steps: 1%| | 6911/1000000 [17:39:48<2835:39:07, 10.28s/it, lr=1e-5, step_loss=0.00726]
Steps: 1%| | 6912/1000000 [17:39:53<2447:36:06, 8.87s/it, lr=1e-5, step_loss=0.00726][RANK-0]: Step: [6912], local_loss=0.02375389076769352, train_loss=0.024306315928697586, time_cost=2.6558310985565186
+
Steps: 1%| | 6912/1000000 [17:39:53<2447:36:06, 8.87s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%| | 6913/1000000 [17:40:08<2879:00:07, 10.44s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [6913], local_loss=0.08935798704624176, train_loss=0.06713150441646576, time_cost=1.4063425064086914
+
Steps: 1%| | 6913/1000000 [17:40:08<2879:00:07, 10.44s/it, lr=1e-5, step_loss=0.0894]
Steps: 1%| | 6914/1000000 [17:40:17<2766:59:54, 10.03s/it, lr=1e-5, step_loss=0.0894][RANK-0]: Step: [6914], local_loss=0.014449656940996647, train_loss=0.03551694005727768, time_cost=1.2382326126098633
+
Steps: 1%| | 6914/1000000 [17:40:17<2766:59:54, 10.03s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 6915/1000000 [17:40:22<2397:19:41, 8.69s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [6915], local_loss=0.039540790021419525, train_loss=0.05028966814279556, time_cost=3.017080307006836
+
Steps: 1%| | 6915/1000000 [17:40:22<2397:19:41, 8.69s/it, lr=1e-5, step_loss=0.0395]
Steps: 1%| | 6916/1000000 [17:40:29<2270:51:06, 8.23s/it, lr=1e-5, step_loss=0.0395][RANK-0]: Step: [6916], local_loss=0.00927241612225771, train_loss=11.281510353088379, time_cost=1.313964605331421
+
Steps: 1%| | 6916/1000000 [17:40:29<2270:51:06, 8.23s/it, lr=1e-5, step_loss=0.00927]
Steps: 1%| | 6917/1000000 [17:40:39<2404:30:28, 8.72s/it, lr=1e-5, step_loss=0.00927][RANK-0]: Step: [6917], local_loss=0.029127880930900574, train_loss=0.048610806465148926, time_cost=1.7320544719696045
+
Steps: 1%| | 6917/1000000 [17:40:39<2404:30:28, 8.72s/it, lr=1e-5, step_loss=0.0291]
Steps: 1%| | 6918/1000000 [17:40:46<2215:19:19, 8.03s/it, lr=1e-5, step_loss=0.0291][RANK-0]: Step: [6918], local_loss=0.021378792822360992, train_loss=0.06844472140073776, time_cost=2.3114030361175537
+
Steps: 1%| | 6918/1000000 [17:40:46<2215:19:19, 8.03s/it, lr=1e-5, step_loss=0.0214]
Steps: 1%| | 6919/1000000 [17:40:54<2261:39:31, 8.20s/it, lr=1e-5, step_loss=0.0214][RANK-0]: Step: [6919], local_loss=0.021601103246212006, train_loss=0.037757791578769684, time_cost=2.393134117126465
+
Steps: 1%| | 6919/1000000 [17:40:54<2261:39:31, 8.20s/it, lr=1e-5, step_loss=0.0216]
Steps: 1%| | 6920/1000000 [17:41:02<2202:46:02, 7.99s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [6920], local_loss=0.015749456360936165, train_loss=0.03865259885787964, time_cost=1.5103888511657715
+
Steps: 1%| | 6920/1000000 [17:41:02<2202:46:02, 7.99s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 6921/1000000 [17:41:09<2154:54:04, 7.81s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [6921], local_loss=0.011900002136826515, train_loss=0.03615046292543411, time_cost=1.699906826019287
+
Steps: 1%| | 6921/1000000 [17:41:09<2154:54:04, 7.81s/it, lr=1e-5, step_loss=0.0119]
Steps: 1%| | 6922/1000000 [17:41:14<1887:26:25, 6.84s/it, lr=1e-5, step_loss=0.0119][RANK-0]: Step: [6922], local_loss=0.01259669754654169, train_loss=0.0395340621471405, time_cost=1.2977488040924072
+
Steps: 1%| | 6922/1000000 [17:41:14<1887:26:25, 6.84s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%| | 6923/1000000 [17:41:22<2031:47:45, 7.37s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [6923], local_loss=0.010789044201374054, train_loss=0.16642136871814728, time_cost=2.6493983268737793
+
Steps: 1%| | 6923/1000000 [17:41:22<2031:47:45, 7.37s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%| | 6924/1000000 [17:41:36<2562:39:55, 9.29s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [6924], local_loss=0.01850929483771324, train_loss=0.040378388017416, time_cost=4.469793796539307
+
Steps: 1%| | 6924/1000000 [17:41:36<2562:39:55, 9.29s/it, lr=1e-5, step_loss=0.0185]
Steps: 1%| | 6925/1000000 [17:41:41<2190:30:02, 7.94s/it, lr=1e-5, step_loss=0.0185][RANK-0]: Step: [6925], local_loss=0.016146568581461906, train_loss=0.026279523968696594, time_cost=1.7996091842651367
+
Steps: 1%| | 6925/1000000 [17:41:41<2190:30:02, 7.94s/it, lr=1e-5, step_loss=0.0161]
Steps: 1%| | 6926/1000000 [17:41:49<2198:02:01, 7.97s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [6926], local_loss=0.025137223303318024, train_loss=0.02815529890358448, time_cost=3.7706029415130615
+
Steps: 1%| | 6926/1000000 [17:41:49<2198:02:01, 7.97s/it, lr=1e-5, step_loss=0.0251]
Steps: 1%| | 6927/1000000 [17:41:59<2405:49:24, 8.72s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [6927], local_loss=0.07584801316261292, train_loss=0.029274355620145798, time_cost=5.0206193923950195
+
Steps: 1%| | 6927/1000000 [17:41:59<2405:49:24, 8.72s/it, lr=1e-5, step_loss=0.0758]
Steps: 1%| | 6928/1000000 [17:42:05<2164:58:56, 7.85s/it, lr=1e-5, step_loss=0.0758][RANK-0]: Step: [6928], local_loss=0.013219518586993217, train_loss=0.0324544683098793, time_cost=1.3245742321014404
+
Steps: 1%| | 6928/1000000 [17:42:05<2164:58:56, 7.85s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%| | 6929/1000000 [17:42:20<2722:42:11, 9.87s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [6929], local_loss=0.03895765542984009, train_loss=0.05422189086675644, time_cost=1.243450403213501
+
Steps: 1%| | 6929/1000000 [17:42:20<2722:42:11, 9.87s/it, lr=1e-5, step_loss=0.039]
Steps: 1%| | 6930/1000000 [17:42:34<3077:28:25, 11.16s/it, lr=1e-5, step_loss=0.039][RANK-0]: Step: [6930], local_loss=0.014968039467930794, train_loss=5.69140100479126, time_cost=4.7733752727508545
+
Steps: 1%| | 6930/1000000 [17:42:34<3077:28:25, 11.16s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 6931/1000000 [17:42:45<3069:00:43, 11.13s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [6931], local_loss=0.058398667722940445, train_loss=0.02556050941348076, time_cost=3.0566084384918213
+
Steps: 1%| | 6931/1000000 [17:42:45<3069:00:43, 11.13s/it, lr=1e-5, step_loss=0.0584]
Steps: 1%| | 6932/1000000 [17:42:58<3224:25:25, 11.69s/it, lr=1e-5, step_loss=0.0584][RANK-0]: Step: [6932], local_loss=0.49697345495224, train_loss=24.36406135559082, time_cost=6.250986576080322
+
Steps: 1%| | 6932/1000000 [17:42:58<3224:25:25, 11.69s/it, lr=1e-5, step_loss=0.497]
Steps: 1%| | 6933/1000000 [17:43:14<3577:30:16, 12.97s/it, lr=1e-5, step_loss=0.497][RANK-0]: Step: [6933], local_loss=0.05577310547232628, train_loss=0.030638180673122406, time_cost=7.170778512954712
+
Steps: 1%| | 6933/1000000 [17:43:14<3577:30:16, 12.97s/it, lr=1e-5, step_loss=0.0558]
Steps: 1%| | 6934/1000000 [17:43:21<3118:52:12, 11.31s/it, lr=1e-5, step_loss=0.0558][RANK-0]: Step: [6934], local_loss=0.03193088620901108, train_loss=0.0286547914147377, time_cost=2.888017177581787
+
Steps: 1%| | 6934/1000000 [17:43:21<3118:52:12, 11.31s/it, lr=1e-5, step_loss=0.0319]
Steps: 1%| | 6935/1000000 [17:43:27<2685:17:09, 9.73s/it, lr=1e-5, step_loss=0.0319][RANK-0]: Step: [6935], local_loss=0.008593875914812088, train_loss=0.028456050902605057, time_cost=4.3527445793151855
+
Steps: 1%| | 6935/1000000 [17:43:27<2685:17:09, 9.73s/it, lr=1e-5, step_loss=0.00859]
Steps: 1%| | 6936/1000000 [17:43:36<2549:40:32, 9.24s/it, lr=1e-5, step_loss=0.00859][RANK-0]: Step: [6936], local_loss=0.019584551453590393, train_loss=0.04052860662341118, time_cost=1.2323336601257324
+
Steps: 1%| | 6936/1000000 [17:43:36<2549:40:32, 9.24s/it, lr=1e-5, step_loss=0.0196]
Steps: 1%| | 6937/1000000 [17:43:43<2408:31:35, 8.73s/it, lr=1e-5, step_loss=0.0196][RANK-0]: Step: [6937], local_loss=0.013896086253225803, train_loss=0.03642557933926582, time_cost=3.067596673965454
+
Steps: 1%| | 6937/1000000 [17:43:43<2408:31:35, 8.73s/it, lr=1e-5, step_loss=0.0139]
Steps: 1%| | 6938/1000000 [17:43:51<2353:13:01, 8.53s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [6938], local_loss=0.06285163015127182, train_loss=0.022292250767350197, time_cost=1.8957798480987549
+
Steps: 1%| | 6938/1000000 [17:43:51<2353:13:01, 8.53s/it, lr=1e-5, step_loss=0.0629]
Steps: 1%| | 6939/1000000 [17:43:57<2095:22:26, 7.60s/it, lr=1e-5, step_loss=0.0629][RANK-0]: Step: [6939], local_loss=0.007098644506186247, train_loss=0.044369153678417206, time_cost=2.695368766784668
+
Steps: 1%| | 6939/1000000 [17:43:57<2095:22:26, 7.60s/it, lr=1e-5, step_loss=0.0071]
Steps: 1%| | 6940/1000000 [17:44:03<1965:55:56, 7.13s/it, lr=1e-5, step_loss=0.0071][RANK-0]: Step: [6940], local_loss=0.018761884421110153, train_loss=0.08182068169116974, time_cost=2.3608736991882324
+
Steps: 1%| | 6940/1000000 [17:44:03<1965:55:56, 7.13s/it, lr=1e-5, step_loss=0.0188]
Steps: 1%| | 6941/1000000 [17:44:13<2261:31:27, 8.20s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [6941], local_loss=0.041636597365140915, train_loss=0.041126444935798645, time_cost=1.6081254482269287
+
Steps: 1%| | 6941/1000000 [17:44:13<2261:31:27, 8.20s/it, lr=1e-5, step_loss=0.0416]
Steps: 1%| | 6942/1000000 [17:44:19<2038:59:09, 7.39s/it, lr=1e-5, step_loss=0.0416][RANK-0]: Step: [6942], local_loss=0.018096303567290306, train_loss=0.078385129570961, time_cost=3.94840145111084
+
Steps: 1%| | 6942/1000000 [17:44:19<2038:59:09, 7.39s/it, lr=1e-5, step_loss=0.0181]
Steps: 1%| | 6943/1000000 [17:44:27<2077:45:32, 7.53s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [6943], local_loss=0.03411080315709114, train_loss=0.051239460706710815, time_cost=2.9543533325195312
+
Steps: 1%| | 6943/1000000 [17:44:27<2077:45:32, 7.53s/it, lr=1e-5, step_loss=0.0341]
Steps: 1%| | 6944/1000000 [17:44:32<1936:26:09, 7.02s/it, lr=1e-5, step_loss=0.0341][RANK-0]: Step: [6944], local_loss=0.02043112739920616, train_loss=0.03127512335777283, time_cost=1.4679770469665527
+
Steps: 1%| | 6944/1000000 [17:44:32<1936:26:09, 7.02s/it, lr=1e-5, step_loss=0.0204]
Steps: 1%| | 6945/1000000 [17:44:44<2272:47:16, 8.24s/it, lr=1e-5, step_loss=0.0204][RANK-0]: Step: [6945], local_loss=0.0467437282204628, train_loss=0.03465447202324867, time_cost=5.965698480606079
+
Steps: 1%| | 6945/1000000 [17:44:44<2272:47:16, 8.24s/it, lr=1e-5, step_loss=0.0467]
Steps: 1%| | 6946/1000000 [17:44:51<2207:12:56, 8.00s/it, lr=1e-5, step_loss=0.0467][RANK-0]: Step: [6946], local_loss=0.0660676509141922, train_loss=17.284202575683594, time_cost=1.5764803886413574
+
Steps: 1%| | 6946/1000000 [17:44:51<2207:12:56, 8.00s/it, lr=1e-5, step_loss=0.0661]
Steps: 1%| | 6947/1000000 [17:45:02<2461:31:25, 8.92s/it, lr=1e-5, step_loss=0.0661][RANK-0]: Step: [6947], local_loss=0.07186532020568848, train_loss=0.06340109556913376, time_cost=3.8936989307403564
+
Steps: 1%| | 6947/1000000 [17:45:02<2461:31:25, 8.92s/it, lr=1e-5, step_loss=0.0719]
Steps: 1%| | 6948/1000000 [17:45:18<3011:12:11, 10.92s/it, lr=1e-5, step_loss=0.0719][RANK-0]: Step: [6948], local_loss=0.023546848446130753, train_loss=0.02014532871544361, time_cost=6.760504245758057
+
Steps: 1%| | 6948/1000000 [17:45:18<3011:12:11, 10.92s/it, lr=1e-5, step_loss=0.0235]
Steps: 1%| | 6949/1000000 [17:45:30<3165:38:12, 11.48s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [6949], local_loss=0.018002144992351532, train_loss=0.11438412964344025, time_cost=3.7519309520721436
+
Steps: 1%| | 6949/1000000 [17:45:30<3165:38:12, 11.48s/it, lr=1e-5, step_loss=0.018]
Steps: 1%| | 6950/1000000 [17:45:36<2687:51:11, 9.74s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [6950], local_loss=0.0607910230755806, train_loss=0.05738551914691925, time_cost=1.270273208618164
+
Steps: 1%| | 6950/1000000 [17:45:36<2687:51:11, 9.74s/it, lr=1e-5, step_loss=0.0608]
Steps: 1%| | 6951/1000000 [17:45:41<2292:27:33, 8.31s/it, lr=1e-5, step_loss=0.0608][RANK-0]: Step: [6951], local_loss=0.008569163270294666, train_loss=0.05797558277845383, time_cost=1.8750617504119873
+
Steps: 1%| | 6951/1000000 [17:45:41<2292:27:33, 8.31s/it, lr=1e-5, step_loss=0.00857]
Steps: 1%| | 6952/1000000 [17:45:46<2016:09:29, 7.31s/it, lr=1e-5, step_loss=0.00857][RANK-0]: Step: [6952], local_loss=0.03132518380880356, train_loss=0.028437355533242226, time_cost=1.2420268058776855
+
Steps: 1%| | 6952/1000000 [17:45:46<2016:09:29, 7.31s/it, lr=1e-5, step_loss=0.0313]
Steps: 1%| | 6953/1000000 [17:45:53<1954:31:21, 7.09s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [6953], local_loss=0.011518729850649834, train_loss=0.05280575901269913, time_cost=4.113284111022949
+
Steps: 1%| | 6953/1000000 [17:45:53<1954:31:21, 7.09s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%| | 6954/1000000 [17:46:08<2616:49:23, 9.49s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [6954], local_loss=0.03646045923233032, train_loss=0.03005761280655861, time_cost=11.298124313354492
+
Steps: 1%| | 6954/1000000 [17:46:08<2616:49:23, 9.49s/it, lr=1e-5, step_loss=0.0365]
Steps: 1%| | 6955/1000000 [17:46:13<2255:52:17, 8.18s/it, lr=1e-5, step_loss=0.0365][RANK-0]: Step: [6955], local_loss=0.0741388127207756, train_loss=0.023668695241212845, time_cost=2.576387405395508
+
Steps: 1%| | 6955/1000000 [17:46:13<2255:52:17, 8.18s/it, lr=1e-5, step_loss=0.0741]
Steps: 1%| | 6956/1000000 [17:46:29<2875:02:34, 10.42s/it, lr=1e-5, step_loss=0.0741][RANK-0]: Step: [6956], local_loss=0.009984932839870453, train_loss=0.03966118395328522, time_cost=8.078571557998657
+
Steps: 1%| | 6956/1000000 [17:46:29<2875:02:34, 10.42s/it, lr=1e-5, step_loss=0.00998]
Steps: 1%| | 6957/1000000 [17:46:38<2807:24:09, 10.18s/it, lr=1e-5, step_loss=0.00998][RANK-0]: Step: [6957], local_loss=0.014339610002934933, train_loss=0.06585286557674408, time_cost=1.532322883605957
+
Steps: 1%| | 6957/1000000 [17:46:38<2807:24:09, 10.18s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%| | 6958/1000000 [17:46:48<2812:35:09, 10.20s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [6958], local_loss=0.025174714624881744, train_loss=0.03338020294904709, time_cost=1.767254114151001
+
Steps: 1%| | 6958/1000000 [17:46:48<2812:35:09, 10.20s/it, lr=1e-5, step_loss=0.0252]
Steps: 1%| | 6959/1000000 [17:47:04<3239:04:03, 11.74s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [6959], local_loss=0.4012218117713928, train_loss=0.06362275779247284, time_cost=6.674524307250977
+
Steps: 1%| | 6959/1000000 [17:47:04<3239:04:03, 11.74s/it, lr=1e-5, step_loss=0.401]
Steps: 1%| | 6960/1000000 [17:47:12<2981:55:30, 10.81s/it, lr=1e-5, step_loss=0.401][RANK-0]: Step: [6960], local_loss=0.022176343947649002, train_loss=0.017375599592924118, time_cost=3.4288947582244873
+
Steps: 1%| | 6960/1000000 [17:47:12<2981:55:30, 10.81s/it, lr=1e-5, step_loss=0.0222]
Steps: 1%| | 6961/1000000 [17:47:21<2832:12:43, 10.27s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [6961], local_loss=0.007185132242739201, train_loss=0.274164080619812, time_cost=1.3827035427093506
+
Steps: 1%| | 6961/1000000 [17:47:21<2832:12:43, 10.27s/it, lr=1e-5, step_loss=0.00719]
Steps: 1%| | 6962/1000000 [17:47:31<2753:24:15, 9.98s/it, lr=1e-5, step_loss=0.00719][RANK-0]: Step: [6962], local_loss=0.07425550371408463, train_loss=0.034839652478694916, time_cost=2.362337112426758
+
Steps: 1%| | 6962/1000000 [17:47:31<2753:24:15, 9.98s/it, lr=1e-5, step_loss=0.0743]
Steps: 1%| | 6963/1000000 [17:47:41<2784:12:38, 10.09s/it, lr=1e-5, step_loss=0.0743][RANK-0]: Step: [6963], local_loss=0.02582462690770626, train_loss=0.05846475809812546, time_cost=5.5966901779174805
+
Steps: 1%| | 6963/1000000 [17:47:41<2784:12:38, 10.09s/it, lr=1e-5, step_loss=0.0258]
Steps: 1%| | 6964/1000000 [17:47:47<2416:15:01, 8.76s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [6964], local_loss=0.07263346016407013, train_loss=0.06709584593772888, time_cost=1.3215656280517578
+
Steps: 1%| | 6964/1000000 [17:47:47<2416:15:01, 8.76s/it, lr=1e-5, step_loss=0.0726]
Steps: 1%| | 6965/1000000 [17:47:58<2632:05:55, 9.54s/it, lr=1e-5, step_loss=0.0726][RANK-0]: Step: [6965], local_loss=0.026801500469446182, train_loss=0.02288280799984932, time_cost=1.2510900497436523
+
Steps: 1%| | 6965/1000000 [17:47:58<2632:05:55, 9.54s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%| | 6966/1000000 [17:48:03<2261:51:48, 8.20s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [6966], local_loss=0.014393900521099567, train_loss=0.043602436780929565, time_cost=2.0092246532440186
+
Steps: 1%| | 6966/1000000 [17:48:03<2261:51:48, 8.20s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 6967/1000000 [17:48:10<2186:06:28, 7.93s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [6967], local_loss=0.02322315238416195, train_loss=0.04190795123577118, time_cost=2.205498695373535
+
Steps: 1%| | 6967/1000000 [17:48:10<2186:06:28, 7.93s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%| | 6968/1000000 [17:48:22<2483:46:57, 9.00s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [6968], local_loss=0.0540369413793087, train_loss=0.03250127658247948, time_cost=8.522623777389526
+
Steps: 1%| | 6968/1000000 [17:48:22<2483:46:57, 9.00s/it, lr=1e-5, step_loss=0.054]
Steps: 1%| | 6969/1000000 [17:48:35<2847:04:06, 10.32s/it, lr=1e-5, step_loss=0.054][RANK-0]: Step: [6969], local_loss=0.03440761938691139, train_loss=0.0301599632948637, time_cost=1.2481169700622559
+
Steps: 1%| | 6969/1000000 [17:48:35<2847:04:06, 10.32s/it, lr=1e-5, step_loss=0.0344]
Steps: 1%| | 6970/1000000 [17:48:43<2627:44:03, 9.53s/it, lr=1e-5, step_loss=0.0344][RANK-0]: Step: [6970], local_loss=0.011772937141358852, train_loss=30.19060707092285, time_cost=2.2819817066192627
+
Steps: 1%| | 6970/1000000 [17:48:43<2627:44:03, 9.53s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 6971/1000000 [17:48:52<2582:55:10, 9.36s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [6971], local_loss=0.0434405691921711, train_loss=0.025894153863191605, time_cost=1.5503134727478027
+
Steps: 1%| | 6971/1000000 [17:48:52<2582:55:10, 9.36s/it, lr=1e-5, step_loss=0.0434]
Steps: 1%| | 6972/1000000 [17:48:59<2398:32:18, 8.70s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [6972], local_loss=0.06352634727954865, train_loss=0.07388681918382645, time_cost=4.771777868270874
+
Steps: 1%| | 6972/1000000 [17:48:59<2398:32:18, 8.70s/it, lr=1e-5, step_loss=0.0635]
Steps: 1%| | 6973/1000000 [17:49:12<2743:53:22, 9.95s/it, lr=1e-5, step_loss=0.0635][RANK-0]: Step: [6973], local_loss=0.05033326521515846, train_loss=0.02271203324198723, time_cost=2.853304386138916
+
Steps: 1%| | 6973/1000000 [17:49:12<2743:53:22, 9.95s/it, lr=1e-5, step_loss=0.0503]
Steps: 1%| | 6974/1000000 [17:49:21<2703:11:38, 9.80s/it, lr=1e-5, step_loss=0.0503][RANK-0]: Step: [6974], local_loss=0.01326513011008501, train_loss=0.038129229098558426, time_cost=3.3307759761810303
+
Steps: 1%| | 6974/1000000 [17:49:21<2703:11:38, 9.80s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%| | 6975/1000000 [17:49:28<2449:32:09, 8.88s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [6975], local_loss=0.11414013057947159, train_loss=0.07364168018102646, time_cost=1.3397529125213623
+
Steps: 1%| | 6975/1000000 [17:49:28<2449:32:09, 8.88s/it, lr=1e-5, step_loss=0.114]
Steps: 1%| | 6976/1000000 [17:49:37<2445:36:58, 8.87s/it, lr=1e-5, step_loss=0.114][RANK-0]: Step: [6976], local_loss=0.06056734174489975, train_loss=0.04611736536026001, time_cost=1.2439355850219727
+
Steps: 1%| | 6976/1000000 [17:49:37<2445:36:58, 8.87s/it, lr=1e-5, step_loss=0.0606]
Steps: 1%| | 6977/1000000 [17:49:48<2612:53:30, 9.47s/it, lr=1e-5, step_loss=0.0606][RANK-0]: Step: [6977], local_loss=0.10339511185884476, train_loss=0.05664869770407677, time_cost=3.338621139526367
+
Steps: 1%| | 6977/1000000 [17:49:48<2612:53:30, 9.47s/it, lr=1e-5, step_loss=0.103]
Steps: 1%| | 6978/1000000 [17:49:55<2407:59:25, 8.73s/it, lr=1e-5, step_loss=0.103][RANK-0]: Step: [6978], local_loss=0.05832226946949959, train_loss=0.091529481112957, time_cost=1.251122236251831
+
Steps: 1%| | 6978/1000000 [17:49:55<2407:59:25, 8.73s/it, lr=1e-5, step_loss=0.0583]
Steps: 1%| | 6979/1000000 [17:50:03<2327:40:35, 8.44s/it, lr=1e-5, step_loss=0.0583][RANK-0]: Step: [6979], local_loss=0.035378918051719666, train_loss=0.07732762396335602, time_cost=2.680087089538574
+
Steps: 1%| | 6979/1000000 [17:50:03<2327:40:35, 8.44s/it, lr=1e-5, step_loss=0.0354]
Steps: 1%| | 6980/1000000 [17:50:08<2068:35:13, 7.50s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [6980], local_loss=0.05242123082280159, train_loss=0.04344600439071655, time_cost=2.335165500640869
+
Steps: 1%| | 6980/1000000 [17:50:08<2068:35:13, 7.50s/it, lr=1e-5, step_loss=0.0524]
Steps: 1%| | 6981/1000000 [17:50:21<2528:52:46, 9.17s/it, lr=1e-5, step_loss=0.0524][RANK-0]: Step: [6981], local_loss=0.06551704555749893, train_loss=0.04008990526199341, time_cost=3.3483381271362305
+
Steps: 1%| | 6981/1000000 [17:50:21<2528:52:46, 9.17s/it, lr=1e-5, step_loss=0.0655]
Steps: 1%| | 6982/1000000 [17:50:40<3315:33:06, 12.02s/it, lr=1e-5, step_loss=0.0655][RANK-0]: Step: [6982], local_loss=0.17939339578151703, train_loss=0.04562636464834213, time_cost=10.501171350479126
+
Steps: 1%| | 6982/1000000 [17:50:40<3315:33:06, 12.02s/it, lr=1e-5, step_loss=0.179]
Steps: 1%| | 6983/1000000 [17:50:45<2765:47:45, 10.03s/it, lr=1e-5, step_loss=0.179][RANK-0]: Step: [6983], local_loss=0.009830471128225327, train_loss=0.0374559685587883, time_cost=2.866204261779785
+
Steps: 1%| | 6983/1000000 [17:50:45<2765:47:45, 10.03s/it, lr=1e-5, step_loss=0.00983]
Steps: 1%| | 6984/1000000 [17:50:57<2888:42:21, 10.47s/it, lr=1e-5, step_loss=0.00983][RANK-0]: Step: [6984], local_loss=0.009627959690988064, train_loss=0.0205026064068079, time_cost=2.2975399494171143
+
Steps: 1%| | 6984/1000000 [17:50:57<2888:42:21, 10.47s/it, lr=1e-5, step_loss=0.00963]
Steps: 1%| | 6985/1000000 [17:51:11<3248:35:20, 11.78s/it, lr=1e-5, step_loss=0.00963][RANK-0]: Step: [6985], local_loss=0.01262403093278408, train_loss=0.0476880818605423, time_cost=5.709103584289551
+
Steps: 1%| | 6985/1000000 [17:51:11<3248:35:20, 11.78s/it, lr=1e-5, step_loss=0.0126]
Steps: 1%| | 6986/1000000 [17:51:24<3278:20:19, 11.89s/it, lr=1e-5, step_loss=0.0126][RANK-0]: Step: [6986], local_loss=0.010408367961645126, train_loss=0.16275134682655334, time_cost=1.2492575645446777
+
Steps: 1%| | 6986/1000000 [17:51:24<3278:20:19, 11.89s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%| | 6987/1000000 [17:51:37<3421:12:42, 12.40s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [6987], local_loss=0.12839391827583313, train_loss=0.06837262213230133, time_cost=6.5977113246917725
+
Steps: 1%| | 6987/1000000 [17:51:37<3421:12:42, 12.40s/it, lr=1e-5, step_loss=0.128]
Steps: 1%| | 6988/1000000 [17:51:51<3562:45:08, 12.92s/it, lr=1e-5, step_loss=0.128][RANK-0]: Step: [6988], local_loss=0.012145214714109898, train_loss=0.025912154465913773, time_cost=4.55828595161438
+
Steps: 1%| | 6988/1000000 [17:51:51<3562:45:08, 12.92s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%| | 6989/1000000 [17:51:56<2890:55:46, 10.48s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [6989], local_loss=0.03141215816140175, train_loss=0.01829233393073082, time_cost=2.0424320697784424
+
Steps: 1%| | 6989/1000000 [17:51:56<2890:55:46, 10.48s/it, lr=1e-5, step_loss=0.0314]
Steps: 1%| | 6990/1000000 [17:52:08<2984:22:55, 10.82s/it, lr=1e-5, step_loss=0.0314][RANK-0]: Step: [6990], local_loss=0.09571744501590729, train_loss=0.03258794546127319, time_cost=2.242363452911377
+
Steps: 1%| | 6990/1000000 [17:52:08<2984:22:55, 10.82s/it, lr=1e-5, step_loss=0.0957]
Steps: 1%| | 6991/1000000 [17:52:15<2718:51:31, 9.86s/it, lr=1e-5, step_loss=0.0957][RANK-0]: Step: [6991], local_loss=0.009163960814476013, train_loss=0.03229682892560959, time_cost=3.344128131866455
+
Steps: 1%| | 6991/1000000 [17:52:15<2718:51:31, 9.86s/it, lr=1e-5, step_loss=0.00916]
Steps: 1%| | 6992/1000000 [17:52:19<2245:08:16, 8.14s/it, lr=1e-5, step_loss=0.00916][RANK-0]: Step: [6992], local_loss=0.015151478350162506, train_loss=0.184251070022583, time_cost=1.353076696395874
+
Steps: 1%| | 6992/1000000 [17:52:19<2245:08:16, 8.14s/it, lr=1e-5, step_loss=0.0152]
Steps: 1%| | 6993/1000000 [17:52:30<2452:40:46, 8.89s/it, lr=1e-5, step_loss=0.0152][RANK-0]: Step: [6993], local_loss=0.009985337033867836, train_loss=0.03328306972980499, time_cost=3.0896008014678955
+
Steps: 1%| | 6993/1000000 [17:52:30<2452:40:46, 8.89s/it, lr=1e-5, step_loss=0.00999]
Steps: 1%| | 6994/1000000 [17:52:35<2140:36:35, 7.76s/it, lr=1e-5, step_loss=0.00999][RANK-0]: Step: [6994], local_loss=0.008041813969612122, train_loss=0.05868071690201759, time_cost=4.460784912109375
+
Steps: 1%| | 6994/1000000 [17:52:35<2140:36:35, 7.76s/it, lr=1e-5, step_loss=0.00804]
Steps: 1%| | 6995/1000000 [17:52:40<1930:07:30, 7.00s/it, lr=1e-5, step_loss=0.00804][RANK-0]: Step: [6995], local_loss=0.04060998186469078, train_loss=0.02971630170941353, time_cost=1.2301015853881836
+
Steps: 1%| | 6995/1000000 [17:52:40<1930:07:30, 7.00s/it, lr=1e-5, step_loss=0.0406]
Steps: 1%| | 6996/1000000 [17:52:56<2640:07:31, 9.57s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [6996], local_loss=0.021947013214230537, train_loss=0.061308786273002625, time_cost=6.898200035095215
+
Steps: 1%| | 6996/1000000 [17:52:56<2640:07:31, 9.57s/it, lr=1e-5, step_loss=0.0219]
Steps: 1%| | 6997/1000000 [17:53:10<3013:41:37, 10.93s/it, lr=1e-5, step_loss=0.0219][RANK-0]: Step: [6997], local_loss=0.08783361315727234, train_loss=0.04246646910905838, time_cost=4.6708598136901855
+
Steps: 1%| | 6997/1000000 [17:53:10<3013:41:37, 10.93s/it, lr=1e-5, step_loss=0.0878]
Steps: 1%| | 6998/1000000 [17:53:20<2945:22:34, 10.68s/it, lr=1e-5, step_loss=0.0878][RANK-0]: Step: [6998], local_loss=0.010809533298015594, train_loss=0.013733320869505405, time_cost=4.469052791595459
+
Steps: 1%| | 6998/1000000 [17:53:20<2945:22:34, 10.68s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%| | 6999/1000000 [17:53:25<2488:06:50, 9.02s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [6999], local_loss=0.04327655956149101, train_loss=0.04442542418837547, time_cost=1.995471715927124
+
Steps: 1%| | 6999/1000000 [17:53:25<2488:06:50, 9.02s/it, lr=1e-5, step_loss=0.0433]
Steps: 1%| | 7000/1000000 [17:53:42<3123:00:45, 11.32s/it, lr=1e-5, step_loss=0.0433][RANK-0]: Step: [7000], local_loss=0.03563021868467331, train_loss=0.11658035218715668, time_cost=8.362663507461548
+09/19/2024 17:03:35 - INFO - accelerate.accelerator - Saving current state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000
+09/19/2024 17:03:35 - INFO - accelerate.accelerator - Saving DeepSpeed Model and Optimizer
+[2024-09-19 17:03:35,110] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint pytorch_model is about to be saved!
+[2024-09-19 17:03:35,141] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/mp_rank_00_model_states.pt
+[2024-09-19 17:03:35,141] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/mp_rank_00_model_states.pt...
+[2024-09-19 17:03:54,084] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/mp_rank_00_model_states.pt.
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2024-09-19 17:03:54,095] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt...
+[2024-09-19 17:04:27,384] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:27,384] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:27,384] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:28,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:28,021] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:28,021] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:28,737] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:28,819] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:28,819] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:29,619] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:29,619] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:29,619] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:30,707] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:30,708] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:30,708] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:30,818] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:30,818] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:30,818] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:30,863] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:30,863] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:30,863] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+[2024-09-19 17:04:31,004] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.
+[2024-09-19 17:04:31,005] [INFO] [engine.py:3443:_save_zero_checkpoint] zero checkpoint saved /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
+[2024-09-19 17:04:31,005] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint pytorch_model is ready now!
+09/19/2024 17:04:31 - INFO - accelerate.accelerator - DeepSpeed Model and Optimizer saved to output dir /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/pytorch_model
+{'use_additional_conditions', 'norm_num_groups', 'dropout'} was not found in config. Values will be initialized to default values.
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/model_ema/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/model_ema/diffusion_pytorch_model.safetensors
+Configuration saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/model/config.json
+Model weights saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/model/diffusion_pytorch_model.safetensors
+09/19/2024 17:05:49 - INFO - accelerate.checkpointing - Scheduler state saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/scheduler.bin
+09/19/2024 17:05:49 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/sampler.bin
+09/19/2024 17:05:49 - INFO - accelerate.checkpointing - Random states saved in /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000/random_states_0.pkl
+09/19/2024 17:05:49 - INFO - __main__ - Saved state to /home/save_dir/runs/allinpaint_stage1_2/checkpoint-7000
+
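The block above is the standard `accelerate` + DeepSpeed ZeRO checkpoint sequence: every rank writes its own `bf16_zero_pp_rank_*_optim_states.pt` shard, after which the scheduler (`scheduler.bin`), dataloader sampler (`sampler.bin`), and RNG states (`random_states_0.pkl`) are serialized. A minimal sketch of the step that typically triggers such a save, assuming a conventional Accelerate training loop; the names `output_dir` and `checkpointing_steps` are illustrative, not taken from this repository:

```python
import os
from accelerate import Accelerator

def maybe_save_state(accelerator: Accelerator, output_dir: str,
                     global_step: int, checkpointing_steps: int = 1000) -> None:
    """Illustrative sketch of the checkpoint step logged above: every
    `checkpointing_steps` steps, call save_state on *every* rank so
    DeepSpeed can write its per-rank ZeRO optimizer shard
    (bf16_zero_pp_rank_*_optim_states.pt), which is why eight shards
    appear in the log. save_state also serializes the scheduler,
    sampler, and RNG states shown above."""
    if global_step % checkpointing_steps != 0:
        return
    save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
    accelerator.save_state(save_path)
```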
Steps: 1%| | 7000/1000000 [17:55:57<3123:00:45, 11.32s/it, lr=1e-5, step_loss=0.0356]
Steps: 1%| | 7001/1000000 [17:56:02<13780:59:14, 49.96s/it, lr=1e-5, step_loss=0.0356][RANK-0]: Step: [7001], local_loss=0.05063183605670929, train_loss=0.034559812396764755, time_cost=1.2523372173309326
+
Steps: 1%| | 7001/1000000 [17:56:02<13780:59:14, 49.96s/it, lr=1e-5, step_loss=0.0506]
Steps: 1%| | 7002/1000000 [17:56:10<10270:59:46, 37.24s/it, lr=1e-5, step_loss=0.0506][RANK-0]: Step: [7002], local_loss=0.21869152784347534, train_loss=0.06793021410703659, time_cost=1.4458305835723877
+
Steps: 1%| | 7002/1000000 [17:56:10<10270:59:46, 37.24s/it, lr=1e-5, step_loss=0.219]
Steps: 1%| | 7003/1000000 [17:56:15<7633:44:35, 27.68s/it, lr=1e-5, step_loss=0.219] [RANK-0]: Step: [7003], local_loss=0.040584396570920944, train_loss=19.725191116333008, time_cost=2.2893989086151123
+
Steps: 1%| | 7003/1000000 [17:56:15<7633:44:35, 27.68s/it, lr=1e-5, step_loss=0.0406]
Steps: 1%| | 7004/1000000 [17:56:26<6283:36:09, 22.78s/it, lr=1e-5, step_loss=0.0406][RANK-0]: Step: [7004], local_loss=0.05021002143621445, train_loss=0.04455719143152237, time_cost=4.902331113815308
+
Steps: 1%| | 7004/1000000 [17:56:26<6283:36:09, 22.78s/it, lr=1e-5, step_loss=0.0502]
Steps: 1%| | 7005/1000000 [17:56:34<4994:23:54, 18.11s/it, lr=1e-5, step_loss=0.0502][RANK-0]: Step: [7005], local_loss=0.0074712117202579975, train_loss=0.02862408012151718, time_cost=2.4423422813415527
+
Steps: 1%| | 7005/1000000 [17:56:34<4994:23:54, 18.11s/it, lr=1e-5, step_loss=0.00747]
Steps: 1%| | 7006/1000000 [17:56:45<4421:10:21, 16.03s/it, lr=1e-5, step_loss=0.00747][RANK-0]: Step: [7006], local_loss=0.05849923938512802, train_loss=38.72570037841797, time_cost=6.283507347106934
+
Steps: 1%| | 7006/1000000 [17:56:45<4421:10:21, 16.03s/it, lr=1e-5, step_loss=0.0585]
Steps: 1%| | 7007/1000000 [17:56:57<4110:47:01, 14.90s/it, lr=1e-5, step_loss=0.0585][RANK-0]: Step: [7007], local_loss=0.02716996893286705, train_loss=0.03938412666320801, time_cost=4.719211578369141
+
Steps: 1%| | 7007/1000000 [17:56:57<4110:47:01, 14.90s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%| | 7008/1000000 [17:57:02<3286:03:01, 11.91s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [7008], local_loss=0.016484275460243225, train_loss=0.031366776674985886, time_cost=2.474466562271118
+
Steps: 1%| | 7008/1000000 [17:57:02<3286:03:01, 11.91s/it, lr=1e-5, step_loss=0.0165]
Steps: 1%| | 7009/1000000 [17:57:16<3491:44:31, 12.66s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [7009], local_loss=0.05510980635881424, train_loss=0.15718616545200348, time_cost=6.4094624519348145
+
Steps: 1%| | 7009/1000000 [17:57:16<3491:44:31, 12.66s/it, lr=1e-5, step_loss=0.0551]
[Training-log excerpt, RANK-0: steps 7010–7232 of 1,000,000 (~1%), lr=1e-5, ~6–13.6 s/it, wall clock 17:57:24–18:30:12.]
[local_loss is typically in the 0.006–0.2 range per step; isolated spikes at steps 7015 (0.991), 7037 (0.987), 7102 (263.6), 7221 (76.2), and 7228 (1.00), with several steps in the 0.3–0.8 range (7077, 7097, 7117, 7140, 7168, 7201).]
[train_loss is typically in the 0.015–0.18 range; spikes at steps 7028 (2.94), 7090 (2.94), 7102 (33.0), 7155 (6.25), 7206 (9.11), 7218 (12.0), 7221 (9.58).]
+
Steps: 1%| | 7232/1000000 [18:30:12<2588:28:06, 9.39s/it, lr=1e-5, step_loss=0.153]
Steps: 1%| | 7233/1000000 [18:30:24<2773:00:52, 10.06s/it, lr=1e-5, step_loss=0.153][RANK-0]: Step: [7233], local_loss=0.03336593508720398, train_loss=0.15667724609375, time_cost=1.4130032062530518
+
Steps: 1%| | 7233/1000000 [18:30:24<2773:00:52, 10.06s/it, lr=1e-5, step_loss=0.0334]
Steps: 1%| | 7234/1000000 [18:30:31<2533:52:22, 9.19s/it, lr=1e-5, step_loss=0.0334][RANK-0]: Step: [7234], local_loss=0.2769108712673187, train_loss=0.057782359421253204, time_cost=2.615137815475464
+
Steps: 1%| | 7234/1000000 [18:30:31<2533:52:22, 9.19s/it, lr=1e-5, step_loss=0.277]
Steps: 1%| | 7235/1000000 [18:30:36<2214:46:44, 8.03s/it, lr=1e-5, step_loss=0.277][RANK-0]: Step: [7235], local_loss=0.01298818551003933, train_loss=0.1635030210018158, time_cost=4.289471387863159
+
Steps: 1%| | 7235/1000000 [18:30:36<2214:46:44, 8.03s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 7236/1000000 [18:30:40<1900:51:26, 6.89s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [7236], local_loss=0.015653878450393677, train_loss=0.029952339828014374, time_cost=1.3025503158569336
+
Steps: 1%| | 7236/1000000 [18:30:40<1900:51:26, 6.89s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 7237/1000000 [18:30:52<2307:53:15, 8.37s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [7237], local_loss=0.009783861227333546, train_loss=0.02892235480248928, time_cost=1.2288503646850586
+
Steps: 1%| | 7237/1000000 [18:30:52<2307:53:15, 8.37s/it, lr=1e-5, step_loss=0.00978]
Steps: 1%| | 7238/1000000 [18:31:05<2680:59:57, 9.72s/it, lr=1e-5, step_loss=0.00978][RANK-0]: Step: [7238], local_loss=0.008982781320810318, train_loss=0.021327920258045197, time_cost=1.2642273902893066
+
Steps: 1%| | 7238/1000000 [18:31:05<2680:59:57, 9.72s/it, lr=1e-5, step_loss=0.00898]
Steps: 1%| | 7239/1000000 [18:31:15<2675:23:05, 9.70s/it, lr=1e-5, step_loss=0.00898][RANK-0]: Step: [7239], local_loss=0.9948360323905945, train_loss=0.14186273515224457, time_cost=4.524645805358887
+
Steps: 1%| | 7239/1000000 [18:31:15<2675:23:05, 9.70s/it, lr=1e-5, step_loss=0.995]
Steps: 1%| | 7240/1000000 [18:31:23<2534:24:42, 9.19s/it, lr=1e-5, step_loss=0.995][RANK-0]: Step: [7240], local_loss=0.02369445562362671, train_loss=0.07631243020296097, time_cost=3.5986149311065674
+
Steps: 1%| | 7240/1000000 [18:31:23<2534:24:42, 9.19s/it, lr=1e-5, step_loss=0.0237]
Steps: 1%| | 7241/1000000 [18:31:37<2962:48:41, 10.74s/it, lr=1e-5, step_loss=0.0237][RANK-0]: Step: [7241], local_loss=0.0382157564163208, train_loss=0.07874450087547302, time_cost=5.884935140609741
+
Steps: 1%| | 7241/1000000 [18:31:37<2962:48:41, 10.74s/it, lr=1e-5, step_loss=0.0382]
Steps: 1%| | 7242/1000000 [18:31:44<2658:05:09, 9.64s/it, lr=1e-5, step_loss=0.0382][RANK-0]: Step: [7242], local_loss=1.2760425806045532, train_loss=0.22324249148368835, time_cost=2.4043264389038086
+
Steps: 1%| | 7242/1000000 [18:31:44<2658:05:09, 9.64s/it, lr=1e-5, step_loss=1.28]
Steps: 1%| | 7243/1000000 [18:31:55<2788:17:27, 10.11s/it, lr=1e-5, step_loss=1.28][RANK-0]: Step: [7243], local_loss=0.044243309646844864, train_loss=0.05116045102477074, time_cost=2.4313549995422363
+
Steps: 1%| | 7243/1000000 [18:31:55<2788:17:27, 10.11s/it, lr=1e-5, step_loss=0.0442]
Steps: 1%| | 7244/1000000 [18:32:01<2405:04:38, 8.72s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [7244], local_loss=0.018055472522974014, train_loss=0.018781783059239388, time_cost=1.2290587425231934
+
Steps: 1%| | 7244/1000000 [18:32:01<2405:04:38, 8.72s/it, lr=1e-5, step_loss=0.0181]
Steps: 1%| | 7245/1000000 [18:32:06<2087:25:23, 7.57s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [7245], local_loss=4.584658622741699, train_loss=0.6018613576889038, time_cost=1.8799445629119873
+
Steps: 1%| | 7245/1000000 [18:32:06<2087:25:23, 7.57s/it, lr=1e-5, step_loss=4.58]
Steps: 1%| | 7246/1000000 [18:32:10<1823:48:41, 6.61s/it, lr=1e-5, step_loss=4.58][RANK-0]: Step: [7246], local_loss=0.04303566738963127, train_loss=0.07862542569637299, time_cost=1.5177929401397705
+
Steps: 1%| | 7246/1000000 [18:32:10<1823:48:41, 6.61s/it, lr=1e-5, step_loss=0.043]
Steps: 1%| | 7247/1000000 [18:32:17<1875:23:18, 6.80s/it, lr=1e-5, step_loss=0.043][RANK-0]: Step: [7247], local_loss=0.021824846044182777, train_loss=0.07954622805118561, time_cost=2.418301582336426
+
Steps: 1%| | 7247/1000000 [18:32:17<1875:23:18, 6.80s/it, lr=1e-5, step_loss=0.0218]
Steps: 1%| | 7248/1000000 [18:32:29<2265:55:03, 8.22s/it, lr=1e-5, step_loss=0.0218][RANK-0]: Step: [7248], local_loss=0.19867335259914398, train_loss=0.0779467523097992, time_cost=1.9706077575683594
+
Steps: 1%| | 7248/1000000 [18:32:29<2265:55:03, 8.22s/it, lr=1e-5, step_loss=0.199]
Steps: 1%| | 7249/1000000 [18:32:36<2195:07:59, 7.96s/it, lr=1e-5, step_loss=0.199][RANK-0]: Step: [7249], local_loss=0.35016825795173645, train_loss=0.08231143653392792, time_cost=2.6517770290374756
+
Steps: 1%| | 7249/1000000 [18:32:36<2195:07:59, 7.96s/it, lr=1e-5, step_loss=0.35]
Steps: 1%| | 7250/1000000 [18:32:47<2454:23:12, 8.90s/it, lr=1e-5, step_loss=0.35][RANK-0]: Step: [7250], local_loss=0.007185730617493391, train_loss=0.07298121601343155, time_cost=3.2624402046203613
+
Steps: 1%| | 7250/1000000 [18:32:47<2454:23:12, 8.90s/it, lr=1e-5, step_loss=0.00719]
Steps: 1%| | 7251/1000000 [18:33:00<2734:44:12, 9.92s/it, lr=1e-5, step_loss=0.00719][RANK-0]: Step: [7251], local_loss=0.06305987387895584, train_loss=0.07121285796165466, time_cost=10.07379412651062
+
Steps: 1%| | 7251/1000000 [18:33:00<2734:44:12, 9.92s/it, lr=1e-5, step_loss=0.0631]
Steps: 1%| | 7252/1000000 [18:33:09<2718:27:15, 9.86s/it, lr=1e-5, step_loss=0.0631][RANK-0]: Step: [7252], local_loss=0.050438571721315384, train_loss=0.049384549260139465, time_cost=2.19694185256958
+
Steps: 1%| | 7252/1000000 [18:33:09<2718:27:15, 9.86s/it, lr=1e-5, step_loss=0.0504]
Steps: 1%| | 7253/1000000 [18:33:17<2503:17:32, 9.08s/it, lr=1e-5, step_loss=0.0504][RANK-0]: Step: [7253], local_loss=0.03522711619734764, train_loss=0.029527828097343445, time_cost=1.2261419296264648
+
Steps: 1%| | 7253/1000000 [18:33:17<2503:17:32, 9.08s/it, lr=1e-5, step_loss=0.0352]
Steps: 1%| | 7254/1000000 [18:33:23<2321:32:44, 8.42s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [7254], local_loss=0.017607688903808594, train_loss=0.03931325301527977, time_cost=1.457608699798584
+
Steps: 1%| | 7254/1000000 [18:33:23<2321:32:44, 8.42s/it, lr=1e-5, step_loss=0.0176]
Steps: 1%| | 7255/1000000 [18:33:30<2166:39:19, 7.86s/it, lr=1e-5, step_loss=0.0176][RANK-0]: Step: [7255], local_loss=0.014699558727443218, train_loss=0.10377919673919678, time_cost=2.8527109622955322
+
Steps: 1%| | 7255/1000000 [18:33:30<2166:39:19, 7.86s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%| | 7256/1000000 [18:33:35<1921:04:14, 6.97s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [7256], local_loss=0.014230296015739441, train_loss=0.08521413058042526, time_cost=1.9586684703826904
+
Steps: 1%| | 7256/1000000 [18:33:35<1921:04:14, 6.97s/it, lr=1e-5, step_loss=0.0142]
Steps: 1%| | 7257/1000000 [18:33:48<2449:42:46, 8.88s/it, lr=1e-5, step_loss=0.0142][RANK-0]: Step: [7257], local_loss=0.009753616526722908, train_loss=0.02746640518307686, time_cost=3.8701252937316895
+
Steps: 1%| | 7257/1000000 [18:33:48<2449:42:46, 8.88s/it, lr=1e-5, step_loss=0.00975]
Steps: 1%| | 7258/1000000 [18:34:00<2715:58:39, 9.85s/it, lr=1e-5, step_loss=0.00975][RANK-0]: Step: [7258], local_loss=0.9919294714927673, train_loss=0.14243276417255402, time_cost=4.174788475036621
+
Steps: 1%| | 7258/1000000 [18:34:00<2715:58:39, 9.85s/it, lr=1e-5, step_loss=0.992]
Steps: 1%| | 7259/1000000 [18:34:06<2367:17:09, 8.58s/it, lr=1e-5, step_loss=0.992][RANK-0]: Step: [7259], local_loss=0.054629135876894, train_loss=0.09137775003910065, time_cost=3.2049548625946045
+
Steps: 1%| | 7259/1000000 [18:34:06<2367:17:09, 8.58s/it, lr=1e-5, step_loss=0.0546]
Steps: 1%| | 7260/1000000 [18:34:12<2129:14:44, 7.72s/it, lr=1e-5, step_loss=0.0546][RANK-0]: Step: [7260], local_loss=0.010806749574840069, train_loss=0.034787751734256744, time_cost=4.049923419952393
+
Steps: 1%| | 7260/1000000 [18:34:12<2129:14:44, 7.72s/it, lr=1e-5, step_loss=0.0108]
Steps: 1%| | 7261/1000000 [18:34:17<1958:48:21, 7.10s/it, lr=1e-5, step_loss=0.0108][RANK-0]: Step: [7261], local_loss=0.06888189911842346, train_loss=0.03174915909767151, time_cost=2.9297754764556885
+
Steps: 1%| | 7261/1000000 [18:34:17<1958:48:21, 7.10s/it, lr=1e-5, step_loss=0.0689]
Steps: 1%| | 7262/1000000 [18:34:27<2138:25:00, 7.75s/it, lr=1e-5, step_loss=0.0689][RANK-0]: Step: [7262], local_loss=0.04264269024133682, train_loss=0.08038462698459625, time_cost=7.4483582973480225
+
Steps: 1%| | 7262/1000000 [18:34:27<2138:25:00, 7.75s/it, lr=1e-5, step_loss=0.0426]
Steps: 1%| | 7263/1000000 [18:34:34<2090:11:34, 7.58s/it, lr=1e-5, step_loss=0.0426][RANK-0]: Step: [7263], local_loss=0.035199254751205444, train_loss=0.029144693166017532, time_cost=1.2570559978485107
+
Steps: 1%| | 7263/1000000 [18:34:34<2090:11:34, 7.58s/it, lr=1e-5, step_loss=0.0352]
Steps: 1%| | 7264/1000000 [18:34:40<1941:48:13, 7.04s/it, lr=1e-5, step_loss=0.0352][RANK-0]: Step: [7264], local_loss=0.012301560491323471, train_loss=0.05483901500701904, time_cost=1.4532644748687744
+
Steps: 1%| | 7264/1000000 [18:34:40<1941:48:13, 7.04s/it, lr=1e-5, step_loss=0.0123]
Steps: 1%| | 7265/1000000 [18:34:49<2119:03:34, 7.68s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [7265], local_loss=0.014422199688851833, train_loss=0.04424925893545151, time_cost=3.9237215518951416
+
Steps: 1%| | 7265/1000000 [18:34:49<2119:03:34, 7.68s/it, lr=1e-5, step_loss=0.0144]
Steps: 1%| | 7266/1000000 [18:34:56<2118:09:09, 7.68s/it, lr=1e-5, step_loss=0.0144][RANK-0]: Step: [7266], local_loss=0.01803017035126686, train_loss=0.02287130057811737, time_cost=1.5666708946228027
+
Steps: 1%| | 7266/1000000 [18:34:56<2118:09:09, 7.68s/it, lr=1e-5, step_loss=0.018]
Steps: 1%| | 7267/1000000 [18:35:02<1914:30:52, 6.94s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [7267], local_loss=0.007976035587489605, train_loss=0.031158864498138428, time_cost=2.2204275131225586
+
Steps: 1%| | 7267/1000000 [18:35:02<1914:30:52, 6.94s/it, lr=1e-5, step_loss=0.00798]
Steps: 1%| | 7268/1000000 [18:35:09<1913:44:31, 6.94s/it, lr=1e-5, step_loss=0.00798][RANK-0]: Step: [7268], local_loss=0.010227846913039684, train_loss=0.016935359686613083, time_cost=2.4085381031036377
+
Steps: 1%| | 7268/1000000 [18:35:09<1913:44:31, 6.94s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 7269/1000000 [18:35:18<2103:20:57, 7.63s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [7269], local_loss=0.007875976152718067, train_loss=0.13362383842468262, time_cost=1.2289035320281982
+
Steps: 1%| | 7269/1000000 [18:35:18<2103:20:57, 7.63s/it, lr=1e-5, step_loss=0.00788]
Steps: 1%| | 7270/1000000 [18:35:29<2400:27:31, 8.70s/it, lr=1e-5, step_loss=0.00788][RANK-0]: Step: [7270], local_loss=0.014683014713227749, train_loss=0.03803953900933266, time_cost=1.2569687366485596
+
Steps: 1%| | 7270/1000000 [18:35:29<2400:27:31, 8.70s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%| | 7271/1000000 [18:35:34<2133:48:22, 7.74s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [7271], local_loss=0.30582162737846375, train_loss=0.05543362721800804, time_cost=2.7554078102111816
+
Steps: 1%| | 7271/1000000 [18:35:34<2133:48:22, 7.74s/it, lr=1e-5, step_loss=0.306]
Steps: 1%| | 7272/1000000 [18:35:44<2278:31:12, 8.26s/it, lr=1e-5, step_loss=0.306][RANK-0]: Step: [7272], local_loss=0.0154802817851305, train_loss=0.06416487693786621, time_cost=2.1688356399536133
+
Steps: 1%| | 7272/1000000 [18:35:44<2278:31:12, 8.26s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%| | 7273/1000000 [18:35:51<2151:44:10, 7.80s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [7273], local_loss=0.014108267612755299, train_loss=0.06878014653921127, time_cost=1.851604700088501
+
Steps: 1%| | 7273/1000000 [18:35:51<2151:44:10, 7.80s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%| | 7274/1000000 [18:35:59<2158:29:05, 7.83s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [7274], local_loss=0.1902223378419876, train_loss=0.07124058902263641, time_cost=3.9258577823638916
+
Steps: 1%| | 7274/1000000 [18:35:59<2158:29:05, 7.83s/it, lr=1e-5, step_loss=0.19]
Steps: 1%| | 7275/1000000 [18:36:03<1915:08:17, 6.95s/it, lr=1e-5, step_loss=0.19][RANK-0]: Step: [7275], local_loss=0.0486501008272171, train_loss=0.03153615817427635, time_cost=1.273392677307129
+
Steps: 1%| | 7275/1000000 [18:36:03<1915:08:17, 6.95s/it, lr=1e-5, step_loss=0.0487]
Steps: 1%| | 7276/1000000 [18:36:09<1766:45:06, 6.41s/it, lr=1e-5, step_loss=0.0487][RANK-0]: Step: [7276], local_loss=0.024285491555929184, train_loss=0.0538419634103775, time_cost=2.451172113418579
+
Steps: 1%| | 7276/1000000 [18:36:09<1766:45:06, 6.41s/it, lr=1e-5, step_loss=0.0243]
Steps: 1%| | 7277/1000000 [18:36:18<2037:35:49, 7.39s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [7277], local_loss=0.006883855909109116, train_loss=0.028813865035772324, time_cost=3.048685312271118
+
Steps: 1%| | 7277/1000000 [18:36:18<2037:35:49, 7.39s/it, lr=1e-5, step_loss=0.00688]
Steps: 1%| | 7278/1000000 [18:36:33<2677:19:40, 9.71s/it, lr=1e-5, step_loss=0.00688][RANK-0]: Step: [7278], local_loss=0.12512820959091187, train_loss=0.061337023973464966, time_cost=7.788571119308472
+
Steps: 1%| | 7278/1000000 [18:36:33<2677:19:40, 9.71s/it, lr=1e-5, step_loss=0.125]
Steps: 1%| | 7279/1000000 [18:36:46<2931:10:08, 10.63s/it, lr=1e-5, step_loss=0.125][RANK-0]: Step: [7279], local_loss=0.032842159271240234, train_loss=0.14596641063690186, time_cost=3.9720041751861572
+
Steps: 1%| | 7279/1000000 [18:36:46<2931:10:08, 10.63s/it, lr=1e-5, step_loss=0.0328]
Steps: 1%| | 7280/1000000 [18:36:54<2683:11:05, 9.73s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [7280], local_loss=0.015939144417643547, train_loss=0.018628492951393127, time_cost=3.7846720218658447
+
Steps: 1%| | 7280/1000000 [18:36:54<2683:11:05, 9.73s/it, lr=1e-5, step_loss=0.0159]
Steps: 1%| | 7281/1000000 [18:36:59<2273:05:34, 8.24s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [7281], local_loss=0.037340011447668076, train_loss=0.04906970635056496, time_cost=1.9557905197143555
+
Steps: 1%| | 7281/1000000 [18:36:59<2273:05:34, 8.24s/it, lr=1e-5, step_loss=0.0373]
Steps: 1%| | 7282/1000000 [18:37:05<2091:04:09, 7.58s/it, lr=1e-5, step_loss=0.0373][RANK-0]: Step: [7282], local_loss=0.009288360364735126, train_loss=0.02979445829987526, time_cost=1.7072327136993408
+
Steps: 1%| | 7282/1000000 [18:37:05<2091:04:09, 7.58s/it, lr=1e-5, step_loss=0.00929]
Steps: 1%| | 7283/1000000 [18:37:19<2688:11:31, 9.75s/it, lr=1e-5, step_loss=0.00929][RANK-0]: Step: [7283], local_loss=0.009582852944731712, train_loss=0.0774998813867569, time_cost=5.114053964614868
+
Steps: 1%| | 7283/1000000 [18:37:19<2688:11:31, 9.75s/it, lr=1e-5, step_loss=0.00958]
Steps: 1%| | 7284/1000000 [18:37:26<2438:34:00, 8.84s/it, lr=1e-5, step_loss=0.00958][RANK-0]: Step: [7284], local_loss=0.011379258707165718, train_loss=0.01648871600627899, time_cost=3.026132345199585
+
Steps: 1%| | 7284/1000000 [18:37:26<2438:34:00, 8.84s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%| | 7285/1000000 [18:37:41<2965:20:04, 10.75s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [7285], local_loss=0.034715648740530014, train_loss=0.03366171941161156, time_cost=5.762998819351196
+
Steps: 1%| | 7285/1000000 [18:37:41<2965:20:04, 10.75s/it, lr=1e-5, step_loss=0.0347]
Steps: 1%| | 7286/1000000 [18:37:51<2849:46:53, 10.33s/it, lr=1e-5, step_loss=0.0347][RANK-0]: Step: [7286], local_loss=0.019314659759402275, train_loss=0.069294773042202, time_cost=3.757361888885498
+
Steps: 1%| | 7286/1000000 [18:37:51<2849:46:53, 10.33s/it, lr=1e-5, step_loss=0.0193]
Steps: 1%| | 7287/1000000 [18:37:57<2482:21:33, 9.00s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [7287], local_loss=0.05217095464468002, train_loss=0.044401854276657104, time_cost=1.2347240447998047
+
Steps: 1%| | 7287/1000000 [18:37:57<2482:21:33, 9.00s/it, lr=1e-5, step_loss=0.0522]
Steps: 1%| | 7288/1000000 [18:38:01<2125:54:39, 7.71s/it, lr=1e-5, step_loss=0.0522][RANK-0]: Step: [7288], local_loss=0.06768560409545898, train_loss=0.05002770572900772, time_cost=1.9048528671264648
+
Steps: 1%| | 7288/1000000 [18:38:01<2125:54:39, 7.71s/it, lr=1e-5, step_loss=0.0677]
Steps: 1%| | 7289/1000000 [18:38:13<2431:12:35, 8.82s/it, lr=1e-5, step_loss=0.0677][RANK-0]: Step: [7289], local_loss=0.014520378783345222, train_loss=0.052478156983852386, time_cost=1.2378594875335693
+
Steps: 1%| | 7289/1000000 [18:38:13<2431:12:35, 8.82s/it, lr=1e-5, step_loss=0.0145]
Steps: 1%| | 7290/1000000 [18:38:26<2803:56:04, 10.17s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [7290], local_loss=0.18736068904399872, train_loss=0.06157982349395752, time_cost=4.161289215087891
+
Steps: 1%| | 7290/1000000 [18:38:26<2803:56:04, 10.17s/it, lr=1e-5, step_loss=0.187]
Steps: 1%| | 7291/1000000 [18:38:31<2376:01:29, 8.62s/it, lr=1e-5, step_loss=0.187][RANK-0]: Step: [7291], local_loss=0.013371314853429794, train_loss=0.01848333328962326, time_cost=1.2234911918640137
+
Steps: 1%| | 7291/1000000 [18:38:31<2376:01:29, 8.62s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%| | 7292/1000000 [18:38:48<3104:07:30, 11.26s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [7292], local_loss=0.0343434140086174, train_loss=0.036238208413124084, time_cost=9.14631175994873
+
Steps: 1%| | 7292/1000000 [18:38:48<3104:07:30, 11.26s/it, lr=1e-5, step_loss=0.0343]
Steps: 1%| | 7293/1000000 [18:38:56<2787:27:12, 10.11s/it, lr=1e-5, step_loss=0.0343][RANK-0]: Step: [7293], local_loss=0.017812807112932205, train_loss=0.0607517771422863, time_cost=3.153184413909912
+
Steps: 1%| | 7293/1000000 [18:38:56<2787:27:12, 10.11s/it, lr=1e-5, step_loss=0.0178]
Steps: 1%| | 7294/1000000 [18:39:01<2366:32:07, 8.58s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [7294], local_loss=0.0515684075653553, train_loss=0.03293870761990547, time_cost=1.9069585800170898
+
Steps: 1%| | 7294/1000000 [18:39:01<2366:32:07, 8.58s/it, lr=1e-5, step_loss=0.0516]
Steps: 1%| | 7295/1000000 [18:39:07<2144:33:37, 7.78s/it, lr=1e-5, step_loss=0.0516][RANK-0]: Step: [7295], local_loss=0.0363367535173893, train_loss=0.028896404430270195, time_cost=1.8074333667755127
+
Steps: 1%| | 7295/1000000 [18:39:07<2144:33:37, 7.78s/it, lr=1e-5, step_loss=0.0363]
Steps: 1%| | 7296/1000000 [18:39:11<1852:59:36, 6.72s/it, lr=1e-5, step_loss=0.0363][RANK-0]: Step: [7296], local_loss=0.014101563952863216, train_loss=0.032556626945734024, time_cost=1.7683916091918945
+
Steps: 1%| | 7296/1000000 [18:39:11<1852:59:36, 6.72s/it, lr=1e-5, step_loss=0.0141]
Steps: 1%| | 7297/1000000 [18:39:23<2297:04:48, 8.33s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [7297], local_loss=0.12274256348609924, train_loss=0.03701264038681984, time_cost=3.4239680767059326
+
Steps: 1%| | 7297/1000000 [18:39:23<2297:04:48, 8.33s/it, lr=1e-5, step_loss=0.123]
Steps: 1%| | 7298/1000000 [18:39:32<2309:36:30, 8.38s/it, lr=1e-5, step_loss=0.123][RANK-0]: Step: [7298], local_loss=0.01929139532148838, train_loss=0.02431129291653633, time_cost=1.4641938209533691
+
Steps: 1%| | 7298/1000000 [18:39:32<2309:36:30, 8.38s/it, lr=1e-5, step_loss=0.0193]
Steps: 1%| | 7299/1000000 [18:39:38<2119:30:16, 7.69s/it, lr=1e-5, step_loss=0.0193][RANK-0]: Step: [7299], local_loss=0.007286766078323126, train_loss=0.13664817810058594, time_cost=1.7184336185455322
+
Steps: 1%| | 7299/1000000 [18:39:38<2119:30:16, 7.69s/it, lr=1e-5, step_loss=0.00729]
Steps: 1%| | 7300/1000000 [18:39:42<1834:39:54, 6.65s/it, lr=1e-5, step_loss=0.00729][RANK-0]: Step: [7300], local_loss=0.05019613727927208, train_loss=0.039430901408195496, time_cost=1.237356424331665
+
Steps: 1%| | 7300/1000000 [18:39:42<1834:39:54, 6.65s/it, lr=1e-5, step_loss=0.0502]
Steps: 1%| | 7301/1000000 [18:39:54<2308:03:30, 8.37s/it, lr=1e-5, step_loss=0.0502][RANK-0]: Step: [7301], local_loss=0.025368375703692436, train_loss=0.018821755424141884, time_cost=1.2222890853881836
+
Steps: 1%| | 7301/1000000 [18:39:54<2308:03:30, 8.37s/it, lr=1e-5, step_loss=0.0254]
Steps: 1%| | 7302/1000000 [18:40:06<2563:45:42, 9.30s/it, lr=1e-5, step_loss=0.0254][RANK-0]: Step: [7302], local_loss=0.041923195123672485, train_loss=0.025271521881222725, time_cost=3.974189519882202
+
Steps: 1%| | 7302/1000000 [18:40:06<2563:45:42, 9.30s/it, lr=1e-5, step_loss=0.0419]
Steps: 1%| | 7303/1000000 [18:40:15<2543:03:40, 9.22s/it, lr=1e-5, step_loss=0.0419][RANK-0]: Step: [7303], local_loss=0.03247906267642975, train_loss=0.05253106355667114, time_cost=3.6194968223571777
+
Steps: 1%| | 7303/1000000 [18:40:15<2543:03:40, 9.22s/it, lr=1e-5, step_loss=0.0325]
Steps: 1%| | 7304/1000000 [18:40:26<2684:20:52, 9.73s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [7304], local_loss=0.008045237511396408, train_loss=0.02881539613008499, time_cost=2.3494272232055664
+
Steps: 1%| | 7304/1000000 [18:40:26<2684:20:52, 9.73s/it, lr=1e-5, step_loss=0.00805]
Steps: 1%| | 7305/1000000 [18:40:34<2561:00:27, 9.29s/it, lr=1e-5, step_loss=0.00805][RANK-0]: Step: [7305], local_loss=0.025615567341446877, train_loss=0.05371980369091034, time_cost=4.261118173599243
+
Steps: 1%| | 7305/1000000 [18:40:34<2561:00:27, 9.29s/it, lr=1e-5, step_loss=0.0256]
Steps: 1%| | 7306/1000000 [18:40:47<2889:32:58, 10.48s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [7306], local_loss=0.007433210499584675, train_loss=18.298906326293945, time_cost=4.850928068161011
+
Steps: 1%| | 7306/1000000 [18:40:47<2889:32:58, 10.48s/it, lr=1e-5, step_loss=0.00743]
Steps: 1%| | 7307/1000000 [18:40:55<2645:02:06, 9.59s/it, lr=1e-5, step_loss=0.00743][RANK-0]: Step: [7307], local_loss=0.024768106639385223, train_loss=0.0242215096950531, time_cost=1.2489955425262451
+
Steps: 1%| | 7307/1000000 [18:40:55<2645:02:06, 9.59s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%| | 7308/1000000 [18:41:00<2302:20:55, 8.35s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [7308], local_loss=0.010202615521848202, train_loss=0.03236936777830124, time_cost=1.2546401023864746
+
Steps: 1%| | 7308/1000000 [18:41:00<2302:20:55, 8.35s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 7309/1000000 [18:41:07<2194:55:40, 7.96s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [7309], local_loss=0.02492579072713852, train_loss=0.046885013580322266, time_cost=3.2789878845214844
+
Steps: 1%| | 7309/1000000 [18:41:07<2194:55:40, 7.96s/it, lr=1e-5, step_loss=0.0249]
Steps: 1%| | 7310/1000000 [18:41:19<2524:59:10, 9.16s/it, lr=1e-5, step_loss=0.0249][RANK-0]: Step: [7310], local_loss=0.026322614401578903, train_loss=0.023831728845834732, time_cost=2.8007400035858154
+
Steps: 1%| | 7310/1000000 [18:41:19<2524:59:10, 9.16s/it, lr=1e-5, step_loss=0.0263]
Steps: 1%| | 7311/1000000 [18:41:30<2660:46:50, 9.65s/it, lr=1e-5, step_loss=0.0263][RANK-0]: Step: [7311], local_loss=0.015355517156422138, train_loss=0.03889179229736328, time_cost=4.453574895858765
+
Steps: 1%| | 7311/1000000 [18:41:30<2660:46:50, 9.65s/it, lr=1e-5, step_loss=0.0154]
Steps: 1%| | 7312/1000000 [18:41:39<2632:45:06, 9.55s/it, lr=1e-5, step_loss=0.0154][RANK-0]: Step: [7312], local_loss=0.016987955197691917, train_loss=0.017009353265166283, time_cost=2.24980092048645
+
Steps: 1%| | 7312/1000000 [18:41:39<2632:45:06, 9.55s/it, lr=1e-5, step_loss=0.017]
Steps: 1%| | 7313/1000000 [18:41:50<2707:45:38, 9.82s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [7313], local_loss=0.02514249086380005, train_loss=0.02787785604596138, time_cost=2.6939480304718018
+
Steps: 1%| | 7313/1000000 [18:41:50<2707:45:38, 9.82s/it, lr=1e-5, step_loss=0.0251]
Steps: 1%| | 7314/1000000 [18:42:06<3213:34:09, 11.65s/it, lr=1e-5, step_loss=0.0251][RANK-0]: Step: [7314], local_loss=0.0102404048666358, train_loss=0.029570482671260834, time_cost=8.907772779464722
+
Steps: 1%| | 7314/1000000 [18:42:06<3213:34:09, 11.65s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 7315/1000000 [18:42:23<3704:19:13, 13.43s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [7315], local_loss=0.03236262500286102, train_loss=0.025496218353509903, time_cost=10.092353343963623
+
Steps: 1%| | 7315/1000000 [18:42:23<3704:19:13, 13.43s/it, lr=1e-5, step_loss=0.0324]
Steps: 1%| | 7316/1000000 [18:42:32<3276:30:35, 11.88s/it, lr=1e-5, step_loss=0.0324][RANK-0]: Step: [7316], local_loss=0.02501280978322029, train_loss=0.0422041118144989, time_cost=1.8239796161651611
+
Steps: 1%| | 7316/1000000 [18:42:32<3276:30:35, 11.88s/it, lr=1e-5, step_loss=0.025]
Steps: 1%| | 7317/1000000 [18:42:37<2781:29:49, 10.09s/it, lr=1e-5, step_loss=0.025][RANK-0]: Step: [7317], local_loss=0.007224561646580696, train_loss=0.03678325563669205, time_cost=1.5777251720428467
+
Steps: 1%| | 7317/1000000 [18:42:37<2781:29:49, 10.09s/it, lr=1e-5, step_loss=0.00722]
Steps: 1%| | 7318/1000000 [18:42:51<3090:23:12, 11.21s/it, lr=1e-5, step_loss=0.00722][RANK-0]: Step: [7318], local_loss=0.05048464983701706, train_loss=0.019076010212302208, time_cost=6.230224370956421
+
Steps: 1%| | 7318/1000000 [18:42:51<3090:23:12, 11.21s/it, lr=1e-5, step_loss=0.0505]
Steps: 1%| | 7319/1000000 [18:42:59<2777:18:28, 10.07s/it, lr=1e-5, step_loss=0.0505][RANK-0]: Step: [7319], local_loss=0.040070053189992905, train_loss=0.05798032879829407, time_cost=1.8216283321380615
+
Steps: 1%| | 7319/1000000 [18:42:59<2777:18:28, 10.07s/it, lr=1e-5, step_loss=0.0401]
Steps: 1%| | 7320/1000000 [18:43:10<2895:07:10, 10.50s/it, lr=1e-5, step_loss=0.0401][RANK-0]: Step: [7320], local_loss=0.006958654150366783, train_loss=0.04178736358880997, time_cost=3.568372964859009
+
Steps: 1%| | 7320/1000000 [18:43:10<2895:07:10, 10.50s/it, lr=1e-5, step_loss=0.00696]
Steps: 1%| | 7321/1000000 [18:43:22<3022:53:41, 10.96s/it, lr=1e-5, step_loss=0.00696][RANK-0]: Step: [7321], local_loss=0.012817702256143093, train_loss=0.1486780345439911, time_cost=4.42459511756897
+
Steps: 1%| | 7321/1000000 [18:43:22<3022:53:41, 10.96s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%| | 7322/1000000 [18:43:27<2548:01:32, 9.24s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [7322], local_loss=0.013387173414230347, train_loss=0.015636060386896133, time_cost=2.762519598007202
+
Steps: 1%| | 7322/1000000 [18:43:28<2548:01:32, 9.24s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%| | 7323/1000000 [18:43:38<2684:33:39, 9.74s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [7323], local_loss=0.03259244188666344, train_loss=0.0695943832397461, time_cost=6.1286780834198
+
Steps: 1%| | 7323/1000000 [18:43:38<2684:33:39, 9.74s/it, lr=1e-5, step_loss=0.0326]
Steps: 1%| | 7324/1000000 [18:43:49<2781:21:31, 10.09s/it, lr=1e-5, step_loss=0.0326][RANK-0]: Step: [7324], local_loss=0.011469355784356594, train_loss=0.08876071125268936, time_cost=1.3919241428375244
+
Steps: 1%| | 7324/1000000 [18:43:49<2781:21:31, 10.09s/it, lr=1e-5, step_loss=0.0115]
Steps: 1%| | 7325/1000000 [18:43:59<2746:25:44, 9.96s/it, lr=1e-5, step_loss=0.0115][RANK-0]: Step: [7325], local_loss=0.010124106891453266, train_loss=0.012881293892860413, time_cost=3.660048246383667
+
Steps: 1%| | 7325/1000000 [18:43:59<2746:25:44, 9.96s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 7326/1000000 [18:44:05<2420:47:58, 8.78s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [7326], local_loss=0.019972234964370728, train_loss=0.03466265648603439, time_cost=1.4896113872528076
+
Steps: 1%| | 7326/1000000 [18:44:05<2420:47:58, 8.78s/it, lr=1e-5, step_loss=0.02]
Steps: 1%| | 7327/1000000 [18:44:12<2257:09:59, 8.19s/it, lr=1e-5, step_loss=0.02][RANK-0]: Step: [7327], local_loss=0.05289241299033165, train_loss=0.02755674719810486, time_cost=2.2779898643493652
+
Steps: 1%| | 7327/1000000 [18:44:12<2257:09:59, 8.19s/it, lr=1e-5, step_loss=0.0529]
Steps: 1%| | 7328/1000000 [18:44:25<2671:44:12, 9.69s/it, lr=1e-5, step_loss=0.0529][RANK-0]: Step: [7328], local_loss=0.010102717205882072, train_loss=0.03581981733441353, time_cost=1.2423677444458008
+
Steps: 1%| | 7328/1000000 [18:44:25<2671:44:12, 9.69s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 7329/1000000 [18:44:38<2967:40:47, 10.76s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [7329], local_loss=0.01181061752140522, train_loss=0.046624816954135895, time_cost=5.243274450302124
+
Steps: 1%| | 7329/1000000 [18:44:38<2967:40:47, 10.76s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 7330/1000000 [18:44:43<2493:29:30, 9.04s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [7330], local_loss=0.026460761204361916, train_loss=0.02825399860739708, time_cost=1.966184377670288
+
Steps: 1%| | 7330/1000000 [18:44:43<2493:29:30, 9.04s/it, lr=1e-5, step_loss=0.0265]
Steps: 1%| | 7331/1000000 [18:44:48<2101:57:50, 7.62s/it, lr=1e-5, step_loss=0.0265][RANK-0]: Step: [7331], local_loss=0.007053389213979244, train_loss=0.06774254143238068, time_cost=2.25136137008667
+
Steps: 1%| | 7331/1000000 [18:44:48<2101:57:50, 7.62s/it, lr=1e-5, step_loss=0.00705]
Steps: 1%| | 7332/1000000 [18:44:59<2397:43:32, 8.70s/it, lr=1e-5, step_loss=0.00705][RANK-0]: Step: [7332], local_loss=0.5347004532814026, train_loss=0.08889249712228775, time_cost=3.5329508781433105
+
Steps: 1%| | 7332/1000000 [18:44:59<2397:43:32, 8.70s/it, lr=1e-5, step_loss=0.535]
Steps: 1%| | 7333/1000000 [18:45:03<2015:15:11, 7.31s/it, lr=1e-5, step_loss=0.535][RANK-0]: Step: [7333], local_loss=0.13351407647132874, train_loss=0.0434064120054245, time_cost=1.2659096717834473
+
Steps: 1%| | 7333/1000000 [18:45:03<2015:15:11, 7.31s/it, lr=1e-5, step_loss=0.134]
Steps: 1%| | 7334/1000000 [18:45:12<2140:27:33, 7.76s/it, lr=1e-5, step_loss=0.134][RANK-0]: Step: [7334], local_loss=0.00880468264222145, train_loss=0.033218976110219955, time_cost=6.241939306259155
+
Steps: 1%| | 7334/1000000 [18:45:12<2140:27:33, 7.76s/it, lr=1e-5, step_loss=0.0088]
Steps: 1%| | 7335/1000000 [18:45:23<2441:17:36, 8.85s/it, lr=1e-5, step_loss=0.0088][RANK-0]: Step: [7335], local_loss=0.35171616077423096, train_loss=0.06462328135967255, time_cost=2.0482256412506104
+
Steps: 1%| | 7335/1000000 [18:45:23<2441:17:36, 8.85s/it, lr=1e-5, step_loss=0.352]
Steps: 1%| | 7336/1000000 [18:45:30<2287:25:10, 8.30s/it, lr=1e-5, step_loss=0.352][RANK-0]: Step: [7336], local_loss=0.02561936154961586, train_loss=0.02304653823375702, time_cost=2.380880832672119
+
Steps: 1%| | 7336/1000000 [18:45:30<2287:25:10, 8.30s/it, lr=1e-5, step_loss=0.0256]
Steps: 1%| | 7337/1000000 [18:45:37<2179:12:26, 7.90s/it, lr=1e-5, step_loss=0.0256][RANK-0]: Step: [7337], local_loss=0.09473712742328644, train_loss=0.17494961619377136, time_cost=1.2371456623077393
+
Steps: 1%| | 7337/1000000 [18:45:37<2179:12:26, 7.90s/it, lr=1e-5, step_loss=0.0947]
Steps: 1%| | 7338/1000000 [18:45:50<2571:24:49, 9.33s/it, lr=1e-5, step_loss=0.0947][RANK-0]: Step: [7338], local_loss=0.0163196362555027, train_loss=0.06066916510462761, time_cost=5.472158670425415
+
Steps: 1%| | 7338/1000000 [18:45:50<2571:24:49, 9.33s/it, lr=1e-5, step_loss=0.0163]
Steps: 1%| | 7339/1000000 [18:46:03<2906:27:49, 10.54s/it, lr=1e-5, step_loss=0.0163][RANK-0]: Step: [7339], local_loss=0.0441453754901886, train_loss=0.06415534019470215, time_cost=5.079355001449585
+
Steps: 1%| | 7339/1000000 [18:46:03<2906:27:49, 10.54s/it, lr=1e-5, step_loss=0.0441]
Steps: 1%| | 7340/1000000 [18:46:14<2955:56:47, 10.72s/it, lr=1e-5, step_loss=0.0441][RANK-0]: Step: [7340], local_loss=0.017949702218174934, train_loss=0.025564594194293022, time_cost=2.265233039855957
+
Steps: 1%| | 7340/1000000 [18:46:14<2955:56:47, 10.72s/it, lr=1e-5, step_loss=0.0179]
Steps: 1%| | 7341/1000000 [18:46:20<2519:02:51, 9.14s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [7341], local_loss=0.013420241884887218, train_loss=0.02332491986453533, time_cost=2.5283327102661133
+
Steps: 1%| | 7341/1000000 [18:46:20<2519:02:51, 9.14s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%| | 7342/1000000 [18:46:35<2999:06:35, 10.88s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [7342], local_loss=0.4441829025745392, train_loss=0.12834028899669647, time_cost=1.252023696899414
+
Steps: 1%| | 7342/1000000 [18:46:35<2999:06:35, 10.88s/it, lr=1e-5, step_loss=0.444]
Steps: 1%| | 7343/1000000 [18:46:47<3128:07:43, 11.34s/it, lr=1e-5, step_loss=0.444][RANK-0]: Step: [7343], local_loss=0.08369037508964539, train_loss=0.0825268030166626, time_cost=3.23732590675354
+
Steps: 1%| | 7343/1000000 [18:46:47<3128:07:43, 11.34s/it, lr=1e-5, step_loss=0.0837]
Steps: 1%| | 7344/1000000 [18:46:53<2674:58:03, 9.70s/it, lr=1e-5, step_loss=0.0837][RANK-0]: Step: [7344], local_loss=0.008352390490472317, train_loss=0.020041318610310555, time_cost=1.421898603439331
+
Steps: 1%| | 7344/1000000 [18:46:53<2674:58:03, 9.70s/it, lr=1e-5, step_loss=0.00835]
Steps: 1%| | 7345/1000000 [18:47:02<2655:20:15, 9.63s/it, lr=1e-5, step_loss=0.00835][RANK-0]: Step: [7345], local_loss=0.055184200406074524, train_loss=11.466867446899414, time_cost=1.6559059619903564
+
Steps: 1%| | 7345/1000000 [18:47:02<2655:20:15, 9.63s/it, lr=1e-5, step_loss=0.0552]
Steps: 1%| | 7346/1000000 [18:47:14<2809:16:19, 10.19s/it, lr=1e-5, step_loss=0.0552][RANK-0]: Step: [7346], local_loss=0.08482741564512253, train_loss=0.04901309311389923, time_cost=3.3863844871520996
+
Steps: 1%| | 7346/1000000 [18:47:14<2809:16:19, 10.19s/it, lr=1e-5, step_loss=0.0848]
Steps: 1%| | 7347/1000000 [18:47:25<2853:37:58, 10.35s/it, lr=1e-5, step_loss=0.0848][RANK-0]: Step: [7347], local_loss=0.02720602974295616, train_loss=0.03832727298140526, time_cost=1.2838246822357178
+
Steps: 1%| | 7347/1000000 [18:47:25<2853:37:58, 10.35s/it, lr=1e-5, step_loss=0.0272]
Steps: 1%| | 7348/1000000 [18:47:38<3098:44:42, 11.24s/it, lr=1e-5, step_loss=0.0272][RANK-0]: Step: [7348], local_loss=0.04898843169212341, train_loss=0.06357935070991516, time_cost=3.992835521697998
+
Steps: 1%| | 7348/1000000 [18:47:38<3098:44:42, 11.24s/it, lr=1e-5, step_loss=0.049]
Steps: 1%| | 7349/1000000 [18:47:42<2533:10:38, 9.19s/it, lr=1e-5, step_loss=0.049][RANK-0]: Step: [7349], local_loss=0.0276873167604208, train_loss=0.04625079035758972, time_cost=1.6536335945129395
+
Steps: 1%| | 7349/1000000 [18:47:42<2533:10:38, 9.19s/it, lr=1e-5, step_loss=0.0277]
Steps: 1%| | 7350/1000000 [18:47:48<2252:23:18, 8.17s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [7350], local_loss=0.010380096733570099, train_loss=0.04813014715909958, time_cost=1.7754952907562256
+
Steps: 1%| | 7350/1000000 [18:47:48<2252:23:18, 8.17s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%| | 7351/1000000 [18:47:55<2172:15:26, 7.88s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [7351], local_loss=0.011833732016384602, train_loss=0.13787055015563965, time_cost=2.745234727859497
+
Steps: 1%| | 7351/1000000 [18:47:55<2172:15:26, 7.88s/it, lr=1e-5, step_loss=0.0118]
Steps: 1%| | 7352/1000000 [18:48:02<2091:36:46, 7.59s/it, lr=1e-5, step_loss=0.0118][RANK-0]: Step: [7352], local_loss=0.49623680114746094, train_loss=0.10728204995393753, time_cost=2.2849860191345215
+
Steps: 1%| | 7352/1000000 [18:48:02<2091:36:46, 7.59s/it, lr=1e-5, step_loss=0.496]
Steps: 1%| | 7353/1000000 [18:48:16<2630:30:10, 9.54s/it, lr=1e-5, step_loss=0.496][RANK-0]: Step: [7353], local_loss=0.06426674872636795, train_loss=0.03917238488793373, time_cost=5.771573543548584
+
Steps: 1%| | 7353/1000000 [18:48:16<2630:30:10, 9.54s/it, lr=1e-5, step_loss=0.0643]
Steps: 1%| | 7354/1000000 [18:48:22<2317:41:46, 8.41s/it, lr=1e-5, step_loss=0.0643][RANK-0]: Step: [7354], local_loss=0.35387566685676575, train_loss=0.06436532735824585, time_cost=1.716315507888794
+
Steps: 1%| | 7354/1000000 [18:48:22<2317:41:46, 8.41s/it, lr=1e-5, step_loss=0.354]
Steps: 1%| | 7355/1000000 [18:48:32<2462:11:55, 8.93s/it, lr=1e-5, step_loss=0.354][RANK-0]: Step: [7355], local_loss=0.06901334226131439, train_loss=0.05041932314634323, time_cost=6.403925895690918
+
Steps: 1%| | 7355/1000000 [18:48:32<2462:11:55, 8.93s/it, lr=1e-5, step_loss=0.069]
Steps: 1%| | 7356/1000000 [18:48:36<2074:23:06, 7.52s/it, lr=1e-5, step_loss=0.069][RANK-0]: Step: [7356], local_loss=0.01616351492702961, train_loss=0.05407349765300751, time_cost=1.4587702751159668
+
Steps: 1%| | 7356/1000000 [18:48:36<2074:23:06, 7.52s/it, lr=1e-5, step_loss=0.0162]
Steps: 1%| | 7357/1000000 [18:48:47<2351:39:17, 8.53s/it, lr=1e-5, step_loss=0.0162][RANK-0]: Step: [7357], local_loss=0.023807430639863014, train_loss=0.024358181282877922, time_cost=1.5501172542572021
+
Steps: 1%| | 7357/1000000 [18:48:47<2351:39:17, 8.53s/it, lr=1e-5, step_loss=0.0238]
Steps: 1%| | 7358/1000000 [18:48:57<2433:29:07, 8.83s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [7358], local_loss=0.012451657094061375, train_loss=0.05610471963882446, time_cost=2.5237693786621094
+
Steps: 1%| | 7358/1000000 [18:48:57<2433:29:07, 8.83s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 7359/1000000 [18:49:03<2193:42:58, 7.96s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [7359], local_loss=0.05725988373160362, train_loss=0.03009718656539917, time_cost=1.3991200923919678
+
Steps: 1%| | 7359/1000000 [18:49:03<2193:42:58, 7.96s/it, lr=1e-5, step_loss=0.0573]
Steps: 1%| | 7360/1000000 [18:49:10<2149:39:11, 7.80s/it, lr=1e-5, step_loss=0.0573][RANK-0]: Step: [7360], local_loss=0.0472806952893734, train_loss=0.051098085939884186, time_cost=2.212195634841919
+
Steps: 1%| | 7360/1000000 [18:49:10<2149:39:11, 7.80s/it, lr=1e-5, step_loss=0.0473]
Steps: 1%| | 7361/1000000 [18:49:16<1953:30:36, 7.08s/it, lr=1e-5, step_loss=0.0473][RANK-0]: Step: [7361], local_loss=0.08131026476621628, train_loss=0.05280415713787079, time_cost=2.075075149536133
+
Steps: 1%| | 7361/1000000 [18:49:16<1953:30:36, 7.08s/it, lr=1e-5, step_loss=0.0813]
Steps: 1%| | 7362/1000000 [18:49:25<2145:17:41, 7.78s/it, lr=1e-5, step_loss=0.0813][RANK-0]: Step: [7362], local_loss=0.028125979006290436, train_loss=0.0457087866961956, time_cost=2.8410894870758057
+
Steps: 1%| | 7362/1000000 [18:49:25<2145:17:41, 7.78s/it, lr=1e-5, step_loss=0.0281]
Steps: 1%| | 7363/1000000 [18:49:38<2602:51:02, 9.44s/it, lr=1e-5, step_loss=0.0281][RANK-0]: Step: [7363], local_loss=0.20986120402812958, train_loss=0.04690689593553543, time_cost=5.4179723262786865
+
Steps: 1%| | 7363/1000000 [18:49:38<2602:51:02, 9.44s/it, lr=1e-5, step_loss=0.21]
Steps: 1%| | 7364/1000000 [18:49:48<2597:23:07, 9.42s/it, lr=1e-5, step_loss=0.21][RANK-0]: Step: [7364], local_loss=0.024022845551371574, train_loss=0.15972107648849487, time_cost=3.671708822250366
+
Steps: 1%| | 7364/1000000 [18:49:48<2597:23:07, 9.42s/it, lr=1e-5, step_loss=0.024]
Steps: 1%| | 7365/1000000 [18:49:53<2259:56:13, 8.20s/it, lr=1e-5, step_loss=0.024][RANK-0]: Step: [7365], local_loss=0.009243961423635483, train_loss=0.1501474827528, time_cost=2.429486036300659
+
Steps: 1%| | 7365/1000000 [18:49:53<2259:56:13, 8.20s/it, lr=1e-5, step_loss=0.00924]
Steps: 1%| | 7366/1000000 [18:50:06<2683:38:46, 9.73s/it, lr=1e-5, step_loss=0.00924][RANK-0]: Step: [7366], local_loss=0.035814471542835236, train_loss=0.027979716658592224, time_cost=5.467915773391724
+
Steps: 1%| | 7366/1000000 [18:50:06<2683:38:46, 9.73s/it, lr=1e-5, step_loss=0.0358]
Steps: 1%| | 7367/1000000 [18:50:11<2228:12:45, 8.08s/it, lr=1e-5, step_loss=0.0358][RANK-0]: Step: [7367], local_loss=0.006471915636211634, train_loss=0.020780453458428383, time_cost=1.5130434036254883
+
Steps: 1%| | 7367/1000000 [18:50:11<2228:12:45, 8.08s/it, lr=1e-5, step_loss=0.00647]
Steps: 1%| | 7368/1000000 [18:50:23<2591:58:48, 9.40s/it, lr=1e-5, step_loss=0.00647][RANK-0]: Step: [7368], local_loss=0.0071847811341285706, train_loss=0.042443595826625824, time_cost=5.41960072517395
+
Steps: 1%| | 7368/1000000 [18:50:23<2591:58:48, 9.40s/it, lr=1e-5, step_loss=0.00718]
Steps: 1%| | 7369/1000000 [18:50:35<2808:05:40, 10.18s/it, lr=1e-5, step_loss=0.00718][RANK-0]: Step: [7369], local_loss=0.02970142289996147, train_loss=0.04075693339109421, time_cost=5.368872880935669
+
Steps: 1%| | 7369/1000000 [18:50:35<2808:05:40, 10.18s/it, lr=1e-5, step_loss=0.0297]
Steps: 1%| | 7370/1000000 [18:50:40<2403:50:07, 8.72s/it, lr=1e-5, step_loss=0.0297][RANK-0]: Step: [7370], local_loss=0.01570107601583004, train_loss=0.07746051996946335, time_cost=2.702331066131592
+
Steps: 1%| | 7370/1000000 [18:50:40<2403:50:07, 8.72s/it, lr=1e-5, step_loss=0.0157]
Steps: 1%| | 7371/1000000 [18:50:46<2146:22:12, 7.78s/it, lr=1e-5, step_loss=0.0157][RANK-0]: Step: [7371], local_loss=0.035447634756565094, train_loss=0.0479893833398819, time_cost=2.811577081680298
+
Steps: 1%| | 7371/1000000 [18:50:46<2146:22:12, 7.78s/it, lr=1e-5, step_loss=0.0354]
Steps: 1%| | 7372/1000000 [18:50:51<1916:32:20, 6.95s/it, lr=1e-5, step_loss=0.0354][RANK-0]: Step: [7372], local_loss=0.04125962778925896, train_loss=0.03373105078935623, time_cost=1.2333343029022217
+
Steps: 1%| | 7372/1000000 [18:50:51<1916:32:20, 6.95s/it, lr=1e-5, step_loss=0.0413]
Steps: 1%| | 7373/1000000 [18:51:03<2300:59:36, 8.35s/it, lr=1e-5, step_loss=0.0413][RANK-0]: Step: [7373], local_loss=0.07027986645698547, train_loss=0.046041786670684814, time_cost=3.9497461318969727
+
Steps: 1%| | 7373/1000000 [18:51:03<2300:59:36, 8.35s/it, lr=1e-5, step_loss=0.0703]
Steps: 1%| | 7374/1000000 [18:51:11<2279:07:24, 8.27s/it, lr=1e-5, step_loss=0.0703][RANK-0]: Step: [7374], local_loss=0.012932095676660538, train_loss=0.03373870253562927, time_cost=6.59859037399292
+
Steps: 1%| | 7374/1000000 [18:51:11<2279:07:24, 8.27s/it, lr=1e-5, step_loss=0.0129]
Steps: 1%| | 7375/1000000 [18:51:17<2094:06:10, 7.59s/it, lr=1e-5, step_loss=0.0129][RANK-0]: Step: [7375], local_loss=0.02105221338570118, train_loss=0.03251571208238602, time_cost=1.726708173751831
+
Steps: 1%| | 7375/1000000 [18:51:17<2094:06:10, 7.59s/it, lr=1e-5, step_loss=0.0211]
Steps: 1%| | 7376/1000000 [18:51:23<2008:24:59, 7.28s/it, lr=1e-5, step_loss=0.0211][RANK-0]: Step: [7376], local_loss=0.032966699451208115, train_loss=0.0835113525390625, time_cost=2.380851984024048
+
Steps: 1%| | 7376/1000000 [18:51:23<2008:24:59, 7.28s/it, lr=1e-5, step_loss=0.033]
Steps: 1%| | 7377/1000000 [18:51:32<2117:48:54, 7.68s/it, lr=1e-5, step_loss=0.033][RANK-0]: Step: [7377], local_loss=0.006566404830664396, train_loss=0.020899789407849312, time_cost=1.2163910865783691
+
Steps: 1%| | 7377/1000000 [18:51:32<2117:48:54, 7.68s/it, lr=1e-5, step_loss=0.00657]
Steps: 1%| | 7378/1000000 [18:51:42<2360:02:40, 8.56s/it, lr=1e-5, step_loss=0.00657][RANK-0]: Step: [7378], local_loss=0.010564046911895275, train_loss=0.037956614047288895, time_cost=5.620026350021362
+
Steps: 1%| | 7378/1000000 [18:51:42<2360:02:40, 8.56s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%| | 7379/1000000 [18:51:57<2837:30:09, 10.29s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [7379], local_loss=0.02058861218392849, train_loss=0.024977467954158783, time_cost=6.736259698867798
+
Steps: 1%| | 7379/1000000 [18:51:57<2837:30:09, 10.29s/it, lr=1e-5, step_loss=0.0206]
Steps: 1%| | 7380/1000000 [18:52:04<2608:07:19, 9.46s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [7380], local_loss=0.011359038762748241, train_loss=0.017299635335803032, time_cost=3.3343043327331543
+
Steps: 1%| | 7380/1000000 [18:52:04<2608:07:19, 9.46s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%| | 7381/1000000 [18:52:14<2591:50:01, 9.40s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [7381], local_loss=0.006541684735566378, train_loss=0.03390883654356003, time_cost=5.230138778686523
+
Steps: 1%| | 7381/1000000 [18:52:14<2591:50:01, 9.40s/it, lr=1e-5, step_loss=0.00654]
Steps: 1%| | 7382/1000000 [18:52:26<2851:15:31, 10.34s/it, lr=1e-5, step_loss=0.00654][RANK-0]: Step: [7382], local_loss=0.03275647759437561, train_loss=0.027387244626879692, time_cost=5.789336919784546
+
Steps: 1%| | 7382/1000000 [18:52:26<2851:15:31, 10.34s/it, lr=1e-5, step_loss=0.0328]
Steps: 1%| | 7383/1000000 [18:52:35<2742:19:48, 9.95s/it, lr=1e-5, step_loss=0.0328][RANK-0]: Step: [7383], local_loss=0.062111347913742065, train_loss=0.05809298902750015, time_cost=2.5100669860839844
+
Steps: 1%| | 7383/1000000 [18:52:35<2742:19:48, 9.95s/it, lr=1e-5, step_loss=0.0621]
Steps: 1%| | 7384/1000000 [18:52:40<2349:35:52, 8.52s/it, lr=1e-5, step_loss=0.0621][RANK-0]: Step: [7384], local_loss=0.3909936547279358, train_loss=0.081964410841465, time_cost=3.8441383838653564
+
Steps: 1%| | 7384/1000000 [18:52:40<2349:35:52, 8.52s/it, lr=1e-5, step_loss=0.391]
Steps: 1%| | 7385/1000000 [18:52:52<2631:42:04, 9.54s/it, lr=1e-5, step_loss=0.391][RANK-0]: Step: [7385], local_loss=0.053454168140888214, train_loss=0.02728845924139023, time_cost=4.257980823516846
+
Steps: 1%| | 7385/1000000 [18:52:52<2631:42:04, 9.54s/it, lr=1e-5, step_loss=0.0535]
Steps: 1%| | 7386/1000000 [18:53:06<2952:20:44, 10.71s/it, lr=1e-5, step_loss=0.0535][RANK-0]: Step: [7386], local_loss=0.005084105301648378, train_loss=0.14210669696331024, time_cost=4.472755432128906
+
Steps: 1%| | 7386/1000000 [18:53:06<2952:20:44, 10.71s/it, lr=1e-5, step_loss=0.00508]
Steps: 1%| | 7387/1000000 [18:53:12<2620:07:36, 9.50s/it, lr=1e-5, step_loss=0.00508][RANK-0]: Step: [7387], local_loss=0.023212112486362457, train_loss=0.02078118547797203, time_cost=1.887927770614624
+
Steps: 1%| | 7387/1000000 [18:53:12<2620:07:36, 9.50s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%| | 7388/1000000 [18:53:29<3175:11:49, 11.52s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [7388], local_loss=0.00995638594031334, train_loss=0.022835513576865196, time_cost=1.7771995067596436
+
Steps: 1%| | 7388/1000000 [18:53:29<3175:11:49, 11.52s/it, lr=1e-5, step_loss=0.00996]
Steps: 1%| | 7389/1000000 [18:53:33<2623:16:00, 9.51s/it, lr=1e-5, step_loss=0.00996][RANK-0]: Step: [7389], local_loss=0.018822764977812767, train_loss=0.021046461537480354, time_cost=1.9095513820648193
+
Steps: 1%| | 7389/1000000 [18:53:33<2623:16:00, 9.51s/it, lr=1e-5, step_loss=0.0188]
Steps: 1%| | 7390/1000000 [18:53:42<2515:48:40, 9.12s/it, lr=1e-5, step_loss=0.0188][RANK-0]: Step: [7390], local_loss=0.048479482531547546, train_loss=0.03369012475013733, time_cost=2.407766580581665
+
Steps: 1%| | 7390/1000000 [18:53:42<2515:48:40, 9.12s/it, lr=1e-5, step_loss=0.0485]
Steps: 1%| | 7391/1000000 [18:53:53<2710:35:45, 9.83s/it, lr=1e-5, step_loss=0.0485][RANK-0]: Step: [7391], local_loss=0.07975110411643982, train_loss=9.747352600097656, time_cost=1.7480831146240234
+
Steps: 1%| | 7391/1000000 [18:53:53<2710:35:45, 9.83s/it, lr=1e-5, step_loss=0.0798]
Steps: 1%| | 7392/1000000 [18:54:06<2945:17:23, 10.68s/it, lr=1e-5, step_loss=0.0798][RANK-0]: Step: [7392], local_loss=0.01910901628434658, train_loss=0.08494018018245697, time_cost=1.2873661518096924
+
Steps: 1%| | 7392/1000000 [18:54:06<2945:17:23, 10.68s/it, lr=1e-5, step_loss=0.0191]
Steps: 1%| | 7393/1000000 [18:54:16<2900:40:44, 10.52s/it, lr=1e-5, step_loss=0.0191][RANK-0]: Step: [7393], local_loss=0.1151934266090393, train_loss=0.03995072841644287, time_cost=4.1322362422943115
+
Steps: 1%| | 7393/1000000 [18:54:16<2900:40:44, 10.52s/it, lr=1e-5, step_loss=0.115]
Steps: 1%| | 7394/1000000 [18:54:28<2991:37:53, 10.85s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [7394], local_loss=0.014935186132788658, train_loss=0.08057688921689987, time_cost=4.60321044921875
+
Steps: 1%| | 7394/1000000 [18:54:28<2991:37:53, 10.85s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%| | 7395/1000000 [18:54:34<2660:27:33, 9.65s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [7395], local_loss=0.06120749190449715, train_loss=0.18058741092681885, time_cost=2.3690576553344727
+
Steps: 1%| | 7395/1000000 [18:54:34<2660:27:33, 9.65s/it, lr=1e-5, step_loss=0.0612]
Steps: 1%| | 7396/1000000 [18:54:39<2239:06:21, 8.12s/it, lr=1e-5, step_loss=0.0612][RANK-0]: Step: [7396], local_loss=0.013344679959118366, train_loss=0.1394769847393036, time_cost=1.743647813796997
+
Steps: 1%| | 7396/1000000 [18:54:39<2239:06:21, 8.12s/it, lr=1e-5, step_loss=0.0133]
Steps: 1%| | 7397/1000000 [18:54:58<3121:42:33, 11.32s/it, lr=1e-5, step_loss=0.0133][RANK-0]: Step: [7397], local_loss=0.036824073642492294, train_loss=0.06413563340902328, time_cost=16.0467312335968
+
Steps: 1%| | 7397/1000000 [18:54:58<3121:42:33, 11.32s/it, lr=1e-5, step_loss=0.0368]
Steps: 1%| | 7398/1000000 [18:55:03<2643:30:47, 9.59s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [7398], local_loss=0.02334749884903431, train_loss=0.03369016945362091, time_cost=1.759361743927002
+
Steps: 1%| | 7398/1000000 [18:55:03<2643:30:47, 9.59s/it, lr=1e-5, step_loss=0.0233]
Steps: 1%| | 7399/1000000 [18:55:11<2473:22:04, 8.97s/it, lr=1e-5, step_loss=0.0233][RANK-0]: Step: [7399], local_loss=0.050473280251026154, train_loss=0.07305119931697845, time_cost=5.74849009513855
+
Steps: 1%| | 7399/1000000 [18:55:11<2473:22:04, 8.97s/it, lr=1e-5, step_loss=0.0505]
Steps: 1%| | 7400/1000000 [18:55:16<2144:28:33, 7.78s/it, lr=1e-5, step_loss=0.0505][RANK-0]: Step: [7400], local_loss=0.9971812963485718, train_loss=0.15011528134346008, time_cost=2.783756971359253
+
Steps: 1%| | 7400/1000000 [18:55:16<2144:28:33, 7.78s/it, lr=1e-5, step_loss=0.997]
Steps: 1%| | 7401/1000000 [18:55:25<2240:40:04, 8.13s/it, lr=1e-5, step_loss=0.997][RANK-0]: Step: [7401], local_loss=0.021261736750602722, train_loss=0.17845937609672546, time_cost=2.895354747772217
+
Steps: 1%| | 7401/1000000 [18:55:25<2240:40:04, 8.13s/it, lr=1e-5, step_loss=0.0213]
Steps: 1%| | 7402/1000000 [18:55:36<2502:05:12, 9.07s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [7402], local_loss=0.010356561280786991, train_loss=0.1504119336605072, time_cost=1.2392737865447998
+
Steps: 1%| | 7402/1000000 [18:55:36<2502:05:12, 9.07s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%| | 7403/1000000 [18:55:43<2357:46:40, 8.55s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [7403], local_loss=0.040298230946063995, train_loss=0.02902865782380104, time_cost=1.3193845748901367
+
Steps: 1%| | 7403/1000000 [18:55:43<2357:46:40, 8.55s/it, lr=1e-5, step_loss=0.0403]
Steps: 1%| | 7404/1000000 [18:55:56<2725:28:00, 9.88s/it, lr=1e-5, step_loss=0.0403][RANK-0]: Step: [7404], local_loss=0.00806078128516674, train_loss=0.04617922008037567, time_cost=2.420759439468384
+
Steps: 1%| | 7404/1000000 [18:55:56<2725:28:00, 9.88s/it, lr=1e-5, step_loss=0.00806]
Steps: 1%| | 7405/1000000 [18:56:11<3086:23:20, 11.19s/it, lr=1e-5, step_loss=0.00806][RANK-0]: Step: [7405], local_loss=0.023187099024653435, train_loss=0.15601009130477905, time_cost=4.081856966018677
+
Steps: 1%| | 7405/1000000 [18:56:11<3086:23:20, 11.19s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%| | 7406/1000000 [18:56:18<2757:46:45, 10.00s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [7406], local_loss=0.009676257148385048, train_loss=0.01845845766365528, time_cost=2.6869285106658936
+
Steps: 1%| | 7406/1000000 [18:56:18<2757:46:45, 10.00s/it, lr=1e-5, step_loss=0.00968]
Steps: 1%| | 7407/1000000 [18:56:24<2442:15:15, 8.86s/it, lr=1e-5, step_loss=0.00968][RANK-0]: Step: [7407], local_loss=0.1764475703239441, train_loss=0.06908257305622101, time_cost=2.254701852798462
+
Steps: 1%| | 7407/1000000 [18:56:24<2442:15:15, 8.86s/it, lr=1e-5, step_loss=0.176]
Steps: 1%| | 7408/1000000 [18:56:30<2169:22:37, 7.87s/it, lr=1e-5, step_loss=0.176][RANK-0]: Step: [7408], local_loss=0.1598542183637619, train_loss=0.0941685140132904, time_cost=2.619089365005493
+
Steps: 1%| | 7408/1000000 [18:56:30<2169:22:37, 7.87s/it, lr=1e-5, step_loss=0.16]
Steps: 1%| | 7409/1000000 [18:56:43<2638:10:11, 9.57s/it, lr=1e-5, step_loss=0.16][RANK-0]: Step: [7409], local_loss=0.026407688856124878, train_loss=0.21264836192131042, time_cost=1.239478349685669
+
Steps: 1%| | 7409/1000000 [18:56:43<2638:10:11, 9.57s/it, lr=1e-5, step_loss=0.0264]
Steps: 1%| | 7410/1000000 [18:56:48<2216:21:08, 8.04s/it, lr=1e-5, step_loss=0.0264][RANK-0]: Step: [7410], local_loss=0.009950391948223114, train_loss=0.028848519548773766, time_cost=1.3013873100280762
+
Steps: 1%| | 7410/1000000 [18:56:48<2216:21:08, 8.04s/it, lr=1e-5, step_loss=0.00995]
Steps: 1%| | 7411/1000000 [18:57:04<2867:58:26, 10.40s/it, lr=1e-5, step_loss=0.00995][RANK-0]: Step: [7411], local_loss=0.026759155094623566, train_loss=0.03217422962188721, time_cost=1.7672884464263916
+
Steps: 1%| | 7411/1000000 [18:57:04<2867:58:26, 10.40s/it, lr=1e-5, step_loss=0.0268]
Steps: 1%| | 7412/1000000 [18:57:16<3037:05:50, 11.02s/it, lr=1e-5, step_loss=0.0268][RANK-0]: Step: [7412], local_loss=0.01983054354786873, train_loss=0.12333010882139206, time_cost=8.673416376113892
+
Steps: 1%| | 7412/1000000 [18:57:16<3037:05:50, 11.02s/it, lr=1e-5, step_loss=0.0198]
Steps: 1%| | 7413/1000000 [18:57:23<2746:53:22, 9.96s/it, lr=1e-5, step_loss=0.0198][RANK-0]: Step: [7413], local_loss=0.06193070113658905, train_loss=0.04225774109363556, time_cost=5.7928736209869385
+
Steps: 1%| | 7413/1000000 [18:57:23<2746:53:22, 9.96s/it, lr=1e-5, step_loss=0.0619]
Steps: 1%| | 7414/1000000 [18:57:37<3051:41:24, 11.07s/it, lr=1e-5, step_loss=0.0619][RANK-0]: Step: [7414], local_loss=0.500005841255188, train_loss=0.2021791934967041, time_cost=4.985620021820068
+
Steps: 1%| | 7414/1000000 [18:57:37<3051:41:24, 11.07s/it, lr=1e-5, step_loss=0.5]
Steps: 1%| | 7415/1000000 [18:57:49<3111:54:58, 11.29s/it, lr=1e-5, step_loss=0.5][RANK-0]: Step: [7415], local_loss=0.01952155865728855, train_loss=0.03517094999551773, time_cost=1.7938685417175293
+
Steps: 1%| | 7415/1000000 [18:57:49<3111:54:58, 11.29s/it, lr=1e-5, step_loss=0.0195]
Steps: 1%| | 7416/1000000 [18:57:54<2581:07:32, 9.36s/it, lr=1e-5, step_loss=0.0195][RANK-0]: Step: [7416], local_loss=0.01355520635843277, train_loss=0.04255588725209236, time_cost=3.602739095687866
+
Steps: 1%| | 7416/1000000 [18:57:54<2581:07:32, 9.36s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 7417/1000000 [18:57:59<2249:20:32, 8.16s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [7417], local_loss=0.05653635039925575, train_loss=0.03999076411128044, time_cost=4.653803110122681
+
Steps: 1%| | 7417/1000000 [18:57:59<2249:20:32, 8.16s/it, lr=1e-5, step_loss=0.0565]
Steps: 1%| | 7418/1000000 [18:58:04<1961:15:40, 7.11s/it, lr=1e-5, step_loss=0.0565][RANK-0]: Step: [7418], local_loss=0.03856577351689339, train_loss=0.019892191514372826, time_cost=1.8930988311767578
+
Steps: 1%| | 7418/1000000 [18:58:04<1961:15:40, 7.11s/it, lr=1e-5, step_loss=0.0386]
Steps: 1%| | 7419/1000000 [18:58:18<2556:43:25, 9.27s/it, lr=1e-5, step_loss=0.0386][RANK-0]: Step: [7419], local_loss=0.020343469455838203, train_loss=0.029867108911275864, time_cost=10.784378051757812
+
Steps: 1%| | 7419/1000000 [18:58:18<2556:43:25, 9.27s/it, lr=1e-5, step_loss=0.0203]
Steps: 1%| | 7420/1000000 [18:58:24<2249:32:35, 8.16s/it, lr=1e-5, step_loss=0.0203][RANK-0]: Step: [7420], local_loss=0.9801168441772461, train_loss=0.1372768133878708, time_cost=4.5559656620025635
+
Steps: 1%| | 7420/1000000 [18:58:24<2249:32:35, 8.16s/it, lr=1e-5, step_loss=0.98]
Steps: 1%| | 7421/1000000 [18:58:29<1984:44:03, 7.20s/it, lr=1e-5, step_loss=0.98][RANK-0]: Step: [7421], local_loss=0.0243326835334301, train_loss=0.09118549525737762, time_cost=1.19809889793396
+
Steps: 1%| | 7421/1000000 [18:58:29<1984:44:03, 7.20s/it, lr=1e-5, step_loss=0.0243]
Steps: 1%| | 7422/1000000 [18:58:36<1968:51:10, 7.14s/it, lr=1e-5, step_loss=0.0243][RANK-0]: Step: [7422], local_loss=0.010975501500070095, train_loss=0.08583866059780121, time_cost=1.2678618431091309
+
Steps: 1%| | 7422/1000000 [18:58:36<1968:51:10, 7.14s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 7423/1000000 [18:58:51<2635:03:54, 9.56s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [7423], local_loss=0.010070634074509144, train_loss=0.046215515583753586, time_cost=4.7495410442352295
+
Steps: 1%| | 7423/1000000 [18:58:51<2635:03:54, 9.56s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 7424/1000000 [18:59:00<2581:21:39, 9.36s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [7424], local_loss=0.025249294936656952, train_loss=0.0745231956243515, time_cost=3.4213621616363525
+
Steps: 1%| | 7424/1000000 [18:59:00<2581:21:39, 9.36s/it, lr=1e-5, step_loss=0.0252]
Steps: 1%| | 7425/1000000 [18:59:09<2571:56:49, 9.33s/it, lr=1e-5, step_loss=0.0252][RANK-0]: Step: [7425], local_loss=0.04213237762451172, train_loss=0.033498018980026245, time_cost=6.055009365081787
+
Steps: 1%| | 7425/1000000 [18:59:09<2571:56:49, 9.33s/it, lr=1e-5, step_loss=0.0421]
Steps: 1%| | 7426/1000000 [18:59:22<2840:25:52, 10.30s/it, lr=1e-5, step_loss=0.0421][RANK-0]: Step: [7426], local_loss=0.04271654784679413, train_loss=0.029967082664370537, time_cost=3.18217134475708
+
Steps: 1%| | 7426/1000000 [18:59:22<2840:25:52, 10.30s/it, lr=1e-5, step_loss=0.0427]
Steps: 1%| | 7427/1000000 [18:59:33<2923:56:28, 10.60s/it, lr=1e-5, step_loss=0.0427][RANK-0]: Step: [7427], local_loss=0.02943248115479946, train_loss=0.025750044733285904, time_cost=2.554980993270874
+
Steps: 1%| | 7427/1000000 [18:59:33<2923:56:28, 10.60s/it, lr=1e-5, step_loss=0.0294]
Steps: 1%| | 7428/1000000 [18:59:37<2402:08:57, 8.71s/it, lr=1e-5, step_loss=0.0294][RANK-0]: Step: [7428], local_loss=0.07185613363981247, train_loss=0.019709665328264236, time_cost=1.337775707244873
+
Steps: 1%| | 7428/1000000 [18:59:37<2402:08:57, 8.71s/it, lr=1e-5, step_loss=0.0719]
Steps: 1%| | 7429/1000000 [18:59:44<2242:00:41, 8.13s/it, lr=1e-5, step_loss=0.0719][RANK-0]: Step: [7429], local_loss=0.05974731594324112, train_loss=0.0347023569047451, time_cost=1.2265186309814453
+
Steps: 1%| | 7429/1000000 [18:59:44<2242:00:41, 8.13s/it, lr=1e-5, step_loss=0.0597]
Steps: 1%| | 7430/1000000 [19:00:00<2920:16:34, 10.59s/it, lr=1e-5, step_loss=0.0597][RANK-0]: Step: [7430], local_loss=0.009237326681613922, train_loss=0.02345920354127884, time_cost=7.6804327964782715
+
Steps: 1%| | 7430/1000000 [19:00:00<2920:16:34, 10.59s/it, lr=1e-5, step_loss=0.00924]
Steps: 1%| | 7431/1000000 [19:00:11<2920:01:07, 10.59s/it, lr=1e-5, step_loss=0.00924][RANK-0]: Step: [7431], local_loss=0.010199816897511482, train_loss=0.022478759288787842, time_cost=2.093045234680176
+
Steps: 1%| | 7431/1000000 [19:00:11<2920:01:07, 10.59s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 7432/1000000 [19:00:18<2620:07:33, 9.50s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [7432], local_loss=0.03078356198966503, train_loss=6.6245012283325195, time_cost=1.2052206993103027
+
Steps: 1%| | 7432/1000000 [19:00:18<2620:07:33, 9.50s/it, lr=1e-5, step_loss=0.0308]
Steps: 1%| | 7433/1000000 [19:00:25<2418:08:15, 8.77s/it, lr=1e-5, step_loss=0.0308][RANK-0]: Step: [7433], local_loss=0.6651443243026733, train_loss=0.09763537347316742, time_cost=2.5120155811309814
+
Steps: 1%| | 7433/1000000 [19:00:25<2418:08:15, 8.77s/it, lr=1e-5, step_loss=0.665]
Steps: 1%| | 7434/1000000 [19:00:37<2665:38:58, 9.67s/it, lr=1e-5, step_loss=0.665][RANK-0]: Step: [7434], local_loss=0.04160912707448006, train_loss=0.061585575342178345, time_cost=1.9207613468170166
+
Steps: 1%| | 7434/1000000 [19:00:37<2665:38:58, 9.67s/it, lr=1e-5, step_loss=0.0416]
Steps: 1%| | 7435/1000000 [19:00:48<2816:26:19, 10.22s/it, lr=1e-5, step_loss=0.0416][RANK-0]: Step: [7435], local_loss=0.015059888362884521, train_loss=0.02585318684577942, time_cost=2.171516180038452
+
Steps: 1%| | 7435/1000000 [19:00:48<2816:26:19, 10.22s/it, lr=1e-5, step_loss=0.0151]
Steps: 1%| | 7436/1000000 [19:00:54<2462:24:56, 8.93s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [7436], local_loss=0.00990945566445589, train_loss=0.03379745036363602, time_cost=3.300673007965088
+
Steps: 1%| | 7436/1000000 [19:00:54<2462:24:56, 8.93s/it, lr=1e-5, step_loss=0.00991]
Steps: 1%| | 7437/1000000 [19:01:05<2594:51:06, 9.41s/it, lr=1e-5, step_loss=0.00991][RANK-0]: Step: [7437], local_loss=0.04894498735666275, train_loss=0.030747901648283005, time_cost=7.760608196258545
+
Steps: 1%| | 7437/1000000 [19:01:05<2594:51:06, 9.41s/it, lr=1e-5, step_loss=0.0489]
Steps: 1%| | 7438/1000000 [19:01:14<2551:44:56, 9.26s/it, lr=1e-5, step_loss=0.0489][RANK-0]: Step: [7438], local_loss=0.06870698928833008, train_loss=0.021150238811969757, time_cost=2.6916120052337646
+
Steps: 1%| | 7438/1000000 [19:01:14<2551:44:56, 9.26s/it, lr=1e-5, step_loss=0.0687]
Steps: 1%| | 7439/1000000 [19:01:18<2177:55:36, 7.90s/it, lr=1e-5, step_loss=0.0687][RANK-0]: Step: [7439], local_loss=0.033336177468299866, train_loss=0.08805682510137558, time_cost=1.631706714630127
+
Steps: 1%| | 7439/1000000 [19:01:18<2177:55:36, 7.90s/it, lr=1e-5, step_loss=0.0333]
Steps: 1%| | 7440/1000000 [19:01:27<2270:21:20, 8.23s/it, lr=1e-5, step_loss=0.0333][RANK-0]: Step: [7440], local_loss=0.02224365994334221, train_loss=0.030816253274679184, time_cost=3.0112478733062744
+
Steps: 1%| | 7440/1000000 [19:01:27<2270:21:20, 8.23s/it, lr=1e-5, step_loss=0.0222]
Steps: 1%| | 7441/1000000 [19:01:38<2469:22:10, 8.96s/it, lr=1e-5, step_loss=0.0222][RANK-0]: Step: [7441], local_loss=0.0593588724732399, train_loss=0.033637627959251404, time_cost=2.9646925926208496
+
Steps: 1%| | 7441/1000000 [19:01:38<2469:22:10, 8.96s/it, lr=1e-5, step_loss=0.0594]
Steps: 1%| | 7442/1000000 [19:01:52<2919:44:50, 10.59s/it, lr=1e-5, step_loss=0.0594][RANK-0]: Step: [7442], local_loss=0.06042594090104103, train_loss=0.05625075101852417, time_cost=4.710507154464722
+
Steps: 1%| | 7442/1000000 [19:01:52<2919:44:50, 10.59s/it, lr=1e-5, step_loss=0.0604]
Steps: 1%| | 7443/1000000 [19:01:57<2410:14:45, 8.74s/it, lr=1e-5, step_loss=0.0604][RANK-0]: Step: [7443], local_loss=0.006121908314526081, train_loss=0.020361920818686485, time_cost=1.2090342044830322
+
Steps: 1%| | 7443/1000000 [19:01:57<2410:14:45, 8.74s/it, lr=1e-5, step_loss=0.00612]
Steps: 1%| | 7444/1000000 [19:02:01<2047:09:12, 7.43s/it, lr=1e-5, step_loss=0.00612][RANK-0]: Step: [7444], local_loss=0.04365261644124985, train_loss=0.0415770523250103, time_cost=1.2070348262786865
+
Steps: 1%| | 7444/1000000 [19:02:01<2047:09:12, 7.43s/it, lr=1e-5, step_loss=0.0437]
Steps: 1%| | 7445/1000000 [19:02:15<2555:25:35, 9.27s/it, lr=1e-5, step_loss=0.0437][RANK-0]: Step: [7445], local_loss=0.3782854676246643, train_loss=0.12070681154727936, time_cost=9.83185076713562
+
Steps: 1%| | 7445/1000000 [19:02:15<2555:25:35, 9.27s/it, lr=1e-5, step_loss=0.378]
Steps: 1%| | 7446/1000000 [19:02:24<2566:59:45, 9.31s/it, lr=1e-5, step_loss=0.378][RANK-0]: Step: [7446], local_loss=0.02281287871301174, train_loss=0.025673698633909225, time_cost=2.0626378059387207
+
Steps: 1%| | 7446/1000000 [19:02:24<2566:59:45, 9.31s/it, lr=1e-5, step_loss=0.0228]
Steps: 1%| | 7447/1000000 [19:02:37<2835:54:02, 10.29s/it, lr=1e-5, step_loss=0.0228][RANK-0]: Step: [7447], local_loss=0.01144147478044033, train_loss=0.04192790389060974, time_cost=4.861479759216309
+
Steps: 1%| | 7447/1000000 [19:02:37<2835:54:02, 10.29s/it, lr=1e-5, step_loss=0.0114]
Steps: 1%| | 7448/1000000 [19:02:51<3189:46:50, 11.57s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [7448], local_loss=0.04520440101623535, train_loss=0.039689432829618454, time_cost=1.2284820079803467
+
Steps: 1%| | 7448/1000000 [19:02:51<3189:46:50, 11.57s/it, lr=1e-5, step_loss=0.0452]
Steps: 1%| | 7449/1000000 [19:03:00<2932:35:20, 10.64s/it, lr=1e-5, step_loss=0.0452][RANK-0]: Step: [7449], local_loss=0.016384705901145935, train_loss=0.06557926535606384, time_cost=2.3280675411224365
+
Steps: 1%| | 7449/1000000 [19:03:00<2932:35:20, 10.64s/it, lr=1e-5, step_loss=0.0164]
Steps: 1%| | 7450/1000000 [19:03:12<3041:36:08, 11.03s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [7450], local_loss=0.008909029886126518, train_loss=0.027565738186240196, time_cost=5.7234861850738525
+
Steps: 1%| | 7450/1000000 [19:03:12<3041:36:08, 11.03s/it, lr=1e-5, step_loss=0.00891]
Steps: 1%| | 7451/1000000 [19:03:19<2714:26:19, 9.85s/it, lr=1e-5, step_loss=0.00891][RANK-0]: Step: [7451], local_loss=0.026109663769602776, train_loss=0.021435774862766266, time_cost=3.4364607334136963
+
Steps: 1%| | 7451/1000000 [19:03:19<2714:26:19, 9.85s/it, lr=1e-5, step_loss=0.0261]
Steps: 1%| | 7452/1000000 [19:03:33<3047:23:57, 11.05s/it, lr=1e-5, step_loss=0.0261][RANK-0]: Step: [7452], local_loss=0.036202169954776764, train_loss=0.02183600328862667, time_cost=10.469658613204956
+
Steps: 1%| | 7452/1000000 [19:03:33<3047:23:57, 11.05s/it, lr=1e-5, step_loss=0.0362]
Steps: 1%| | 7453/1000000 [19:03:44<3053:18:34, 11.07s/it, lr=1e-5, step_loss=0.0362][RANK-0]: Step: [7453], local_loss=0.008479202166199684, train_loss=0.05948550999164581, time_cost=8.047302007675171
+
Steps: 1%| | 7453/1000000 [19:03:44<3053:18:34, 11.07s/it, lr=1e-5, step_loss=0.00848]
Steps: 1%| | 7454/1000000 [19:04:00<3460:36:21, 12.55s/it, lr=1e-5, step_loss=0.00848][RANK-0]: Step: [7454], local_loss=0.024579070508480072, train_loss=0.06863285601139069, time_cost=13.181352376937866
+
Steps: 1%| | 7454/1000000 [19:04:00<3460:36:21, 12.55s/it, lr=1e-5, step_loss=0.0246]
Steps: 1%| | 7455/1000000 [19:04:11<3352:44:33, 12.16s/it, lr=1e-5, step_loss=0.0246][RANK-0]: Step: [7455], local_loss=0.16409209370613098, train_loss=0.04160183668136597, time_cost=1.342766284942627
+
Steps: 1%| | 7455/1000000 [19:04:11<3352:44:33, 12.16s/it, lr=1e-5, step_loss=0.164]
Steps: 1%| | 7456/1000000 [19:04:24<3428:32:49, 12.44s/it, lr=1e-5, step_loss=0.164][RANK-0]: Step: [7456], local_loss=0.4859958291053772, train_loss=0.09506301581859589, time_cost=3.838258981704712
+
Steps: 1%| | 7456/1000000 [19:04:24<3428:32:49, 12.44s/it, lr=1e-5, step_loss=0.486]
Steps: 1%| | 7457/1000000 [19:04:46<4180:39:53, 15.16s/it, lr=1e-5, step_loss=0.486][RANK-0]: Step: [7457], local_loss=0.023636745288968086, train_loss=0.15167132019996643, time_cost=11.334041833877563
+
Steps: 1%| | 7457/1000000 [19:04:46<4180:39:53, 15.16s/it, lr=1e-5, step_loss=0.0236]
Steps: 1%| | 7458/1000000 [19:04:51<3379:00:09, 12.26s/it, lr=1e-5, step_loss=0.0236][RANK-0]: Step: [7458], local_loss=0.0055916523560881615, train_loss=0.033702485263347626, time_cost=2.5145750045776367
+
Steps: 1%| | 7458/1000000 [19:04:51<3379:00:09, 12.26s/it, lr=1e-5, step_loss=0.00559]
Steps: 1%| | 7459/1000000 [19:04:56<2797:42:40, 10.15s/it, lr=1e-5, step_loss=0.00559][RANK-0]: Step: [7459], local_loss=0.043392062187194824, train_loss=0.174494206905365, time_cost=1.3773927688598633
+
Steps: 1%| | 7459/1000000 [19:04:56<2797:42:40, 10.15s/it, lr=1e-5, step_loss=0.0434]
Steps: 1%| | 7460/1000000 [19:05:02<2434:28:42, 8.83s/it, lr=1e-5, step_loss=0.0434][RANK-0]: Step: [7460], local_loss=0.01480583380907774, train_loss=0.09002117067575455, time_cost=1.962151288986206
+
Steps: 1%| | 7460/1000000 [19:05:02<2434:28:42, 8.83s/it, lr=1e-5, step_loss=0.0148]
Steps: 1%| | 7461/1000000 [19:05:07<2113:20:32, 7.67s/it, lr=1e-5, step_loss=0.0148][RANK-0]: Step: [7461], local_loss=0.006583037786185741, train_loss=0.04599551856517792, time_cost=1.9574873447418213
+
Steps: 1%| | 7461/1000000 [19:05:07<2113:20:32, 7.67s/it, lr=1e-5, step_loss=0.00658]
Steps: 1%| | 7462/1000000 [19:05:22<2699:29:11, 9.79s/it, lr=1e-5, step_loss=0.00658][RANK-0]: Step: [7462], local_loss=0.17713220417499542, train_loss=0.052523910999298096, time_cost=4.599792957305908
+
Steps: 1%| | 7462/1000000 [19:05:22<2699:29:11, 9.79s/it, lr=1e-5, step_loss=0.177]
Steps: 1%| | 7463/1000000 [19:05:37<3155:14:04, 11.44s/it, lr=1e-5, step_loss=0.177][RANK-0]: Step: [7463], local_loss=0.012126053683459759, train_loss=0.015208819881081581, time_cost=12.521008014678955
+
Steps: 1%| | 7463/1000000 [19:05:37<3155:14:04, 11.44s/it, lr=1e-5, step_loss=0.0121]
Steps: 1%| | 7464/1000000 [19:05:42<2623:32:36, 9.52s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [7464], local_loss=0.018273532390594482, train_loss=0.022266928106546402, time_cost=1.2870726585388184
+
Steps: 1%| | 7464/1000000 [19:05:42<2623:32:36, 9.52s/it, lr=1e-5, step_loss=0.0183]
Steps: 1%| | 7465/1000000 [19:05:52<2702:18:14, 9.80s/it, lr=1e-5, step_loss=0.0183][RANK-0]: Step: [7465], local_loss=0.06077180802822113, train_loss=0.20185428857803345, time_cost=1.772252082824707
+
Steps: 1%| | 7465/1000000 [19:05:52<2702:18:14, 9.80s/it, lr=1e-5, step_loss=0.0608]
Steps: 1%| | 7466/1000000 [19:05:59<2399:30:26, 8.70s/it, lr=1e-5, step_loss=0.0608][RANK-0]: Step: [7466], local_loss=0.01712682470679283, train_loss=0.027810271829366684, time_cost=1.3395516872406006
+
Steps: 1%| | 7466/1000000 [19:05:59<2399:30:26, 8.70s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%| | 7467/1000000 [19:06:04<2100:42:04, 7.62s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [7467], local_loss=0.024831049144268036, train_loss=0.054668065160512924, time_cost=2.420032262802124
+
Steps: 1%| | 7467/1000000 [19:06:04<2100:42:04, 7.62s/it, lr=1e-5, step_loss=0.0248]
Steps: 1%| | 7468/1000000 [19:06:13<2258:42:29, 8.19s/it, lr=1e-5, step_loss=0.0248][RANK-0]: Step: [7468], local_loss=0.00972673948854208, train_loss=0.03175816684961319, time_cost=4.722328424453735
+
Steps: 1%| | 7468/1000000 [19:06:13<2258:42:29, 8.19s/it, lr=1e-5, step_loss=0.00973]
Steps: 1%| | 7469/1000000 [19:06:19<2038:07:37, 7.39s/it, lr=1e-5, step_loss=0.00973][RANK-0]: Step: [7469], local_loss=0.00863318145275116, train_loss=0.018758967518806458, time_cost=2.3402483463287354
+
Steps: 1%| | 7469/1000000 [19:06:19<2038:07:37, 7.39s/it, lr=1e-5, step_loss=0.00863]
Steps: 1%| | 7470/1000000 [19:06:24<1845:28:09, 6.69s/it, lr=1e-5, step_loss=0.00863][RANK-0]: Step: [7470], local_loss=0.01860729418694973, train_loss=0.018577277660369873, time_cost=1.9312188625335693
+
Steps: 1%| | 7470/1000000 [19:06:24<1845:28:09, 6.69s/it, lr=1e-5, step_loss=0.0186]
Steps: 1%| | 7471/1000000 [19:06:30<1843:59:23, 6.69s/it, lr=1e-5, step_loss=0.0186][RANK-0]: Step: [7471], local_loss=0.1865953952074051, train_loss=0.06538720428943634, time_cost=2.151127338409424
+
Steps: 1%| | 7471/1000000 [19:06:31<1843:59:23, 6.69s/it, lr=1e-5, step_loss=0.187]
Steps: 1%| | 7472/1000000 [19:06:39<2011:23:00, 7.30s/it, lr=1e-5, step_loss=0.187][RANK-0]: Step: [7472], local_loss=0.01586485467851162, train_loss=0.03421357646584511, time_cost=2.159374475479126
+
Steps: 1%| | 7472/1000000 [19:06:39<2011:23:00, 7.30s/it, lr=1e-5, step_loss=0.0159]
Steps: 1%| | 7473/1000000 [19:06:55<2673:20:50, 9.70s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [7473], local_loss=0.05530739575624466, train_loss=0.039170924574136734, time_cost=6.011081218719482
+
Steps: 1%| | 7473/1000000 [19:06:55<2673:20:50, 9.70s/it, lr=1e-5, step_loss=0.0553]
Steps: 1%| | 7474/1000000 [19:07:08<2965:44:33, 10.76s/it, lr=1e-5, step_loss=0.0553][RANK-0]: Step: [7474], local_loss=0.013601372949779034, train_loss=0.020846478641033173, time_cost=1.7099227905273438
+
Steps: 1%| | 7474/1000000 [19:07:08<2965:44:33, 10.76s/it, lr=1e-5, step_loss=0.0136]
Steps: 1%| | 7475/1000000 [19:07:17<2818:15:44, 10.22s/it, lr=1e-5, step_loss=0.0136][RANK-0]: Step: [7475], local_loss=0.015067625790834427, train_loss=0.02082662284374237, time_cost=1.301450252532959
+
Steps: 1%| | 7475/1000000 [19:07:17<2818:15:44, 10.22s/it, lr=1e-5, step_loss=0.0151]
Steps: 1%| | 7476/1000000 [19:07:28<2893:54:00, 10.50s/it, lr=1e-5, step_loss=0.0151][RANK-0]: Step: [7476], local_loss=0.2308642864227295, train_loss=0.07466532289981842, time_cost=2.4744250774383545
+
Steps: 1%| | 7476/1000000 [19:07:28<2893:54:00, 10.50s/it, lr=1e-5, step_loss=0.231]
Steps: 1%| | 7477/1000000 [19:07:35<2617:56:45, 9.50s/it, lr=1e-5, step_loss=0.231][RANK-0]: Step: [7477], local_loss=0.04720405489206314, train_loss=0.03845108672976494, time_cost=3.000814437866211
+
Steps: 1%| | 7477/1000000 [19:07:35<2617:56:45, 9.50s/it, lr=1e-5, step_loss=0.0472]
Steps: 1%| | 7478/1000000 [19:07:44<2607:01:12, 9.46s/it, lr=1e-5, step_loss=0.0472][RANK-0]: Step: [7478], local_loss=0.009643305093050003, train_loss=0.03672106936573982, time_cost=2.43123197555542
+
Steps: 1%| | 7478/1000000 [19:07:44<2607:01:12, 9.46s/it, lr=1e-5, step_loss=0.00964]
Steps: 1%| | 7479/1000000 [19:07:54<2629:50:02, 9.54s/it, lr=1e-5, step_loss=0.00964][RANK-0]: Step: [7479], local_loss=0.03710218146443367, train_loss=0.029326720163226128, time_cost=3.542503833770752
+
Steps: 1%| | 7479/1000000 [19:07:54<2629:50:02, 9.54s/it, lr=1e-5, step_loss=0.0371]
Steps: 1%| | 7480/1000000 [19:08:03<2603:07:16, 9.44s/it, lr=1e-5, step_loss=0.0371][RANK-0]: Step: [7480], local_loss=0.18968813121318817, train_loss=0.08899641036987305, time_cost=1.2945435047149658
+
Steps: 1%| | 7480/1000000 [19:08:03<2603:07:16, 9.44s/it, lr=1e-5, step_loss=0.19]
Steps: 1%| | 7481/1000000 [19:08:17<2945:48:57, 10.68s/it, lr=1e-5, step_loss=0.19][RANK-0]: Step: [7481], local_loss=0.04985247552394867, train_loss=0.042260847985744476, time_cost=2.6116981506347656
+
Steps: 1%| | 7481/1000000 [19:08:17<2945:48:57, 10.68s/it, lr=1e-5, step_loss=0.0499]
Steps: 1%| | 7482/1000000 [19:08:31<3228:00:25, 11.71s/it, lr=1e-5, step_loss=0.0499][RANK-0]: Step: [7482], local_loss=0.0027461983263492584, train_loss=0.07171283662319183, time_cost=2.2159643173217773
+
Steps: 1%| | 7482/1000000 [19:08:31<3228:00:25, 11.71s/it, lr=1e-5, step_loss=0.00275]
Steps: 1%| | 7483/1000000 [19:08:42<3169:26:33, 11.50s/it, lr=1e-5, step_loss=0.00275][RANK-0]: Step: [7483], local_loss=0.008510192856192589, train_loss=0.11748413741588593, time_cost=3.2509143352508545
+
Steps: 1%| | 7483/1000000 [19:08:42<3169:26:33, 11.50s/it, lr=1e-5, step_loss=0.00851]
Steps: 1%| | 7484/1000000 [19:08:50<2876:39:06, 10.43s/it, lr=1e-5, step_loss=0.00851][RANK-0]: Step: [7484], local_loss=0.07276489585638046, train_loss=0.04207666590809822, time_cost=3.034820318222046
+
Steps: 1%| | 7484/1000000 [19:08:50<2876:39:06, 10.43s/it, lr=1e-5, step_loss=0.0728]
Steps: 1%| | 7485/1000000 [19:08:57<2555:47:05, 9.27s/it, lr=1e-5, step_loss=0.0728][RANK-0]: Step: [7485], local_loss=0.01461000181734562, train_loss=0.04098903387784958, time_cost=1.3127200603485107
+
Steps: 1%| | 7485/1000000 [19:08:57<2555:47:05, 9.27s/it, lr=1e-5, step_loss=0.0146]
Steps: 1%| | 7486/1000000 [19:09:12<3039:27:01, 11.02s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [7486], local_loss=0.11052466183900833, train_loss=0.050255805253982544, time_cost=1.3120665550231934
+
Steps: 1%| | 7486/1000000 [19:09:12<3039:27:01, 11.02s/it, lr=1e-5, step_loss=0.111]
Steps: 1%| | 7487/1000000 [19:09:18<2617:23:15, 9.49s/it, lr=1e-5, step_loss=0.111][RANK-0]: Step: [7487], local_loss=0.0171139407902956, train_loss=0.016265658661723137, time_cost=1.5441570281982422
+
Steps: 1%| | 7487/1000000 [19:09:18<2617:23:15, 9.49s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%| | 7488/1000000 [19:09:22<2238:26:05, 8.12s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [7488], local_loss=0.08460701256990433, train_loss=0.14400514960289001, time_cost=1.2276477813720703
+
Steps: 1%| | 7488/1000000 [19:09:22<2238:26:05, 8.12s/it, lr=1e-5, step_loss=0.0846]
Steps: 1%| | 7489/1000000 [19:09:29<2126:50:28, 7.71s/it, lr=1e-5, step_loss=0.0846][RANK-0]: Step: [7489], local_loss=0.007740090601146221, train_loss=0.05325154587626457, time_cost=2.09997820854187
+
Steps: 1%| | 7489/1000000 [19:09:29<2126:50:28, 7.71s/it, lr=1e-5, step_loss=0.00774]
Steps: 1%| | 7490/1000000 [19:09:36<2081:03:54, 7.55s/it, lr=1e-5, step_loss=0.00774][RANK-0]: Step: [7490], local_loss=0.009980952367186546, train_loss=0.025393549352884293, time_cost=2.7869138717651367
+
Steps: 1%| | 7490/1000000 [19:09:36<2081:03:54, 7.55s/it, lr=1e-5, step_loss=0.00998]
Steps: 1%| | 7491/1000000 [19:09:48<2392:55:18, 8.68s/it, lr=1e-5, step_loss=0.00998][RANK-0]: Step: [7491], local_loss=0.008525905199348927, train_loss=41.038299560546875, time_cost=2.6155593395233154
+
Steps: 1%| | 7491/1000000 [19:09:48<2392:55:18, 8.68s/it, lr=1e-5, step_loss=0.00853]
Steps: 1%| | 7492/1000000 [19:09:53<2092:34:46, 7.59s/it, lr=1e-5, step_loss=0.00853][RANK-0]: Step: [7492], local_loss=0.017111795023083687, train_loss=0.04658707231283188, time_cost=2.0842068195343018
+
Steps: 1%| | 7492/1000000 [19:09:53<2092:34:46, 7.59s/it, lr=1e-5, step_loss=0.0171]
Steps: 1%| | 7493/1000000 [19:10:08<2693:56:59, 9.77s/it, lr=1e-5, step_loss=0.0171][RANK-0]: Step: [7493], local_loss=0.07053867727518082, train_loss=0.06315295398235321, time_cost=4.216069459915161
+
Steps: 1%| | 7493/1000000 [19:10:08<2693:56:59, 9.77s/it, lr=1e-5, step_loss=0.0705]
Steps: 1%| | 7494/1000000 [19:10:18<2736:50:52, 9.93s/it, lr=1e-5, step_loss=0.0705][RANK-0]: Step: [7494], local_loss=0.07913758605718613, train_loss=0.08293460309505463, time_cost=1.7068259716033936
+
Steps: 1%| | 7494/1000000 [19:10:18<2736:50:52, 9.93s/it, lr=1e-5, step_loss=0.0791]
Steps: 1%| | 7495/1000000 [19:10:30<2896:01:35, 10.50s/it, lr=1e-5, step_loss=0.0791][RANK-0]: Step: [7495], local_loss=0.010171826928853989, train_loss=0.04830238223075867, time_cost=1.237699270248413
+
Steps: 1%| | 7495/1000000 [19:10:30<2896:01:35, 10.50s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 7496/1000000 [19:10:39<2813:22:41, 10.20s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [7496], local_loss=0.010086167603731155, train_loss=0.01668209582567215, time_cost=2.507411003112793
+
Steps: 1%| | 7496/1000000 [19:10:39<2813:22:41, 10.20s/it, lr=1e-5, step_loss=0.0101]
Steps: 1%| | 7497/1000000 [19:10:45<2448:09:05, 8.88s/it, lr=1e-5, step_loss=0.0101][RANK-0]: Step: [7497], local_loss=0.014942566864192486, train_loss=0.052608709782361984, time_cost=1.5109643936157227
+
Steps: 1%| | 7497/1000000 [19:10:45<2448:09:05, 8.88s/it, lr=1e-5, step_loss=0.0149]
Steps: 1%| | 7498/1000000 [19:10:52<2275:14:17, 8.25s/it, lr=1e-5, step_loss=0.0149][RANK-0]: Step: [7498], local_loss=0.02545035630464554, train_loss=0.04550045728683472, time_cost=2.550969362258911
+
Steps: 1%| | 7498/1000000 [19:10:52<2275:14:17, 8.25s/it, lr=1e-5, step_loss=0.0255]
Steps: 1%| | 7499/1000000 [19:10:56<1942:43:52, 7.05s/it, lr=1e-5, step_loss=0.0255][RANK-0]: Step: [7499], local_loss=0.05133415013551712, train_loss=0.05525742843747139, time_cost=1.4534413814544678
+
Steps: 1%| | 7499/1000000 [19:10:56<1942:43:52, 7.05s/it, lr=1e-5, step_loss=0.0513]
Steps: 1%| | 7500/1000000 [19:11:05<2070:21:53, 7.51s/it, lr=1e-5, step_loss=0.0513][RANK-0]: Step: [7500], local_loss=0.012461554259061813, train_loss=0.025814585387706757, time_cost=4.560282230377197
+
Steps: 1%| | 7500/1000000 [19:11:05<2070:21:53, 7.51s/it, lr=1e-5, step_loss=0.0125]
Steps: 1%| | 7501/1000000 [19:11:15<2302:06:41, 8.35s/it, lr=1e-5, step_loss=0.0125][RANK-0]: Step: [7501], local_loss=0.012805096805095673, train_loss=0.031019356101751328, time_cost=1.7215847969055176
+
Steps: 1%| | 7501/1000000 [19:11:15<2302:06:41, 8.35s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%| | 7502/1000000 [19:11:20<2039:19:15, 7.40s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [7502], local_loss=0.04416896402835846, train_loss=0.030390426516532898, time_cost=2.0879597663879395
+
Steps: 1%| | 7502/1000000 [19:11:20<2039:19:15, 7.40s/it, lr=1e-5, step_loss=0.0442]
Steps: 1%| | 7503/1000000 [19:11:25<1867:30:02, 6.77s/it, lr=1e-5, step_loss=0.0442][RANK-0]: Step: [7503], local_loss=0.046889081597328186, train_loss=0.1032780259847641, time_cost=2.181946277618408
+
Steps: 1%| | 7503/1000000 [19:11:25<1867:30:02, 6.77s/it, lr=1e-5, step_loss=0.0469]
Steps: 1%| | 7504/1000000 [19:11:30<1714:26:32, 6.22s/it, lr=1e-5, step_loss=0.0469][RANK-0]: Step: [7504], local_loss=0.02337275631725788, train_loss=5.0918073654174805, time_cost=2.1462900638580322
+
Steps: 1%| | 7504/1000000 [19:11:30<1714:26:32, 6.22s/it, lr=1e-5, step_loss=0.0234]
Steps: 1%| | 7505/1000000 [19:11:36<1651:24:19, 5.99s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [7505], local_loss=0.01608482375741005, train_loss=0.026606684550642967, time_cost=2.755666971206665
+
Steps: 1%| | 7505/1000000 [19:11:36<1651:24:19, 5.99s/it, lr=1e-5, step_loss=0.0161]
Steps: 1%| | 7506/1000000 [19:11:50<2348:41:04, 8.52s/it, lr=1e-5, step_loss=0.0161][RANK-0]: Step: [7506], local_loss=0.07130730152130127, train_loss=0.02775394916534424, time_cost=5.131317138671875
+
Steps: 1%| | 7506/1000000 [19:11:50<2348:41:04, 8.52s/it, lr=1e-5, step_loss=0.0713]
Steps: 1%| | 7507/1000000 [19:12:02<2635:32:27, 9.56s/it, lr=1e-5, step_loss=0.0713][RANK-0]: Step: [7507], local_loss=0.010430640541017056, train_loss=0.018652919679880142, time_cost=3.592097043991089
+
Steps: 1%| | 7507/1000000 [19:12:02<2635:32:27, 9.56s/it, lr=1e-5, step_loss=0.0104]
Steps: 1%| | 7508/1000000 [19:12:11<2594:20:56, 9.41s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [7508], local_loss=0.015014046803116798, train_loss=0.036311838775873184, time_cost=1.630462884902954
+
Steps: 1%| | 7508/1000000 [19:12:11<2594:20:56, 9.41s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 7509/1000000 [19:12:16<2242:30:53, 8.13s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [7509], local_loss=0.07075537741184235, train_loss=0.04934214800596237, time_cost=2.21920108795166
+
Steps: 1%| | 7509/1000000 [19:12:16<2242:30:53, 8.13s/it, lr=1e-5, step_loss=0.0708]
Steps: 1%| | 7510/1000000 [19:12:22<1995:35:46, 7.24s/it, lr=1e-5, step_loss=0.0708][RANK-0]: Step: [7510], local_loss=0.02228609286248684, train_loss=0.04082665219902992, time_cost=2.0078420639038086
+
Steps: 1%| | 7510/1000000 [19:12:22<1995:35:46, 7.24s/it, lr=1e-5, step_loss=0.0223]
Steps: 1%| | 7511/1000000 [19:12:31<2136:52:59, 7.75s/it, lr=1e-5, step_loss=0.0223][RANK-0]: Step: [7511], local_loss=0.057866115123033524, train_loss=0.02248864807188511, time_cost=2.5124268531799316
+
Steps: 1%| | 7511/1000000 [19:12:31<2136:52:59, 7.75s/it, lr=1e-5, step_loss=0.0579]
Steps: 1%| | 7512/1000000 [19:12:44<2625:12:12, 9.52s/it, lr=1e-5, step_loss=0.0579][RANK-0]: Step: [7512], local_loss=0.009494643658399582, train_loss=0.027786336839199066, time_cost=3.9936342239379883
+
Steps: 1%| | 7512/1000000 [19:12:44<2625:12:12, 9.52s/it, lr=1e-5, step_loss=0.00949]
Steps: 1%| | 7513/1000000 [19:12:56<2827:08:05, 10.25s/it, lr=1e-5, step_loss=0.00949][RANK-0]: Step: [7513], local_loss=0.015620319172739983, train_loss=0.022487584501504898, time_cost=4.338054895401001
+
Steps: 1%| | 7513/1000000 [19:12:56<2827:08:05, 10.25s/it, lr=1e-5, step_loss=0.0156]
Steps: 1%| | 7514/1000000 [19:13:01<2349:14:49, 8.52s/it, lr=1e-5, step_loss=0.0156][RANK-0]: Step: [7514], local_loss=0.013174088671803474, train_loss=0.08052173256874084, time_cost=1.9804353713989258
+
Steps: 1%| | 7514/1000000 [19:13:01<2349:14:49, 8.52s/it, lr=1e-5, step_loss=0.0132]
Steps: 1%| | 7515/1000000 [19:13:06<2075:40:43, 7.53s/it, lr=1e-5, step_loss=0.0132][RANK-0]: Step: [7515], local_loss=0.008064288645982742, train_loss=52.34651565551758, time_cost=2.2957189083099365
+
Steps: 1%| | 7515/1000000 [19:13:06<2075:40:43, 7.53s/it, lr=1e-5, step_loss=0.00806]
Steps: 1%| | 7516/1000000 [19:13:13<2011:36:24, 7.30s/it, lr=1e-5, step_loss=0.00806][RANK-0]: Step: [7516], local_loss=0.032517511397600174, train_loss=0.0497424453496933, time_cost=1.2326409816741943
+
Steps: 1%| | 7516/1000000 [19:13:13<2011:36:24, 7.30s/it, lr=1e-5, step_loss=0.0325]
Steps: 1%| | 7517/1000000 [19:13:22<2164:51:35, 7.85s/it, lr=1e-5, step_loss=0.0325][RANK-0]: Step: [7517], local_loss=0.009517095051705837, train_loss=0.020315025001764297, time_cost=1.808781385421753
+
Steps: 1%| | 7517/1000000 [19:13:22<2164:51:35, 7.85s/it, lr=1e-5, step_loss=0.00952]
Steps: 1%| | 7518/1000000 [19:13:28<2069:27:59, 7.51s/it, lr=1e-5, step_loss=0.00952][RANK-0]: Step: [7518], local_loss=0.05365920811891556, train_loss=0.05866461247205734, time_cost=2.770493984222412
+
Steps: 1%| | 7518/1000000 [19:13:28<2069:27:59, 7.51s/it, lr=1e-5, step_loss=0.0537]
Steps: 1%| | 7519/1000000 [19:13:40<2383:49:45, 8.65s/it, lr=1e-5, step_loss=0.0537][RANK-0]: Step: [7519], local_loss=0.01104743778705597, train_loss=0.03945264220237732, time_cost=1.2096028327941895
+
Steps: 1%| | 7519/1000000 [19:13:40<2383:49:45, 8.65s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 7520/1000000 [19:13:54<2825:38:22, 10.25s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [7520], local_loss=0.08922886848449707, train_loss=0.08433955907821655, time_cost=5.488579511642456
+
Steps: 1%| | 7520/1000000 [19:13:54<2825:38:22, 10.25s/it, lr=1e-5, step_loss=0.0892]
Steps: 1%| | 7521/1000000 [19:14:04<2857:36:14, 10.37s/it, lr=1e-5, step_loss=0.0892][RANK-0]: Step: [7521], local_loss=0.010932058095932007, train_loss=0.04759572446346283, time_cost=1.570277452468872
+
Steps: 1%| | 7521/1000000 [19:14:04<2857:36:14, 10.37s/it, lr=1e-5, step_loss=0.0109]
Steps: 1%| | 7522/1000000 [19:14:13<2726:31:20, 9.89s/it, lr=1e-5, step_loss=0.0109][RANK-0]: Step: [7522], local_loss=0.10532289743423462, train_loss=0.03308425843715668, time_cost=3.6119213104248047
+
Steps: 1%| | 7522/1000000 [19:14:13<2726:31:20, 9.89s/it, lr=1e-5, step_loss=0.105]
Steps: 1%| | 7523/1000000 [19:14:29<3234:13:59, 11.73s/it, lr=1e-5, step_loss=0.105][RANK-0]: Step: [7523], local_loss=0.06678059697151184, train_loss=0.07119853794574738, time_cost=2.1079349517822266
+
Steps: 1%| | 7523/1000000 [19:14:29<3234:13:59, 11.73s/it, lr=1e-5, step_loss=0.0668]
Steps: 1%| | 7524/1000000 [19:14:40<3185:27:12, 11.55s/it, lr=1e-5, step_loss=0.0668][RANK-0]: Step: [7524], local_loss=0.016387177631258965, train_loss=0.015393365174531937, time_cost=1.3020484447479248
+
Steps: 1%| | 7524/1000000 [19:14:40<3185:27:12, 11.55s/it, lr=1e-5, step_loss=0.0164]
Steps: 1%| | 7525/1000000 [19:14:50<3027:00:37, 10.98s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [7525], local_loss=0.014316212385892868, train_loss=0.09479638189077377, time_cost=1.873786211013794
+
Steps: 1%| | 7525/1000000 [19:14:50<3027:00:37, 10.98s/it, lr=1e-5, step_loss=0.0143]
Steps: 1%| | 7526/1000000 [19:14:55<2542:41:44, 9.22s/it, lr=1e-5, step_loss=0.0143][RANK-0]: Step: [7526], local_loss=0.032183773815631866, train_loss=15.84921932220459, time_cost=1.3298890590667725
+
Steps: 1%| | 7526/1000000 [19:14:55<2542:41:44, 9.22s/it, lr=1e-5, step_loss=0.0322]
Steps: 1%| | 7527/1000000 [19:15:02<2376:04:57, 8.62s/it, lr=1e-5, step_loss=0.0322][RANK-0]: Step: [7527], local_loss=0.008138405159115791, train_loss=0.02030119113624096, time_cost=4.209859609603882
+
Steps: 1%| | 7527/1000000 [19:15:02<2376:04:57, 8.62s/it, lr=1e-5, step_loss=0.00814]
Steps: 1%| | 7528/1000000 [19:15:18<2940:55:55, 10.67s/it, lr=1e-5, step_loss=0.00814][RANK-0]: Step: [7528], local_loss=0.012998832389712334, train_loss=0.033596351742744446, time_cost=6.7741899490356445
+
Steps: 1%| | 7528/1000000 [19:15:18<2940:55:55, 10.67s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 7529/1000000 [19:15:23<2464:01:50, 8.94s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [7529], local_loss=0.020649077370762825, train_loss=0.22858625650405884, time_cost=1.8877418041229248
+
Steps: 1%| | 7529/1000000 [19:15:23<2464:01:50, 8.94s/it, lr=1e-5, step_loss=0.0206]
Steps: 1%| | 7530/1000000 [19:15:35<2748:35:12, 9.97s/it, lr=1e-5, step_loss=0.0206][RANK-0]: Step: [7530], local_loss=0.012829354032874107, train_loss=0.020267076790332794, time_cost=3.1344525814056396
+
Steps: 1%| | 7530/1000000 [19:15:35<2748:35:12, 9.97s/it, lr=1e-5, step_loss=0.0128]
Steps: 1%| | 7531/1000000 [19:15:40<2339:07:30, 8.48s/it, lr=1e-5, step_loss=0.0128][RANK-0]: Step: [7531], local_loss=0.13632065057754517, train_loss=0.03988821059465408, time_cost=1.226456642150879
+
Steps: 1%| | 7531/1000000 [19:15:40<2339:07:30, 8.48s/it, lr=1e-5, step_loss=0.136]
Steps: 1%| | 7532/1000000 [19:15:57<2999:33:31, 10.88s/it, lr=1e-5, step_loss=0.136][RANK-0]: Step: [7532], local_loss=0.023240789771080017, train_loss=0.07740408182144165, time_cost=8.141803979873657
+
Steps: 1%| | 7532/1000000 [19:15:57<2999:33:31, 10.88s/it, lr=1e-5, step_loss=0.0232]
Steps: 1%| | 7533/1000000 [19:16:06<2862:17:17, 10.38s/it, lr=1e-5, step_loss=0.0232][RANK-0]: Step: [7533], local_loss=0.012997787445783615, train_loss=0.04601535201072693, time_cost=1.7466239929199219
+
Steps: 1%| | 7533/1000000 [19:16:06<2862:17:17, 10.38s/it, lr=1e-5, step_loss=0.013]
Steps: 1%| | 7534/1000000 [19:16:12<2508:20:37, 9.10s/it, lr=1e-5, step_loss=0.013][RANK-0]: Step: [7534], local_loss=0.010599958710372448, train_loss=0.0547625795006752, time_cost=1.6276679039001465
+
Steps: 1%| | 7534/1000000 [19:16:12<2508:20:37, 9.10s/it, lr=1e-5, step_loss=0.0106]
Steps: 1%| | 7535/1000000 [19:16:16<2107:28:49, 7.64s/it, lr=1e-5, step_loss=0.0106][RANK-0]: Step: [7535], local_loss=0.01600760407745838, train_loss=0.027250204235315323, time_cost=1.2346694469451904
+
Steps: 1%| | 7535/1000000 [19:16:16<2107:28:49, 7.64s/it, lr=1e-5, step_loss=0.016]
Steps: 1%| | 7536/1000000 [19:16:27<2332:34:00, 8.46s/it, lr=1e-5, step_loss=0.016][RANK-0]: Step: [7536], local_loss=0.021270915865898132, train_loss=0.14925411343574524, time_cost=5.284202575683594
+
Steps: 1%| | 7536/1000000 [19:16:27<2332:34:00, 8.46s/it, lr=1e-5, step_loss=0.0213]
Steps: 1%| | 7537/1000000 [19:16:36<2427:47:43, 8.81s/it, lr=1e-5, step_loss=0.0213][RANK-0]: Step: [7537], local_loss=0.05658597871661186, train_loss=0.06010333448648453, time_cost=1.6174252033233643
+
Steps: 1%| | 7537/1000000 [19:16:36<2427:47:43, 8.81s/it, lr=1e-5, step_loss=0.0566]
Steps: 1%| | 7538/1000000 [19:16:43<2282:23:09, 8.28s/it, lr=1e-5, step_loss=0.0566][RANK-0]: Step: [7538], local_loss=0.01551937684416771, train_loss=0.14766764640808105, time_cost=2.5367653369903564
+
Steps: 1%| | 7538/1000000 [19:16:43<2282:23:09, 8.28s/it, lr=1e-5, step_loss=0.0155]
Steps: 1%| | 7539/1000000 [19:16:54<2463:39:01, 8.94s/it, lr=1e-5, step_loss=0.0155][RANK-0]: Step: [7539], local_loss=0.03363122045993805, train_loss=0.02912934124469757, time_cost=3.214677333831787
+
Steps: 1%| | 7539/1000000 [19:16:54<2463:39:01, 8.94s/it, lr=1e-5, step_loss=0.0336]
Steps: 1%| | 7540/1000000 [19:16:59<2166:42:03, 7.86s/it, lr=1e-5, step_loss=0.0336][RANK-0]: Step: [7540], local_loss=0.021496839821338654, train_loss=0.028463443741202354, time_cost=4.31702184677124
+
Steps: 1%| | 7540/1000000 [19:16:59<2166:42:03, 7.86s/it, lr=1e-5, step_loss=0.0215]
Steps: 1%| | 7541/1000000 [19:17:11<2491:46:22, 9.04s/it, lr=1e-5, step_loss=0.0215][RANK-0]: Step: [7541], local_loss=0.41706719994544983, train_loss=0.07255244255065918, time_cost=1.7846996784210205
+
Steps: 1%| | 7541/1000000 [19:17:11<2491:46:22, 9.04s/it, lr=1e-5, step_loss=0.417]
Steps: 1%| | 7542/1000000 [19:17:16<2169:29:04, 7.87s/it, lr=1e-5, step_loss=0.417][RANK-0]: Step: [7542], local_loss=0.036766115576028824, train_loss=0.04244333133101463, time_cost=2.433835983276367
+
Steps: 1%| | 7542/1000000 [19:17:16<2169:29:04, 7.87s/it, lr=1e-5, step_loss=0.0368]
Steps: 1%| | 7543/1000000 [19:17:24<2148:20:26, 7.79s/it, lr=1e-5, step_loss=0.0368][RANK-0]: Step: [7543], local_loss=0.05096495524048805, train_loss=0.03830189257860184, time_cost=3.0479116439819336
+
Steps: 1%| | 7543/1000000 [19:17:24<2148:20:26, 7.79s/it, lr=1e-5, step_loss=0.051]
Steps: 1%| | 7544/1000000 [19:17:40<2879:12:51, 10.44s/it, lr=1e-5, step_loss=0.051][RANK-0]: Step: [7544], local_loss=0.0450437068939209, train_loss=0.04559659957885742, time_cost=1.3070127964019775
+
Steps: 1%| | 7544/1000000 [19:17:40<2879:12:51, 10.44s/it, lr=1e-5, step_loss=0.045]
Steps: 1%| | 7545/1000000 [19:17:54<3141:45:24, 11.40s/it, lr=1e-5, step_loss=0.045][RANK-0]: Step: [7545], local_loss=0.0235042255371809, train_loss=0.019283685833215714, time_cost=6.0117645263671875
+
Steps: 1%| | 7545/1000000 [19:17:54<3141:45:24, 11.40s/it, lr=1e-5, step_loss=0.0235]
Steps: 1%| | 7546/1000000 [19:18:05<3130:06:18, 11.35s/it, lr=1e-5, step_loss=0.0235][RANK-0]: Step: [7546], local_loss=0.007353649940341711, train_loss=0.0342007651925087, time_cost=1.3094842433929443
+
Steps: 1%| | 7546/1000000 [19:18:05<3130:06:18, 11.35s/it, lr=1e-5, step_loss=0.00735]
Steps: 1%| | 7547/1000000 [19:18:14<2965:05:15, 10.76s/it, lr=1e-5, step_loss=0.00735][RANK-0]: Step: [7547], local_loss=0.3065115213394165, train_loss=0.053447019308805466, time_cost=5.381096363067627
+
Steps: 1%| | 7547/1000000 [19:18:14<2965:05:15, 10.76s/it, lr=1e-5, step_loss=0.307]
Steps: 1%| | 7548/1000000 [19:18:21<2607:13:25, 9.46s/it, lr=1e-5, step_loss=0.307][RANK-0]: Step: [7548], local_loss=0.09514619410037994, train_loss=0.04146794229745865, time_cost=2.6614608764648438
+
Steps: 1%| | 7548/1000000 [19:18:21<2607:13:25, 9.46s/it, lr=1e-5, step_loss=0.0951]
Steps: 1%| | 7549/1000000 [19:18:28<2399:53:11, 8.71s/it, lr=1e-5, step_loss=0.0951][RANK-0]: Step: [7549], local_loss=0.12024571001529694, train_loss=0.06678877025842667, time_cost=2.5473644733428955
+
Steps: 1%| | 7549/1000000 [19:18:28<2399:53:11, 8.71s/it, lr=1e-5, step_loss=0.12]
Steps: 1%| | 7550/1000000 [19:18:37<2457:08:07, 8.91s/it, lr=1e-5, step_loss=0.12][RANK-0]: Step: [7550], local_loss=0.013361607678234577, train_loss=0.020540563389658928, time_cost=3.2443156242370605
+
Steps: 1%| | 7550/1000000 [19:18:37<2457:08:07, 8.91s/it, lr=1e-5, step_loss=0.0134]
Steps: 1%| | 7551/1000000 [19:18:51<2891:03:52, 10.49s/it, lr=1e-5, step_loss=0.0134][RANK-0]: Step: [7551], local_loss=0.03133228048682213, train_loss=0.0446833036839962, time_cost=1.2401907444000244
+
Steps: 1%| | 7551/1000000 [19:18:51<2891:03:52, 10.49s/it, lr=1e-5, step_loss=0.0313]
Steps: 1%| | 7552/1000000 [19:19:03<3004:42:04, 10.90s/it, lr=1e-5, step_loss=0.0313][RANK-0]: Step: [7552], local_loss=0.012269150465726852, train_loss=0.02310331165790558, time_cost=8.836901426315308
+
Steps: 1%| | 7552/1000000 [19:19:03<3004:42:04, 10.90s/it, lr=1e-5, step_loss=0.0123]
Steps: 1%| | 7553/1000000 [19:19:13<2899:44:14, 10.52s/it, lr=1e-5, step_loss=0.0123][RANK-0]: Step: [7553], local_loss=0.009427347220480442, train_loss=0.06363111734390259, time_cost=1.2566125392913818
+
Steps: 1%| | 7553/1000000 [19:19:13<2899:44:14, 10.52s/it, lr=1e-5, step_loss=0.00943]
Steps: 1%| | 7554/1000000 [19:19:20<2644:11:06, 9.59s/it, lr=1e-5, step_loss=0.00943][RANK-0]: Step: [7554], local_loss=0.02410121075809002, train_loss=0.016398867592215538, time_cost=3.6521599292755127
+
Steps: 1%| | 7554/1000000 [19:19:20<2644:11:06, 9.59s/it, lr=1e-5, step_loss=0.0241]
Steps: 1%| | 7555/1000000 [19:19:26<2332:49:31, 8.46s/it, lr=1e-5, step_loss=0.0241][RANK-0]: Step: [7555], local_loss=0.012389526702463627, train_loss=0.07439129799604416, time_cost=1.4814527034759521
+
Steps: 1%| | 7555/1000000 [19:19:26<2332:49:31, 8.46s/it, lr=1e-5, step_loss=0.0124]
Steps: 1%| | 7556/1000000 [19:19:35<2347:25:21, 8.52s/it, lr=1e-5, step_loss=0.0124][RANK-0]: Step: [7556], local_loss=0.02658974751830101, train_loss=0.03984757512807846, time_cost=3.992257833480835
+
Steps: 1%| | 7556/1000000 [19:19:35<2347:25:21, 8.52s/it, lr=1e-5, step_loss=0.0266]
Steps: 1%| | 7557/1000000 [19:19:48<2742:28:25, 9.95s/it, lr=1e-5, step_loss=0.0266][RANK-0]: Step: [7557], local_loss=0.04773451387882233, train_loss=0.04183201119303703, time_cost=4.6023383140563965
+
Steps: 1%| | 7557/1000000 [19:19:48<2742:28:25, 9.95s/it, lr=1e-5, step_loss=0.0477]
Steps: 1%| | 7558/1000000 [19:19:58<2723:58:52, 9.88s/it, lr=1e-5, step_loss=0.0477][RANK-0]: Step: [7558], local_loss=0.042237021028995514, train_loss=0.07104633748531342, time_cost=1.2403781414031982
+
Steps: 1%| | 7558/1000000 [19:19:58<2723:58:52, 9.88s/it, lr=1e-5, step_loss=0.0422]
Steps: 1%| | 7559/1000000 [19:20:08<2731:26:26, 9.91s/it, lr=1e-5, step_loss=0.0422][RANK-0]: Step: [7559], local_loss=0.14641369879245758, train_loss=0.05948254466056824, time_cost=4.237059593200684
+
Steps: 1%| | 7559/1000000 [19:20:08<2731:26:26, 9.91s/it, lr=1e-5, step_loss=0.146]
Steps: 1%| | 7560/1000000 [19:20:16<2635:19:05, 9.56s/it, lr=1e-5, step_loss=0.146][RANK-0]: Step: [7560], local_loss=0.01786203682422638, train_loss=0.019940387457609177, time_cost=2.9605588912963867
+
Steps: 1%| | 7560/1000000 [19:20:16<2635:19:05, 9.56s/it, lr=1e-5, step_loss=0.0179]
Steps: 1%| | 7561/1000000 [19:20:28<2762:42:03, 10.02s/it, lr=1e-5, step_loss=0.0179][RANK-0]: Step: [7561], local_loss=0.015003453940153122, train_loss=0.05257219076156616, time_cost=3.426156759262085
+
Steps: 1%| | 7561/1000000 [19:20:28<2762:42:03, 10.02s/it, lr=1e-5, step_loss=0.015]
Steps: 1%| | 7562/1000000 [19:20:38<2816:17:13, 10.22s/it, lr=1e-5, step_loss=0.015][RANK-0]: Step: [7562], local_loss=0.02848886325955391, train_loss=0.03237856179475784, time_cost=5.014204263687134
+
Steps: 1%| | 7562/1000000 [19:20:38<2816:17:13, 10.22s/it, lr=1e-5, step_loss=0.0285]
Steps: 1%| | 7563/1000000 [19:20:52<3106:55:54, 11.27s/it, lr=1e-5, step_loss=0.0285][RANK-0]: Step: [7563], local_loss=0.10987161844968796, train_loss=0.04366908594965935, time_cost=5.364692211151123
+
Steps: 1%| | 7563/1000000 [19:20:52<3106:55:54, 11.27s/it, lr=1e-5, step_loss=0.11]
Steps: 1%| | 7564/1000000 [19:21:04<3158:20:14, 11.46s/it, lr=1e-5, step_loss=0.11][RANK-0]: Step: [7564], local_loss=0.0102031035348773, train_loss=0.051712822169065475, time_cost=4.7649147510528564
+
Steps: 1%| | 7564/1000000 [19:21:04<3158:20:14, 11.46s/it, lr=1e-5, step_loss=0.0102]
Steps: 1%| | 7565/1000000 [19:21:17<3284:57:31, 11.92s/it, lr=1e-5, step_loss=0.0102][RANK-0]: Step: [7565], local_loss=0.028858492150902748, train_loss=14.145042419433594, time_cost=5.304518938064575
+
Steps: 1%| | 7565/1000000 [19:21:17<3284:57:31, 11.92s/it, lr=1e-5, step_loss=0.0289]
Steps: 1%| | 7566/1000000 [19:21:22<2719:47:20, 9.87s/it, lr=1e-5, step_loss=0.0289][RANK-0]: Step: [7566], local_loss=0.014657027088105679, train_loss=0.030239112675189972, time_cost=1.238832950592041
+
Steps: 1%| | 7566/1000000 [19:21:22<2719:47:20, 9.87s/it, lr=1e-5, step_loss=0.0147]
Steps: 1%| | 7567/1000000 [19:21:32<2708:51:11, 9.83s/it, lr=1e-5, step_loss=0.0147][RANK-0]: Step: [7567], local_loss=0.0067008794285357, train_loss=0.07675066590309143, time_cost=1.258955955505371
+
Steps: 1%| | 7567/1000000 [19:21:32<2708:51:11, 9.83s/it, lr=1e-5, step_loss=0.0067]
Steps: 1%| | 7568/1000000 [19:21:41<2685:36:01, 9.74s/it, lr=1e-5, step_loss=0.0067][RANK-0]: Step: [7568], local_loss=0.9863988757133484, train_loss=0.13854749500751495, time_cost=1.2659142017364502
+
Steps: 1%| | 7568/1000000 [19:21:41<2685:36:01, 9.74s/it, lr=1e-5, step_loss=0.986]
Steps: 1%| | 7569/1000000 [19:21:47<2366:14:56, 8.58s/it, lr=1e-5, step_loss=0.986][RANK-0]: Step: [7569], local_loss=0.18237288296222687, train_loss=0.08292052894830704, time_cost=1.9556775093078613
+
Steps: 1%| | 7569/1000000 [19:21:47<2366:14:56, 8.58s/it, lr=1e-5, step_loss=0.182]
Steps: 1%| | 7570/1000000 [19:21:53<2162:58:15, 7.85s/it, lr=1e-5, step_loss=0.182][RANK-0]: Step: [7570], local_loss=0.03732869774103165, train_loss=0.07536642253398895, time_cost=1.2978415489196777
+
Steps: 1%| | 7570/1000000 [19:21:53<2162:58:15, 7.85s/it, lr=1e-5, step_loss=0.0373]
Steps: 1%| | 7571/1000000 [19:22:00<2110:20:52, 7.66s/it, lr=1e-5, step_loss=0.0373][RANK-0]: Step: [7571], local_loss=0.17314425110816956, train_loss=0.07085153460502625, time_cost=5.321019649505615
+
Steps: 1%| | 7571/1000000 [19:22:00<2110:20:52, 7.66s/it, lr=1e-5, step_loss=0.173]
Steps: 1%| | 7572/1000000 [19:22:05<1837:19:51, 6.66s/it, lr=1e-5, step_loss=0.173][RANK-0]: Step: [7572], local_loss=0.025276828557252884, train_loss=0.05378525331616402, time_cost=1.7395117282867432
+
Steps: 1%| | 7572/1000000 [19:22:05<1837:19:51, 6.66s/it, lr=1e-5, step_loss=0.0253]
Steps: 1%| | 7573/1000000 [19:22:09<1646:12:28, 5.97s/it, lr=1e-5, step_loss=0.0253][RANK-0]: Step: [7573], local_loss=0.0157563965767622, train_loss=0.037516187876462936, time_cost=1.6065950393676758
+
Steps: 1%| | 7573/1000000 [19:22:09<1646:12:28, 5.97s/it, lr=1e-5, step_loss=0.0158]
Steps: 1%| | 7574/1000000 [19:22:18<1894:20:30, 6.87s/it, lr=1e-5, step_loss=0.0158][RANK-0]: Step: [7574], local_loss=0.021718785166740417, train_loss=0.05212181434035301, time_cost=3.163060426712036
+
Steps: 1%| | 7574/1000000 [19:22:18<1894:20:30, 6.87s/it, lr=1e-5, step_loss=0.0217]
Steps: 1%| | 7575/1000000 [19:22:28<2176:17:48, 7.89s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [7575], local_loss=0.06370813399553299, train_loss=0.07312008738517761, time_cost=1.6322009563446045
+
Steps: 1%| | 7575/1000000 [19:22:28<2176:17:48, 7.89s/it, lr=1e-5, step_loss=0.0637]
Steps: 1%| | 7576/1000000 [19:22:40<2489:25:16, 9.03s/it, lr=1e-5, step_loss=0.0637][RANK-0]: Step: [7576], local_loss=0.006931479088962078, train_loss=0.06276186555624008, time_cost=4.300534725189209
+
Steps: 1%| | 7576/1000000 [19:22:40<2489:25:16, 9.03s/it, lr=1e-5, step_loss=0.00693]
Steps: 1%| | 7577/1000000 [19:22:55<2967:57:47, 10.77s/it, lr=1e-5, step_loss=0.00693][RANK-0]: Step: [7577], local_loss=0.01971203088760376, train_loss=0.04227989912033081, time_cost=6.442862272262573
+
Steps: 1%| | 7577/1000000 [19:22:55<2967:57:47, 10.77s/it, lr=1e-5, step_loss=0.0197]
Steps: 1%| | 7578/1000000 [19:23:08<3198:36:48, 11.60s/it, lr=1e-5, step_loss=0.0197][RANK-0]: Step: [7578], local_loss=0.027749886736273766, train_loss=0.02978650853037834, time_cost=1.895155668258667
+
Steps: 1%| | 7578/1000000 [19:23:08<3198:36:48, 11.60s/it, lr=1e-5, step_loss=0.0277]
Steps: 1%| | 7579/1000000 [19:23:14<2739:59:09, 9.94s/it, lr=1e-5, step_loss=0.0277][RANK-0]: Step: [7579], local_loss=0.02839859202504158, train_loss=0.02477733977138996, time_cost=1.751051902770996
+
Steps: 1%| | 7579/1000000 [19:23:14<2739:59:09, 9.94s/it, lr=1e-5, step_loss=0.0284]
Steps: 1%| | 7580/1000000 [19:23:24<2672:33:58, 9.69s/it, lr=1e-5, step_loss=0.0284][RANK-0]: Step: [7580], local_loss=0.007124762050807476, train_loss=0.03408662974834442, time_cost=3.836139678955078
+
Steps: 1%| | 7580/1000000 [19:23:24<2672:33:58, 9.69s/it, lr=1e-5, step_loss=0.00712]
Steps: 1%| | 7581/1000000 [19:23:37<2966:46:58, 10.76s/it, lr=1e-5, step_loss=0.00712][RANK-0]: Step: [7581], local_loss=0.07616224139928818, train_loss=0.05750910937786102, time_cost=5.309298276901245
+
Steps: 1%| | 7581/1000000 [19:23:37<2966:46:58, 10.76s/it, lr=1e-5, step_loss=0.0762]
Steps: 1%| | 7582/1000000 [19:23:48<3008:04:47, 10.91s/it, lr=1e-5, step_loss=0.0762][RANK-0]: Step: [7582], local_loss=0.025813104584813118, train_loss=0.06045717000961304, time_cost=1.338423252105713
+
Steps: 1%| | 7582/1000000 [19:23:48<3008:04:47, 10.91s/it, lr=1e-5, step_loss=0.0258]
Steps: 1%| | 7583/1000000 [19:24:05<3471:43:52, 12.59s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [7583], local_loss=0.015944143757224083, train_loss=0.05262412875890732, time_cost=4.4711456298828125
+
Steps: 1%| | 7583/1000000 [19:24:05<3471:43:52, 12.59s/it, lr=1e-5, step_loss=0.0159]
Steps: 1%| | 7584/1000000 [19:24:13<3122:28:27, 11.33s/it, lr=1e-5, step_loss=0.0159][RANK-0]: Step: [7584], local_loss=0.009591632522642612, train_loss=0.022070232778787613, time_cost=2.1675045490264893
+
Steps: 1%| | 7584/1000000 [19:24:13<3122:28:27, 11.33s/it, lr=1e-5, step_loss=0.00959]
Steps: 1%| | 7585/1000000 [19:24:18<2608:32:52, 9.46s/it, lr=1e-5, step_loss=0.00959][RANK-0]: Step: [7585], local_loss=0.1617737114429474, train_loss=0.08679131418466568, time_cost=1.329540729522705
+
Steps: 1%| | 7585/1000000 [19:24:18<2608:32:52, 9.46s/it, lr=1e-5, step_loss=0.162]
Steps: 1%| | 7586/1000000 [19:24:26<2483:28:46, 9.01s/it, lr=1e-5, step_loss=0.162][RANK-0]: Step: [7586], local_loss=0.01702660322189331, train_loss=0.04364350438117981, time_cost=1.335700273513794
+
Steps: 1%| | 7586/1000000 [19:24:26<2483:28:46, 9.01s/it, lr=1e-5, step_loss=0.017]
Steps: 1%| | 7587/1000000 [19:24:37<2650:56:02, 9.62s/it, lr=1e-5, step_loss=0.017][RANK-0]: Step: [7587], local_loss=0.038072116672992706, train_loss=0.11431984603404999, time_cost=4.56442403793335
+
Steps: 1%| | 7587/1000000 [19:24:37<2650:56:02, 9.62s/it, lr=1e-5, step_loss=0.0381]
Steps: 1%| | 7588/1000000 [19:24:51<2983:25:50, 10.82s/it, lr=1e-5, step_loss=0.0381][RANK-0]: Step: [7588], local_loss=0.05781691148877144, train_loss=0.05374164879322052, time_cost=1.3175697326660156
+
Steps: 1%| | 7588/1000000 [19:24:51<2983:25:50, 10.82s/it, lr=1e-5, step_loss=0.0578]
Steps: 1%| | 7589/1000000 [19:25:02<3033:01:29, 11.00s/it, lr=1e-5, step_loss=0.0578][RANK-0]: Step: [7589], local_loss=0.050458237528800964, train_loss=0.037732914090156555, time_cost=2.66416072845459
+
Steps: 1%| | 7589/1000000 [19:25:02<3033:01:29, 11.00s/it, lr=1e-5, step_loss=0.0505]
Steps: 1%| | 7590/1000000 [19:25:11<2872:43:54, 10.42s/it, lr=1e-5, step_loss=0.0505][RANK-0]: Step: [7590], local_loss=0.04712069034576416, train_loss=0.03310137987136841, time_cost=1.878202199935913
+
Steps: 1%| | 7590/1000000 [19:25:11<2872:43:54, 10.42s/it, lr=1e-5, step_loss=0.0471]
Steps: 1%| | 7591/1000000 [19:25:21<2781:11:11, 10.09s/it, lr=1e-5, step_loss=0.0471][RANK-0]: Step: [7591], local_loss=0.009928008541464806, train_loss=0.025156129151582718, time_cost=1.3286128044128418
+
Steps: 1%| | 7591/1000000 [19:25:21<2781:11:11, 10.09s/it, lr=1e-5, step_loss=0.00993]
Steps: 1%| | 7592/1000000 [19:25:31<2799:15:56, 10.15s/it, lr=1e-5, step_loss=0.00993][RANK-0]: Step: [7592], local_loss=0.021550267934799194, train_loss=0.08377813547849655, time_cost=4.264641046524048
+
Steps: 1%| | 7592/1000000 [19:25:31<2799:15:56, 10.15s/it, lr=1e-5, step_loss=0.0216]
Steps: 1%| | 7593/1000000 [19:25:45<3157:50:02, 11.46s/it, lr=1e-5, step_loss=0.0216][RANK-0]: Step: [7593], local_loss=0.007551164831966162, train_loss=0.15983104705810547, time_cost=5.192648649215698
+
Steps: 1%| | 7593/1000000 [19:25:45<3157:50:02, 11.46s/it, lr=1e-5, step_loss=0.00755]
Steps: 1%| | 7594/1000000 [19:25:50<2630:51:26, 9.54s/it, lr=1e-5, step_loss=0.00755][RANK-0]: Step: [7594], local_loss=0.02582644857466221, train_loss=0.09018729627132416, time_cost=2.6213529109954834
+
Steps: 1%| | 7594/1000000 [19:25:50<2630:51:26, 9.54s/it, lr=1e-5, step_loss=0.0258]
Steps: 1%| | 7595/1000000 [19:25:58<2470:33:56, 8.96s/it, lr=1e-5, step_loss=0.0258][RANK-0]: Step: [7595], local_loss=0.07717417925596237, train_loss=0.06125156581401825, time_cost=1.376338005065918
+
Steps: 1%| | 7595/1000000 [19:25:58<2470:33:56, 8.96s/it, lr=1e-5, step_loss=0.0772]
Steps: 1%| | 7596/1000000 [19:26:08<2571:09:58, 9.33s/it, lr=1e-5, step_loss=0.0772][RANK-0]: Step: [7596], local_loss=0.3885117769241333, train_loss=0.07048038393259048, time_cost=4.039246082305908
+
Steps: 1%| | 7596/1000000 [19:26:08<2571:09:58, 9.33s/it, lr=1e-5, step_loss=0.389]
Steps: 1%| | 7597/1000000 [19:26:13<2185:54:53, 7.93s/it, lr=1e-5, step_loss=0.389][RANK-0]: Step: [7597], local_loss=0.01781577803194523, train_loss=0.05591244250535965, time_cost=1.3691699504852295
+
Steps: 1%| | 7597/1000000 [19:26:13<2185:54:53, 7.93s/it, lr=1e-5, step_loss=0.0178]
Steps: 1%| | 7598/1000000 [19:26:24<2427:38:47, 8.81s/it, lr=1e-5, step_loss=0.0178][RANK-0]: Step: [7598], local_loss=0.08127634227275848, train_loss=0.07179400324821472, time_cost=4.503535747528076
+
Steps: 1%| | 7598/1000000 [19:26:24<2427:38:47, 8.81s/it, lr=1e-5, step_loss=0.0813]
Steps: 1%| | 7599/1000000 [19:26:35<2641:09:02, 9.58s/it, lr=1e-5, step_loss=0.0813][RANK-0]: Step: [7599], local_loss=0.06252230703830719, train_loss=0.023741737008094788, time_cost=5.220286846160889
+
Steps: 1%| | 7599/1000000 [19:26:35<2641:09:02, 9.58s/it, lr=1e-5, step_loss=0.0625]
Steps: 1%| | 7600/1000000 [19:26:48<2937:28:56, 10.66s/it, lr=1e-5, step_loss=0.0625][RANK-0]: Step: [7600], local_loss=0.011025156825780869, train_loss=0.03500795364379883, time_cost=4.613228797912598
+
Steps: 1%| | 7600/1000000 [19:26:48<2937:28:56, 10.66s/it, lr=1e-5, step_loss=0.011]
Steps: 1%| | 7601/1000000 [19:26:52<2404:42:06, 8.72s/it, lr=1e-5, step_loss=0.011][RANK-0]: Step: [7601], local_loss=0.00969315879046917, train_loss=0.14020131528377533, time_cost=1.238219976425171
+
Steps: 1%| | 7601/1000000 [19:26:52<2404:42:06, 8.72s/it, lr=1e-5, step_loss=0.00969]
Steps: 1%| | 7602/1000000 [19:27:02<2429:49:50, 8.81s/it, lr=1e-5, step_loss=0.00969][RANK-0]: Step: [7602], local_loss=0.02820371650159359, train_loss=0.04583143815398216, time_cost=6.463079214096069
Steps: 1%| | 7603/1000000 [19:27:12<2583:03:50, 9.37s/it, lr=1e-5, step_loss=0.0282][RANK-0]: Step: [7603], local_loss=0.023801743984222412, train_loss=0.02998996712267399, time_cost=7.135660886764526
Steps: 1%| | 7604/1000000 [19:27:21<2557:47:11, 9.28s/it, lr=1e-5, step_loss=0.0238][RANK-0]: Step: [7604], local_loss=0.0179548691958189, train_loss=0.02171473205089569, time_cost=6.595480918884277
Steps: 1%| | 7605/1000000 [19:27:26<2219:40:50, 8.05s/it, lr=1e-5, step_loss=0.018][RANK-0]: Step: [7605], local_loss=0.07128649950027466, train_loss=0.08984173834323883, time_cost=2.1645708084106445
Steps: 1%| | 7606/1000000 [19:27:38<2500:15:54, 9.07s/it, lr=1e-5, step_loss=0.0713][RANK-0]: Step: [7606], local_loss=0.006821641232818365, train_loss=0.02783110924065113, time_cost=3.7667295932769775
Steps: 1%| | 7607/1000000 [19:27:47<2504:07:09, 9.08s/it, lr=1e-5, step_loss=0.00682][RANK-0]: Step: [7607], local_loss=0.009530209004878998, train_loss=0.014141902327537537, time_cost=6.2605695724487305
Steps: 1%| | 7608/1000000 [19:27:55<2455:19:36, 8.91s/it, lr=1e-5, step_loss=0.00953][RANK-0]: Step: [7608], local_loss=0.11475034058094025, train_loss=0.04751323536038399, time_cost=2.220057249069214
Steps: 1%| | 7609/1000000 [19:28:11<2962:23:56, 10.75s/it, lr=1e-5, step_loss=0.115][RANK-0]: Step: [7609], local_loss=0.08225148916244507, train_loss=0.03800499066710472, time_cost=6.143026113510132
Steps: 1%| | 7610/1000000 [19:28:18<2671:12:14, 9.69s/it, lr=1e-5, step_loss=0.0823][RANK-0]: Step: [7610], local_loss=0.0852312222123146, train_loss=0.0731322392821312, time_cost=1.7163047790527344
Steps: 1%| | 7611/1000000 [19:28:22<2199:55:13, 7.98s/it, lr=1e-5, step_loss=0.0852][RANK-0]: Step: [7611], local_loss=0.028292685747146606, train_loss=0.04564046114683151, time_cost=1.5311543941497803
Steps: 1%| | 7612/1000000 [19:28:27<2001:33:55, 7.26s/it, lr=1e-5, step_loss=0.0283][RANK-0]: Step: [7612], local_loss=0.03002539835870266, train_loss=0.05728199705481529, time_cost=1.313859462738037
Steps: 1%| | 7613/1000000 [19:28:38<2318:17:12, 8.41s/it, lr=1e-5, step_loss=0.03][RANK-0]: Step: [7613], local_loss=0.04972950741648674, train_loss=0.0391848050057888, time_cost=1.2766647338867188
Steps: 1%| | 7614/1000000 [19:28:52<2713:58:08, 9.85s/it, lr=1e-5, step_loss=0.0497][RANK-0]: Step: [7614], local_loss=0.03461061418056488, train_loss=0.06056270748376846, time_cost=4.2515764236450195
Steps: 1%| | 7615/1000000 [19:29:04<2889:16:18, 10.48s/it, lr=1e-5, step_loss=0.0346][RANK-0]: Step: [7615], local_loss=0.019446982070803642, train_loss=0.012385552749037743, time_cost=1.2295241355895996
Steps: 1%| | 7616/1000000 [19:29:17<3139:13:21, 11.39s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [7616], local_loss=0.017229760065674782, train_loss=0.056136228144168854, time_cost=3.005223274230957
Steps: 1%| | 7617/1000000 [19:29:22<2606:27:34, 9.46s/it, lr=1e-5, step_loss=0.0172][RANK-0]: Step: [7617], local_loss=0.01457469817250967, train_loss=0.021209361031651497, time_cost=1.2504351139068604
Steps: 1%| | 7618/1000000 [19:29:27<2231:58:50, 8.10s/it, lr=1e-5, step_loss=0.0146][RANK-0]: Step: [7618], local_loss=0.009634742513298988, train_loss=0.015997782349586487, time_cost=2.157228469848633
Steps: 1%| | 7619/1000000 [19:29:42<2790:49:50, 10.12s/it, lr=1e-5, step_loss=0.00963][RANK-0]: Step: [7619], local_loss=0.0958881750702858, train_loss=0.08170200884342194, time_cost=6.524754047393799
Steps: 1%| | 7620/1000000 [19:29:51<2745:38:52, 9.96s/it, lr=1e-5, step_loss=0.0959][RANK-0]: Step: [7620], local_loss=0.01410928275436163, train_loss=0.14183276891708374, time_cost=2.353239059448242
Steps: 1%| | 7621/1000000 [19:29:56<2266:32:38, 8.22s/it, lr=1e-5, step_loss=0.0141][RANK-0]: Step: [7621], local_loss=0.018865400925278664, train_loss=0.1718509942293167, time_cost=1.233062982559204
Steps: 1%| | 7622/1000000 [19:30:01<2007:03:00, 7.28s/it, lr=1e-5, step_loss=0.0189][RANK-0]: Step: [7622], local_loss=0.016412492841482162, train_loss=0.040473245084285736, time_cost=2.1058297157287598
Steps: 1%| | 7623/1000000 [19:30:15<2553:44:29, 9.26s/it, lr=1e-5, step_loss=0.0164][RANK-0]: Step: [7623], local_loss=0.01266525313258171, train_loss=0.021832160651683807, time_cost=4.174635171890259
Steps: 1%| | 7624/1000000 [19:30:19<2185:15:05, 7.93s/it, lr=1e-5, step_loss=0.0127][RANK-0]: Step: [7624], local_loss=0.12948130071163177, train_loss=0.178220734000206, time_cost=3.8718600273132324
Steps: 1%| | 7625/1000000 [19:30:26<2047:04:40, 7.43s/it, lr=1e-5, step_loss=0.129][RANK-0]: Step: [7625], local_loss=0.06170213967561722, train_loss=0.04421214759349823, time_cost=1.6858875751495361
Steps: 1%| | 7626/1000000 [19:30:39<2501:06:50, 9.07s/it, lr=1e-5, step_loss=0.0617][RANK-0]: Step: [7626], local_loss=0.008910039439797401, train_loss=0.08948065340518951, time_cost=9.380874633789062
Steps: 1%| | 7627/1000000 [19:30:48<2497:23:32, 9.06s/it, lr=1e-5, step_loss=0.00891][RANK-0]: Step: [7627], local_loss=0.19400592148303986, train_loss=0.04015662521123886, time_cost=1.5274124145507812
Steps: 1%| | 7628/1000000 [19:30:58<2616:29:26, 9.49s/it, lr=1e-5, step_loss=0.194][RANK-0]: Step: [7628], local_loss=0.02048243209719658, train_loss=0.02209833636879921, time_cost=3.9719061851501465
Steps: 1%| | 7629/1000000 [19:31:03<2264:10:18, 8.21s/it, lr=1e-5, step_loss=0.0205][RANK-0]: Step: [7629], local_loss=0.01065527182072401, train_loss=0.07030251622200012, time_cost=2.0922346115112305
Steps: 1%| | 7630/1000000 [19:31:12<2322:20:45, 8.42s/it, lr=1e-5, step_loss=0.0107][RANK-0]: Step: [7630], local_loss=0.3380792438983917, train_loss=0.10850206762552261, time_cost=1.2379205226898193
Steps: 1%| | 7631/1000000 [19:31:26<2767:40:02, 10.04s/it, lr=1e-5, step_loss=0.338][RANK-0]: Step: [7631], local_loss=0.02165507711470127, train_loss=0.023219052702188492, time_cost=4.596836566925049
Steps: 1%| | 7632/1000000 [19:31:32<2424:37:17, 8.80s/it, lr=1e-5, step_loss=0.0217][RANK-0]: Step: [7632], local_loss=0.012063252739608288, train_loss=0.06567833572626114, time_cost=1.8366987705230713
Steps: 1%| | 7633/1000000 [19:31:37<2118:48:35, 7.69s/it, lr=1e-5, step_loss=0.0121][RANK-0]: Step: [7633], local_loss=0.08933735638856888, train_loss=0.046278681606054306, time_cost=2.3930792808532715
Steps: 1%| | 7634/1000000 [19:31:50<2527:20:02, 9.17s/it, lr=1e-5, step_loss=0.0893][RANK-0]: Step: [7634], local_loss=0.01740533486008644, train_loss=0.035471949726343155, time_cost=3.798001527786255
Steps: 1%| | 7635/1000000 [19:32:02<2803:07:19, 10.17s/it, lr=1e-5, step_loss=0.0174][RANK-0]: Step: [7635], local_loss=0.026033535599708557, train_loss=0.05002065375447273, time_cost=1.2241852283477783
Steps: 1%| | 7636/1000000 [19:32:14<2979:56:37, 10.81s/it, lr=1e-5, step_loss=0.026][RANK-0]: Step: [7636], local_loss=0.0065386914648115635, train_loss=33.83206558227539, time_cost=5.649762153625488
Steps: 1%| | 7637/1000000 [19:32:22<2712:23:48, 9.84s/it, lr=1e-5, step_loss=0.00654][RANK-0]: Step: [7637], local_loss=0.055480390787124634, train_loss=0.05507100000977516, time_cost=2.7393758296966553
Steps: 1%| | 7638/1000000 [19:32:27<2333:00:38, 8.46s/it, lr=1e-5, step_loss=0.0555][RANK-0]: Step: [7638], local_loss=0.07766303420066833, train_loss=0.13360942900180817, time_cost=2.2746176719665527
Steps: 1%| | 7639/1000000 [19:32:32<2064:07:49, 7.49s/it, lr=1e-5, step_loss=0.0777][RANK-0]: Step: [7639], local_loss=0.010427827015519142, train_loss=0.08153139799833298, time_cost=2.5290584564208984
Steps: 1%| | 7640/1000000 [19:32:45<2484:41:47, 9.01s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [7640], local_loss=0.09105845540761948, train_loss=0.06823564320802689, time_cost=5.564332485198975
Steps: 1%| | 7641/1000000 [19:32:49<2099:52:00, 7.62s/it, lr=1e-5, step_loss=0.0911][RANK-0]: Step: [7641], local_loss=0.05346880853176117, train_loss=0.05119764059782028, time_cost=1.2040135860443115
Steps: 1%| | 7642/1000000 [19:33:01<2410:08:16, 8.74s/it, lr=1e-5, step_loss=0.0535][RANK-0]: Step: [7642], local_loss=0.01385206077247858, train_loss=0.03290275111794472, time_cost=6.64572286605835
Steps: 1%| | 7643/1000000 [19:33:14<2794:56:53, 10.14s/it, lr=1e-5, step_loss=0.0139][RANK-0]: Step: [7643], local_loss=0.03800826519727707, train_loss=0.055613238364458084, time_cost=1.2197515964508057
Steps: 1%| | 7644/1000000 [19:33:26<2899:14:50, 10.52s/it, lr=1e-5, step_loss=0.038][RANK-0]: Step: [7644], local_loss=0.044694144278764725, train_loss=0.05266892537474632, time_cost=1.2387549877166748
Steps: 1%| | 7645/1000000 [19:33:38<3083:41:01, 11.19s/it, lr=1e-5, step_loss=0.0447][RANK-0]: Step: [7645], local_loss=0.022615766152739525, train_loss=0.04551137238740921, time_cost=4.083263635635376
Steps: 1%| | 7646/1000000 [19:33:44<2647:21:07, 9.60s/it, lr=1e-5, step_loss=0.0226][RANK-0]: Step: [7646], local_loss=0.019386619329452515, train_loss=0.0545460544526577, time_cost=1.223893642425537
Steps: 1%| | 7647/1000000 [19:33:48<2194:02:41, 7.96s/it, lr=1e-5, step_loss=0.0194][RANK-0]: Step: [7647], local_loss=0.013121701776981354, train_loss=0.09137686342000961, time_cost=1.356562614440918
Steps: 1%| | 7648/1000000 [19:34:03<2734:44:46, 9.92s/it, lr=1e-5, step_loss=0.0131][RANK-0]: Step: [7648], local_loss=0.01810149848461151, train_loss=0.04165012389421463, time_cost=1.2161955833435059
Steps: 1%| | 7649/1000000 [19:34:15<2890:40:46, 10.49s/it, lr=1e-5, step_loss=0.0181][RANK-0]: Step: [7649], local_loss=0.007888815365731716, train_loss=0.08647096157073975, time_cost=5.388034820556641
Steps: 1%| | 7650/1000000 [19:34:26<2939:36:14, 10.66s/it, lr=1e-5, step_loss=0.00789][RANK-0]: Step: [7650], local_loss=0.010441966354846954, train_loss=0.047315046191215515, time_cost=2.4373459815979004
Steps: 1%| | 7651/1000000 [19:34:32<2555:48:54, 9.27s/it, lr=1e-5, step_loss=0.0104][RANK-0]: Step: [7651], local_loss=0.016457252204418182, train_loss=0.027116673067212105, time_cost=2.0587947368621826
Steps: 1%| | 7652/1000000 [19:34:40<2476:03:21, 8.98s/it, lr=1e-5, step_loss=0.0165][RANK-0]: Step: [7652], local_loss=0.038492172956466675, train_loss=0.04710618034005165, time_cost=4.532207489013672
Steps: 1%| | 7653/1000000 [19:34:45<2167:13:33, 7.86s/it, lr=1e-5, step_loss=0.0385][RANK-0]: Step: [7653], local_loss=0.12059085071086884, train_loss=0.0908065065741539, time_cost=2.650245428085327
Steps: 1%| | 7654/1000000 [19:35:03<2978:12:18, 10.80s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [7654], local_loss=0.014515366405248642, train_loss=0.032185688614845276, time_cost=9.552348613739014
Steps: 1%| | 7655/1000000 [19:35:07<2444:58:30, 8.87s/it, lr=1e-5, step_loss=0.0145][RANK-0]: Step: [7655], local_loss=0.03827952593564987, train_loss=0.202991783618927, time_cost=1.2638323307037354
Steps: 1%| | 7656/1000000 [19:35:18<2629:18:29, 9.54s/it, lr=1e-5, step_loss=0.0383][RANK-0]: Step: [7656], local_loss=0.10226204246282578, train_loss=0.1238178089261055, time_cost=3.1220955848693848
Steps: 1%| | 7657/1000000 [19:35:24<2330:39:09, 8.46s/it, lr=1e-5, step_loss=0.102][RANK-0]: Step: [7657], local_loss=1.0279351472854614, train_loss=0.1590481549501419, time_cost=1.2984514236450195
Steps: 1%| | 7658/1000000 [19:35:29<2032:59:08, 7.38s/it, lr=1e-5, step_loss=1.03][RANK-0]: Step: [7658], local_loss=0.008105907589197159, train_loss=0.026878539472818375, time_cost=2.900059700012207
Steps: 1%| | 7659/1000000 [19:35:36<2007:33:14, 7.28s/it, lr=1e-5, step_loss=0.00811][RANK-0]: Step: [7659], local_loss=0.10394398868083954, train_loss=0.04806692898273468, time_cost=2.4296884536743164
Steps: 1%| | 7660/1000000 [19:35:47<2275:48:06, 8.26s/it, lr=1e-5, step_loss=0.104][RANK-0]: Step: [7660], local_loss=0.02896815724670887, train_loss=0.049216918647289276, time_cost=2.688812732696533
Steps: 1%| | 7661/1000000 [19:35:59<2580:28:52, 9.36s/it, lr=1e-5, step_loss=0.029][RANK-0]: Step: [7661], local_loss=0.023447373881936073, train_loss=0.016246600076556206, time_cost=1.2214722633361816
Steps: 1%| | 7662/1000000 [19:36:07<2491:22:40, 9.04s/it, lr=1e-5, step_loss=0.0234][RANK-0]: Step: [7662], local_loss=0.031686414033174515, train_loss=0.04684834182262421, time_cost=3.126391649246216
Steps: 1%| | 7663/1000000 [19:36:24<3109:37:32, 11.28s/it, lr=1e-5, step_loss=0.0317][RANK-0]: Step: [7663], local_loss=0.016914861276745796, train_loss=0.024773841723799706, time_cost=6.625073671340942
Steps: 1%| | 7664/1000000 [19:36:38<3394:56:35, 12.32s/it, lr=1e-5, step_loss=0.0169][RANK-0]: Step: [7664], local_loss=1.0180037021636963, train_loss=0.18438997864723206, time_cost=7.111282825469971
Steps: 1%| | 7665/1000000 [19:36:49<3295:04:21, 11.95s/it, lr=1e-5, step_loss=1.02][RANK-0]: Step: [7665], local_loss=0.051155634224414825, train_loss=0.17585976421833038, time_cost=1.3228886127471924
Steps: 1%| | 7666/1000000 [19:36:56<2859:22:23, 10.37s/it, lr=1e-5, step_loss=0.0512][RANK-0]: Step: [7666], local_loss=0.0564577579498291, train_loss=0.028704257681965828, time_cost=4.888059616088867
Steps: 1%| | 7667/1000000 [19:37:02<2452:21:13, 8.90s/it, lr=1e-5, step_loss=0.0565][RANK-0]: Step: [7667], local_loss=0.011367298662662506, train_loss=0.0762089192867279, time_cost=2.4128754138946533
Steps: 1%| | 7668/1000000 [19:37:13<2686:42:35, 9.75s/it, lr=1e-5, step_loss=0.0114][RANK-0]: Step: [7668], local_loss=0.08303479105234146, train_loss=0.09328466653823853, time_cost=5.555953741073608
Steps: 1%| | 7669/1000000 [19:37:21<2533:27:52, 9.19s/it, lr=1e-5, step_loss=0.083][RANK-0]: Step: [7669], local_loss=0.12070940434932709, train_loss=0.040586888790130615, time_cost=3.2680022716522217
Steps: 1%| | 7670/1000000 [19:37:26<2159:56:58, 7.84s/it, lr=1e-5, step_loss=0.121][RANK-0]: Step: [7670], local_loss=0.07742464542388916, train_loss=0.16906550526618958, time_cost=1.4065804481506348
Steps: 1%| | 7671/1000000 [19:37:36<2355:20:42, 8.54s/it, lr=1e-5, step_loss=0.0774][RANK-0]: Step: [7671], local_loss=0.0794011801481247, train_loss=0.04922131821513176, time_cost=1.3116450309753418
Steps: 1%| | 7672/1000000 [19:37:43<2210:47:11, 8.02s/it, lr=1e-5, step_loss=0.0794][RANK-0]: Step: [7672], local_loss=0.012979595921933651, train_loss=0.013461150228977203, time_cost=2.2229905128479004
scripts/text_condition/npu/train_inpaint_sparse1d_newmodel_motion.sh: line 81: 212 Killed accelerate launch --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml --machine_rank=${MACHINE_RANK} --main_process_ip=${MAIN_PROCESS_IP_VALUE} opensora/train/train_inpaint.py --model OpenSoraInpaint-L/122 --text_encoder_name google/mt5-xxl --cache_dir "../../cache_dir/" --dataset inpaint --data "scripts/train_data/video_data_debug.txt" --ae WFVAEModel_D8_4x8x8 --ae_path "/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL" --sample_rate 1 --num_frames 93 --max_height 320 --max_width 320 --interpolation_scale_t 1.0 --interpolation_scale_h 1.0 --interpolation_scale_w 1.0 --attention_mode xformers --gradient_checkpointing --train_batch_size=1 --dataloader_num_workers 0 --gradient_accumulation_steps=1 --max_train_steps=1000000 --learning_rate=1e-5 --lr_scheduler="constant" --lr_warmup_steps=0 --mixed_precision="bf16" --report_to="wandb" --checkpointing_steps=1000 --allow_tf32 --model_max_length 512 --use_image_num 0 --use_ema --ema_start_step 0 --cfg 0.1 --noise_offset 0.0 --use_rope --skip_low_resolution --speed_factor 1.0 --ema_decay 0.9999 --drop_short_ratio 0.0 --hw_stride 32 --sparse1d --sparse_n 4 --use_motion --train_fps 16 --seed 1234 --trained_data_global_step 0 --group_data --use_decord --prediction_type "v_prediction" --rescale_betas_zero_snr --t2v_ratio 0.0 --i2v_ratio 0.0 --transition_ratio 0.0 --v2v_ratio 0.0 --Semantic_ratio 0.2 --bbox_ratio 0.2 --background_ratio 0.2 --fixed_ratio 0.1 --Semantic_expansion_ratio 0.1 --fixed_bg_ratio 0.1 --clear_video_ratio 0.0 --min_clear_ratio 0.25 --default_text_ratio 0.0 --output_dir /home/save_dir/runs/$PROJECT --pretrained_transformer_model_path "/home/image_data/captions/vpre_latest_134k/model_ema" --yolomodel_pathorname "/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt"
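The "212 Killed" above is a SIGKILL delivered to the launcher process; with no Python traceback in the log, this is most often the host OOM killer or the cluster scheduler rather than an in-process error. The per-step metrics survive in the rank-0 lines, so a run like this can still be audited offline. A minimal parsing sketch, not part of the repo; the file name and the spike threshold are illustrative:

```python
import re

# Matches rank-0 step lines of the form seen above, e.g.
# "[RANK-0]: Step: [7636], local_loss=0.0065..., train_loss=33.83..., time_cost=5.64..."
STEP_RE = re.compile(
    r"\[RANK-0\]: Step: \[(\d+)\], local_loss=([\d.eE+-]+), "
    r"train_loss=([\d.eE+-]+), time_cost=([\d.eE+-]+)"
)

def parse_steps(path):
    """Yield (step, local_loss, train_loss, time_cost) from a captured log."""
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = STEP_RE.search(line)
            if m:
                step, local_loss, train_loss, time_cost = m.groups()
                yield int(step), float(local_loss), float(train_loss), float(time_cost)

if __name__ == "__main__":
    for step, local_loss, train_loss, _ in parse_steps("training_log.txt"):
        if train_loss > 10.0:  # arbitrary illustrative threshold
            print(f"loss spike at step {step}: train_loss={train_loss}")
```

Run against this log, it would flag step 7636, where train_loss jumps to 33.83 while local_loss stays at 0.0065.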
diff --git a/log_inpaint_93x320x320_stage1.txt b/log_inpaint_93x320x320_stage1.txt
new file mode 100644
index 000000000..0227c1af4
--- /dev/null
+++ b/log_inpaint_93x320x320_stage1.txt
@@ -0,0 +1,2416 @@
+[2024-09-09 12:27:09,377] torch.distributed.run: [WARNING]
+[2024-09-09 12:27:09,377] torch.distributed.run: [WARNING] *****************************************
+[2024-09-09 12:27:09,377] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+[2024-09-09 12:27:09,377] torch.distributed.run: [WARNING] *****************************************
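The launcher's warning above is accurate: torch.distributed.run exports OMP_NUM_THREADS=1 into every worker unless told otherwise, which leaves CPU-side work single-threaded on a 192-core host running 8 ranks (see the affinity lines later in this log). A hedged sketch of one override, assuming it runs before torch is imported; the value 24 mirrors the 192-core / 8-rank split but is an illustration, not a tuned setting:

```python
import os

# Override the launcher's OMP_NUM_THREADS=1 default before torch reads it.
os.environ["OMP_NUM_THREADS"] = "24"

import torch

# Keep torch's intra-op thread pool consistent with the env var.
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
```

Exporting the variable in the launch shell script before `accelerate launch` achieves the same effect without touching the training code.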
+[2024-09-09 12:27:16,508] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-09 12:27:16,604] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-09 12:27:16,769] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-09 12:27:18,155] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:209: ImportWarning:
+ *************************************************************************************************************
+ The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
+ The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
+ The backend in torch.distributed.init_process_group set to hccl now..
+ The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
+ The device parameters have been replaced with npu in the function below:
+ torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty
+ *************************************************************************************************************
+
+ warnings.warn(msg, ImportWarning)
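The ImportWarning block above is torch_npu's transfer_to_npu shim announcing what it monkey-patched: CUDA entry points, device arguments of factory functions, and the default distributed backend (hccl). A minimal sketch of the documented Ascend idiom this log relies on, not code from this repo:

```python
import torch
import torch_npu  # noqa: F401  registers the "npu" device type
from torch_npu.contrib import transfer_to_npu  # noqa: F401  patches torch.cuda.* to torch.npu.*

# Unmodified CUDA-style code now runs on the NPU after the patch.
x = torch.randn(2, 2).cuda()
print(x.device)  # e.g. "npu:0"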
+[2024-09-09 12:27:18,304] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-09 12:27:18,529] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-09 12:27:18,681] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-09 12:27:19,071] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 405's current affinity list: 0-191
+pid 405's new affinity list: 96-119
+pid 406's current affinity list: 0-191
+pid 406's new affinity list: 120-143
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 407's current affinity list: 0-191
+pid 407's new affinity list: 144-167
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+The npu_config.on_npu is True
+pid 401's current affinity list: 0-191
+pid 401's new affinity list: 0-23
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _has_inf_or_nan
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 408's current affinity list: 0-191
+pid 408's new affinity list: 168-191
+pid 403's current affinity list: 0-191
+pid 403's new affinity list: 48-71
+pid 402's current affinity list: 0-191
+pid 402's new affinity list: 24-47
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+pid 404's current affinity list: 0-191
+pid 404's new affinity list: 72-95
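The pid affinity pairs scattered through this log show each of the 8 local ranks being pinned to its own 24-core slice of the 192-core host (0-23, 24-47, ..., 168-191), which keeps dataloader and other host-side work from migrating across cores. A sketch of the kind of pinning involved, assuming Linux and a LOCAL_RANK variable from the launcher; the helper is illustrative, not the repo's implementation:

```python
import os

def bind_rank_to_cores(local_rank: int, ranks_per_node: int = 8, total_cores: int = 192) -> None:
    """Pin the calling process to its own contiguous core slice."""
    per_rank = total_cores // ranks_per_node  # 24 cores per rank in this log
    start = local_rank * per_rank
    os.sched_setaffinity(0, range(start, start + per_rank))

if __name__ == "__main__":
    bind_rank_to_cores(int(os.environ.get("LOCAL_RANK", "0")))
    print(sorted(os.sched_getaffinity(0)))  # matches the "new affinity list" lines above
```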
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-5]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+[RANK-4]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:27,947] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:27,948] [INFO] [comm.py:637:init_distributed] cdb=None
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:27,951] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:27,952] [INFO] [comm.py:637:init_distributed] cdb=None
+09/09/2024 12:27:27 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 5
+Local process index: 5
+Device: npu:5
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
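The ds_config dict each rank prints above is a standard DeepSpeed ZeRO stage-2 bf16 setup: optimizer state and gradients are sharded across the 8 ranks, communication is kept in fp32, and the "auto" batch fields are resolved by accelerate from the CLI flags. For reference, the same configuration written out as a dict, copied from the log; this literal is a sketch of the file behind multi_node_example_by_deepspeed.yaml, not the repo's file itself:

```python
ds_config = {
    "bf16": {"enabled": True},
    "communication_data_type": "fp32",     # gradients reduced in fp32 despite bf16 compute
    "gradient_clipping": 1.0,
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,                        # shard optimizer state and gradients
        "overlap_comm": True,
        "contiguous_gradients": True,
        "allgather_bucket_size": 512 * 1024 * 1024,  # 536870912, as printed above
        "reduce_bucket_size": 512 * 1024 * 1024,
    },
}
```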
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+09/09/2024 12:27:27 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 4
+Local process index: 4
+Device: npu:4
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+[RANK-6]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:28,653] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:28,653] [INFO] [comm.py:637:init_distributed] cdb=None
+09/09/2024 12:27:28 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 6
+Local process index: 6
+Device: npu:6
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-0]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:29,451] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:29,451] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-09-09 12:27:29,451] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend hccl
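As the warnings above note, DeepSpeed has no native HCCL backend on Ascend, so it falls back to TorchBackend, i.e. plain torch.distributed with backend "hccl". The equivalent manual initialization, sketched under the assumption that the launcher provides the usual env:// rendezvous variables:

```python
import os

import torch
import torch_npu  # noqa: F401  provides torch.npu and the hccl backend
import torch.distributed as dist

# What "Initializing TorchBackend in DeepSpeed with backend hccl" amounts to.
dist.init_process_group(backend="hccl")

# One NPU per local rank: npu:0 ... npu:7 on this 8-rank node.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
torch.npu.set_device(local_rank)
```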
+Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+09/09/2024 12:27:29 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 0
+Local process index: 0
+Device: npu:0
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
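The two bracketed lists printed above (once per rank loading the VAE) have the shape of a non-strict load_state_dict report: the first list names keys absent from the checkpoint, all of them fixed wavelet-transform filters of the WFVAE, and the empty second list means nothing unexpected was found. A sketch of a loader that produces such a report; the "ema_state_dict" key is hypothetical, inferred only from the "Load from ema model!" lines above:

```python
import torch

def load_vae_nonstrict(model: torch.nn.Module, ckpt_path: str):
    state = torch.load(ckpt_path, map_location="cpu")
    # Hypothetical key: prefer EMA weights when the checkpoint carries them.
    state = state.get("ema_state_dict", state)
    # strict=False tolerates the fixed wavelet filter buffers missing above.
    missing, unexpected = model.load_state_dict(state, strict=False)
    print(missing, unexpected)  # yields exactly the two bracketed lists
    return missing, unexpected
```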
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+[RANK-7]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:29,834] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:29,835] [INFO] [comm.py:637:init_distributed] cdb=None
+09/09/2024 12:27:29 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 7
+Local process index: 7
+Device: npu:7
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-2]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:30,043] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:30,043] [INFO] [comm.py:637:init_distributed] cdb=None
+09/09/2024 12:27:30 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 2
+Local process index: 2
+Device: npu:2
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+[RANK-1]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:30,093] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:30,093] [INFO] [comm.py:637:init_distributed] cdb=None
+09/09/2024 12:27:30 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 1
+Local process index: 1
+Device: npu:1
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-3]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=0, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema', yolomodel_pathorname='/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt')
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
+[2024-09-09 12:27:30,699] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-09 12:27:30,699] [INFO] [comm.py:637:init_distributed] cdb=None
+09/09/2024 12:27:30 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 3
+Local process index: 3
+Device: npu:3
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+09/09/2024 12:29:27 - INFO - __main__ - optimizer: AdamW (
+Parameter Group 0
+ amsgrad: False
+ betas: (0.9, 0.999)
+ capturable: False
+ differentiable: False
+ eps: 1e-08
+ foreach: False
+ fused: None
+ lr: 1e-05
+ maximize: False
+ weight_decay: 0.01
+)
+You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+09/09/2024 12:29:35 - INFO - opensora.dataset.t2v_datasets - Building /home/image_data/captions/TV01_clips_final_478625_llavanext_217405_aes478625.json...
100%|██████████| 478625/478625 [00:14<00:00, 32257.96it/s]
100%|██████████| 1/1 [00:21<00:00, 21.19s/it]
+time 21.85038995742798
+n_elements: 474899
+data length: 474899
100%|██████████| 478625/478625 [00:14<00:00, 32204.01it/s]
100%|██████████| 1/1 [00:21<00:00, 21.25s/it]
+time 21.90780997276306
+n_elements: 474899
+data length: 474899
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
0%| | 0/1 [00:00, ?it/s]/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+100%|██████████| 478625/478625 [00:16<00:00, 28743.92it/s]
+100%|██████████| 1/1 [00:23<00:00, 23.27s/it]
+09/09/2024 12:29:59 - INFO - opensora.dataset.t2v_datasets - no_cap: 0, too_long: 3711, too_short: 2, no_resolution: 0, resolution_mismatch: 0, Counter(sample_size): Counter({'93x160x320': 84930, '29x160x320': 73201, '45x160x320': 68295, '61x160x320': 44578, '77x160x320': 38630, '93x128x320': 17805, '29x128x320': 16948, '93x224x320': 16403, '93x192x320': 15259, '45x128x320': 14788, '61x128x320': 9795, '29x224x320': 8615, '29x192x320': 8528, '45x224x320': 8477, '45x192x320': 8309, '77x128x320': 7730, '61x224x320': 6211, '61x192x320': 5983, '77x224x320': 5788, '77x192x320': 5268, '93x256x320': 3164, '45x256x320': 1510, '29x256x320': 1480, '61x256x320': 1152, '77x256x320': 1090, '93x96x320': 282, '45x96x320': 200, '29x96x320': 169, '61x96x320': 163, '77x96x320': 148}), cnt_movie: 0, cnt_img: 0, before filter: 478625, after filter: 474899
+09/09/2024 12:29:59 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | motion_score: 474899, cnt_no_motion: 13 | 192077 > 0.95, 0.7 > 65730 Mean: 0.8593367888417824, Var: 0.03075349223473551, Std: 0.17536673639757203, Min: -0.0717548280954361, Max: 1.0
+09/09/2024 12:29:59 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | aesthetic_score: 478625, cnt_no_aesthetic: 0 | 14374 > 5.75, 4.5 > 113830 Mean: 4.846693657797633, Var: 0.24147353645946146, Std: 0.4913995690468821, Min: 2.685077953338623, Max: 6.742257436116536
+time 23.925863027572632
+n_elements: 474899
+data length: 474899
+09/09/2024 12:30:00 - INFO - __main__ - after train_dataloader
+09/09/2024 12:30:00 - INFO - __main__ - before accelerator.prepare
+[2024-09-09 12:30:00,669] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.6, git-hash=unknown, git-branch=unknown
+100%|██████████| 478625/478625 [00:14<00:00, 32373.29it/s]
+100%|██████████| 1/1 [00:21<00:00, 21.43s/it]
+time 22.08217716217041
+n_elements: 474899
+data length: 474899
+100%|██████████| 478625/478625 [00:14<00:00, 32509.51it/s]
+100%|██████████| 1/1 [00:21<00:00, 21.30s/it]
+time 21.969990730285645
+n_elements: 474899
+data length: 474899
+100%|██████████| 478625/478625 [00:14<00:00, 32664.25it/s]
+100%|██████████| 1/1 [00:21<00:00, 21.22s/it]
+time 21.885778665542603
+n_elements: 474899
+data length: 474899
+100%|██████████| 478625/478625 [00:15<00:00, 30975.81it/s]
+100%|██████████| 1/1 [00:22<00:00, 22.07s/it]
+time 22.72363829612732
+n_elements: 474899
+data length: 474899
+100%|██████████| 478625/478625 [00:14<00:00, 32162.17it/s]
+100%|██████████| 1/1 [00:21<00:00, 21.45s/it]
+time 22.10815191268921
+n_elements: 474899
+data length: 474899
+[2024-09-09 12:30:33,533] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2024-09-09 12:30:33,538] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2024-09-09 12:30:33,538] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+zp rank is 6, zp_size=8
+zp rank is 3, zp_size=8
+zp rank is 7, zp_size=8
+zp rank is 2, zp_size=8
+zp rank is 1, zp_size=8
+zp rank is 4, zp_size=8
+zp rank is 5, zp_size=8
+[2024-09-09 12:30:33,689] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
+[2024-09-09 12:30:33,690] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class '….NewCls'>
+[2024-09-09 12:30:33,690] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
+[2024-09-09 12:30:33,690] [INFO] [stage_1_and_2.py:173:__init__] Reduce bucket size 536870912
+[2024-09-09 12:30:33,690] [INFO] [stage_1_and_2.py:174:__init__] Allgather bucket size 536870912
+[2024-09-09 12:30:33,690] [INFO] [stage_1_and_2.py:175:__init__] CPU Offload: False
+[2024-09-09 12:30:33,690] [INFO] [stage_1_and_2.py:176:__init__] Round robin gradient partitioning: False
+zp rank is 0, zp_size=8
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [164918]
+[2024-09-09 12:30:39,057] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
+[2024-09-09 12:30:39,058] [INFO] [utils.py:792:see_memory_usage] MA 17.78 GB Max_MA 18.44 GB CA 18.78 GB Max_CA 19 GB
+[2024-09-09 12:30:39,059] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 202.78 GB, percent = 13.4%
+[] -> [195210]
+[] -> [184079]
+[] -> [164918]
+[] -> [195210]
+[] -> [184079]
+[] -> [164918]
+[] -> [164918]
+[] -> [164918]
+[] -> [164918]
+[] -> [184079]
+[] -> [195210]
+[] -> [184079]
+[] -> [164918]
+[] -> [184079]
+[] -> [195210]
+[] -> [164918]
+[] -> [195210]
+[] -> [184079]
+[] -> [184079]
+[] -> [164918]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[] -> [184079]
+[2024-09-09 12:30:40,996] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
+[2024-09-09 12:30:40,997] [INFO] [utils.py:792:see_memory_usage] MA 20.41 GB Max_MA 24.35 GB CA 25.36 GB Max_CA 25 GB
+[2024-09-09 12:30:40,997] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 203.0 GB, percent = 13.4%
+[2024-09-09 12:30:40,997] [INFO] [stage_1_and_2.py:552:__init__] optimizer state initialized
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 470, in fixed_mask
+ return get_mask_tensor(video_tensor,MaskType.fixed_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 476, in fixed_bg_mask
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+[2024-09-09 12:30:42,898] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
+[2024-09-09 12:30:42,899] [INFO] [utils.py:792:see_memory_usage] MA 20.41 GB Max_MA 20.41 GB CA 25.36 GB Max_CA 25 GB
+[2024-09-09 12:30:42,899] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 195.77 GB, percent = 13.0%
+[2024-09-09 12:30:42,908] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
+[2024-09-09 12:30:42,908] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
+[2024-09-09 12:30:42,909] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
+[2024-09-09 12:30:42,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[(0.9, 0.999)]
+[2024-09-09 12:30:42,912] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
+[2024-09-09 12:30:42,912] [INFO] [config.py:988:print] activation_checkpointing_config {
+ "partition_activations": false,
+ "contiguous_memory_optimization": false,
+ "cpu_checkpointing": false,
+ "number_checkpoints": null,
+ "synchronize_checkpoint_boundary": false,
+ "profile": false
+}
+[2024-09-09 12:30:42,912] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2024-09-09 12:30:42,912] [INFO] [config.py:988:print] amp_enabled .................. False
+[2024-09-09 12:30:42,912] [INFO] [config.py:988:print] amp_params ................... False
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] autotuning_config ............ {
+ "enabled": false,
+ "start_step": null,
+ "end_step": null,
+ "metric_path": null,
+ "arg_mappings": null,
+ "metric": "throughput",
+ "model_info": null,
+ "results_dir": "autotuning_results",
+ "exps_dir": "autotuning_exps",
+ "overwrite": true,
+ "fast": true,
+ "start_profile_step": 3,
+ "end_profile_step": 5,
+ "tuner_type": "gridsearch",
+ "tuner_early_stopping": 5,
+ "tuner_num_trials": 50,
+ "model_info_path": null,
+ "mp_size": 1,
+ "max_train_batch_size": null,
+ "min_train_batch_size": 1,
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+ "min_train_micro_batch_size_per_gpu": 1,
+ "num_tuning_micro_batch_sizes": 3
+}
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] bfloat16_enabled ............. True
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] comms_config .................
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] communication_data_type ...... torch.float32
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] curriculum_params_legacy ..... False
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] data_efficiency_enabled ...... False
+[2024-09-09 12:30:42,913] [INFO] [config.py:988:print] dataloader_drop_last ......... False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] disable_allgather ............ False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] dump_state ................... False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_enabled ........... False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] eigenvalue_verbose ........... False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] elasticity_enabled ........... False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] flops_profiler_config ........ {
+ "enabled": false,
+ "recompute_fwd_factor": 0.0,
+ "profile_step": 1,
+ "module_depth": -1,
+ "top_modules": 1,
+ "detailed": true,
+ "output_file": null
+}
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] fp16_auto_cast ............... None
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] fp16_enabled ................. False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] global_rank .................. 0
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] grad_accum_dtype ............. None
+[2024-09-09 12:30:42,914] [INFO] [config.py:988:print] gradient_accumulation_steps .. 1
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] gradient_clipping ............ 1.0
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] graph_harvesting ............. False
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] load_universal_checkpoint .... False
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] loss_scale ................... 1.0
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] memory_breakdown ............. False
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] mics_hierarchial_params_gather False
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] mics_shard_size .............. -1
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] nebula_config ................ {
+ "enabled": false,
+ "persistent_storage_path": null,
+ "persistent_time_interval": 100,
+ "num_of_version_in_retention": 2,
+ "enable_nebula_load": true,
+ "load_path": null
+}
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] optimizer_name ............... None
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] optimizer_params ............. None
+[2024-09-09 12:30:42,915] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] pld_enabled .................. False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] pld_params ................... False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] prescale_gradients ........... False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] scheduler_name ............... None
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] scheduler_params ............. None
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] sparse_attention ............. None
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] steps_per_print .............. inf
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] train_batch_size ............. 8
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 1
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] use_node_local_storage ....... False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] wall_clock_breakdown ......... False
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] weight_quantization_config ... None
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] world_size ................... 8
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] zero_allow_untested_optimizer True
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=536870912 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=536870912 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2024-09-09 12:30:42,916] [INFO] [config.py:988:print] zero_enabled ................. True
+[2024-09-09 12:30:42,917] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True
+[2024-09-09 12:30:42,917] [INFO] [config.py:988:print] zero_optimization_stage ...... 2
+[2024-09-09 12:30:42,917] [INFO] [config.py:974:print_user_config] json = {
+ "fp16": {
+ "enabled": false,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "communication_data_type": "fp32",
+ "gradient_clipping": 1.0,
+ "train_micro_batch_size_per_gpu": 1,
+ "train_batch_size": 8,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {
+ "stage": 2,
+ "overlap_comm": true,
+ "allgather_bucket_size": 5.368709e+08,
+ "contiguous_gradients": true,
+ "reduce_bucket_size": 5.368709e+08
+ },
+ "steps_per_print": inf,
+ "zero_allow_untested_optimizer": true
+}
+09/09/2024 12:30:42 - INFO - __main__ - after accelerator.prepare
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 476, in fixed_bg_mask
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 467, in background_mask
+ return get_mask_tensor(video_tensor,MaskType.background_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 461, in Semantic_mask
+ return get_mask_tensor(video_tensor,MaskType.Semantic_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 464, in bbox_mask
+ return get_mask_tensor(video_tensor,MaskType.bbox_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+09/09/2024 12:30:43 - INFO - __main__ - init trackers...
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 476, in fixed_bg_mask
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+wandb: Currently logged in as: pkuhxy (pkuhxy-Peking University). Use `wandb login --relogin` to force relogin
+wandb: Waiting for wandb.init()...
+wandb: wandb version 0.17.9 is available! To upgrade, please run:
+wandb: $ pip install wandb --upgrade
+wandb: Tracking run with wandb version 0.16.3
+wandb: Run data is saved locally in /home/image_data/hxy/Open-Sora-Plan/wandb/run-20240909_123048-adfh19sk
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run colorful-firefly-2
+wandb: ⭐️ View project at https://wandb.ai/pkuhxy-Peking%20University/inpaint_93x320x320_stage1
+wandb: 🚀 View run at https://wandb.ai/pkuhxy-Peking%20University/inpaint_93x320x320_stage1/runs/adfh19sk
+09/09/2024 12:30:50 - INFO - __main__ - ***** Running training *****
+09/09/2024 12:30:50 - INFO - __main__ - Model = DeepSpeedEngine(
+ (module): OpenSoraInpaint(
+ (pos_embed): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (transformer_blocks): ModuleList(
+ (0-31): 32 x BasicTransformerBlock(
+ (norm1): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn1): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (norm2): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn2): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (ff): FeedForward(
+ (net): ModuleList(
+ (0): GELU(
+ (proj): Linear(in_features=2304, out_features=9216, bias=True)
+ )
+ (1): Dropout(p=0.0, inplace=False)
+ (2): Linear(in_features=9216, out_features=2304, bias=True)
+ )
+ )
+ )
+ )
+ (norm_out): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (proj_out): Linear(in_features=2304, out_features=32, bias=True)
+ (adaln_single): AdaLayerNormSingle(
+ (emb): PixArtAlphaCombinedTimestepSizeEmbeddings(
+ (time_proj): Timesteps()
+ (timestep_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (caption_projection): PixArtAlphaTextProjection(
+ (linear_1): Linear(in_features=4096, out_features=2304, bias=True)
+ (act_1): GELU(approximate='tanh')
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ (motion_projection): MotionAdaLayerNormSingle(
+ (emb): MotionEmbeddings(
+ (motion_proj): Timesteps()
+ (motion_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (pos_embed_mask): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(4, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ (pos_embed_masked_hidden_states): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ )
+)
+09/09/2024 12:30:50 - INFO - __main__ - Num examples = 474899
+09/09/2024 12:30:50 - INFO - __main__ - Num Epochs = 17
+09/09/2024 12:30:50 - INFO - __main__ - Instantaneous batch size per device = 1
+09/09/2024 12:30:50 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8
+09/09/2024 12:30:50 - INFO - __main__ - Gradient Accumulation steps = 1
+09/09/2024 12:30:50 - INFO - __main__ - Total optimization steps = 1000000
+09/09/2024 12:30:50 - INFO - __main__ - Total optimization steps (num_update_steps_per_epoch) = 59362
+09/09/2024 12:30:50 - INFO - __main__ - Total trainable parameters = 2.8204808 B
+Steps: 0%| | 0/1000000 [00:00<?, ?it/s]
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+[] -> [195210]
+shuffled_megabatches 59363
+have been trained idx: 0
+after shuffled_megabatches 59363
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 476, in fixed_bg_mask
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 941, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 770, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 743, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 452, in __iter__
+ current_batch = next(dataloader_iter)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
+ data = self._next_data()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+ return self.collate_fn(data)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/dataset_utils.py", line 94, in __call__
+ masked_video,video,mask = self.mask_processor(pixed_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 535, in __call__
+ masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 476, in fixed_bg_mask
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 382, in get_mask_tensor
+ masked_video_container,masks_container = get_mask(video_tensor,mask_type,yolomodel)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/utils/mask_utils.py", line 209, in get_mask
+ result = tracker.track(frame_tensor,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False)
+NameError: name 'tracker' is not defined
+wandb: 0.020 MB of 0.039 MB uploaded
+[2024-09-09 12:31:04,770] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 401 closing signal SIGTERM
+[2024-09-09 12:31:04,771] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 403 closing signal SIGTERM
+[2024-09-09 12:31:04,771] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 405 closing signal SIGTERM
+wandb: 0.039 MB of 0.039 MB uploaded
+Process ForkServerProcess-2:
+Process ForkServerProcess-6:
+Process ForkServerProcess-8:
+Process ForkServerProcess-3:
+Process ForkServerProcess-4:
+Process ForkServerProcess-9:
+Process ForkServerProcess-5:
+Process ForkServerProcess-7:
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+EOFError
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+EOFError
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+Traceback (most recent call last):
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+EOFError
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+EOFError
+EOFError
+EOFError
+EOFError
+scripts/text_condition/npu/train_inpaint_sparse1d_newmodel_motion.sh: line 81: 203 Killed accelerate launch --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml --machine_rank=${MACHINE_RANK} --main_process_ip=${MAIN_PROCESS_IP_VALUE} opensora/train/train_inpaint.py --model OpenSoraInpaint-L/122 --text_encoder_name google/mt5-xxl --cache_dir "../../cache_dir/" --dataset inpaint --data "scripts/train_data/video_data_debug.txt" --ae WFVAEModel_D8_4x8x8 --ae_path "/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL" --sample_rate 1 --num_frames 93 --max_height 320 --max_width 320 --interpolation_scale_t 1.0 --interpolation_scale_h 1.0 --interpolation_scale_w 1.0 --attention_mode xformers --gradient_checkpointing --train_batch_size=1 --dataloader_num_workers 0 --gradient_accumulation_steps=1 --max_train_steps=1000000 --learning_rate=1e-5 --lr_scheduler="constant" --lr_warmup_steps=0 --mixed_precision="bf16" --report_to="wandb" --checkpointing_steps=1000 --allow_tf32 --model_max_length 512 --use_image_num 0 --snr_gamma 5.0 --use_ema --ema_start_step 0 --cfg 0.1 --noise_offset 0.0 --use_rope --skip_low_resolution --speed_factor 1.0 --ema_decay 0.9999 --drop_short_ratio 0.0 --hw_stride 32 --sparse1d --sparse_n 4 --use_motion --train_fps 16 --seed 1234 --trained_data_global_step 0 --group_data --use_decord --prediction_type "v_prediction" --rescale_betas_zero_snr --t2v_ratio 0.0 --i2v_ratio 0.0 --transition_ratio 0.0 --v2v_ratio 0.0 --Semantic_ratio 0.2 --bbox_ratio 0.2 --background_ratio 0.2 --fixed_ratio 0.1 --Semantic_expansion_ratio 0.1 --fixed_bg_ratio 0.1 --clear_video_ratio 0.0 --min_clear_ratio 0.25 --default_text_ratio 0.0 --output_dir /home/save_dir/runs/$PROJECT --pretrained_transformer_model_path "/home/image_data/captions/vpre_latest_134k/model_ema" --yolomodel_pathorname "/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt"
diff --git a/log_inpaint_93x320x320_stage1_swap.txt b/log_inpaint_93x320x320_stage1_swap.txt
new file mode 100644
index 000000000..21e3d37c0
--- /dev/null
+++ b/log_inpaint_93x320x320_stage1_swap.txt
@@ -0,0 +1,2454 @@
+[2024-09-05 19:39:28,530] torch.distributed.run: [WARNING]
+[2024-09-05 19:39:28,530] torch.distributed.run: [WARNING] *****************************************
+[2024-09-05 19:39:28,530] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+[2024-09-05 19:39:28,530] torch.distributed.run: [WARNING] *****************************************
+[2024-09-05 19:39:34,260] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:209: ImportWarning:
+ *************************************************************************************************************
+ The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
+ The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
+ The backend in torch.distributed.init_process_group set to hccl now..
+ The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
+ The device parameters have been replaced with npu in the function below:
+ torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty
+ *************************************************************************************************************
+
+ warnings.warn(msg, ImportWarning)
+[2024-09-05 19:39:34,466] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+[2024-09-05 19:39:34,501] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-05 19:39:34,542] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-05 19:39:34,598] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-05 19:39:34,666] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+[2024-09-05 19:39:34,677] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+[2024-09-05 19:39:34,718] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to npu (auto detect)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py:171: RuntimeWarning: torch.jit.script will be disabled by transfer_to_npu, which currently does not support it.
+ warnings.warn(msg, RuntimeWarning)
+skip replace _has_inf_or_nan
+skip replace _DeepSpeedEngine__check_params
+skip replace __init__
+skip replace _change_recovery_script_permissions
+skip replace _copy_recovery_script
+skip replace _get_expert_ckpt_name
+skip replace is_iterable_style_dataset
+skip replace is_map_style_dataset
+skip replace load_moe_state_dict
+pid 400's current affinity list: 0-191
+pid 400's new affinity list: 48-71
+[... the 'skip replace' block above repeats once per process; further copies omitted ...]
+The npu_config.on_npu is True
+pid 398's current affinity list: 0-191
+pid 398's new affinity list: 0-23
+pid 403's current affinity list: 0-191
+pid 403's new affinity list: 120-143
+pid 404's current affinity list: 0-191
+pid 404's new affinity list: 144-167
+pid 401's current affinity list: 0-191
+pid 401's new affinity list: 72-95
+pid 402's current affinity list: 0-191
+pid 402's new affinity list: 96-119
+pid 405's current affinity list: 0-191
+pid 405's new affinity list: 168-191
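The affinity lines above show each of the 8 worker processes being pinned from the full 0-191 core range to its own 24-core slice (pid 398 → 0-23 through pid 405 → 168-191). A sketch of that pinning via the standard Linux affinity call, assuming the usual `local_rank * cores_per_rank` layout; the log does not show whether the run used taskset or an in-process call:

```python
# Sketch of the per-rank CPU pinning visible above; hypothetical helper,
# not code from the repository.
import os

def pin_rank_to_cores(local_rank: int, cores_per_rank: int = 24) -> None:
    start = local_rank * cores_per_rank
    os.sched_setaffinity(0, range(start, start + cores_per_rank))  # 0 = this process; Linux-only

pin_rank_to_cores(int(os.environ.get("LOCAL_RANK", "0")))
print(os.sched_getaffinity(0))  # e.g. {0, 1, ..., 23} on local rank 0
```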
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/lightning_fabric/__init__.py:41: Deprecated call to `pkg_resources.declare_namespace('lightning_fabric')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+pid 399's current affinity list: 0-191
+pid 399's new affinity list: 24-47
+[... the lightning_fabric warning above repeats once per process; further copies omitted ...]
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/pytorch_lightning/__init__.py:37: Deprecated call to `pkg_resources.declare_namespace('pytorch_lightning')`.
+Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
+[... likewise repeated once per process; further copies omitted ...]
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:20: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 0.29. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead.
+ deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/models/transformer_2d.py:25: FutureWarning: `Transformer2DModel` is deprecated and will be removed in version 0.29. Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead.
+ deprecate("Transformer2DModel", "0.29", deprecation_message)
+[RANK-0]: Namespace(dataset='inpaint', data='scripts/train_data/video_data_debug.txt', sample_rate=1, train_fps=16, drop_short_ratio=0.0, speed_factor=1.0, num_frames=93, max_height=320, max_width=320, use_img_from_vid=False, use_image_num=0, model_max_length=512, cfg=0.1, dataloader_num_workers=8, train_batch_size=1, group_data=True, hw_stride=32, skip_low_resolution=True, force_resolution=False, trained_data_global_step=0, use_decord=True, model='OpenSoraInpaint-L/122', enable_8bit_t5=False, tile_overlap_factor=0.125, enable_tiling=False, compress_kv=False, attention_mode='xformers', use_rope=True, compress_kv_factor=1, interpolation_scale_h=1.0, interpolation_scale_w=1.0, interpolation_scale_t=1.0, downsampler=None, ae='WFVAEModel_D8_4x8x8', ae_path='/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL', text_encoder_name='google/mt5-xxl', cache_dir='../../cache_dir/', pretrained=None, enable_stable_fp32=False, sparse1d=True, sparse2d=False, sparse_n=4, tile_sample_min_size=512, tile_sample_min_size_t=33, adapt_vae=False, use_motion=True, gradient_checkpointing=True, snr_gamma=5.0, use_ema=True, ema_decay=0.9999, ema_start_step=0, noise_offset=0.0, prediction_type='v_prediction', rescale_betas_zero_snr=True, num_sampling_steps=50, guidance_scale=2.5, enable_tracker=False, seed=1234, output_dir='/home/save_dir/runs/inpaint_93x320x320_stage1_swap', checkpoints_total_limit=None, checkpointing_steps=1000, resume_from_checkpoint=None, logging_dir='logs', report_to='wandb', num_train_epochs=100, max_train_steps=1000000, gradient_accumulation_steps=1, optimizer='adamW', learning_rate=1e-05, scale_lr=False, lr_warmup_steps=0, use_8bit_adam=False, adam_beta1=0.9, adam_beta2=0.999, prodigy_decouple=True, adam_weight_decay=0.01, adam_weight_decay_text_encoder=None, adam_epsilon=1e-08, prodigy_use_bias_correction=True, prodigy_safeguard_warmup=True, max_grad_norm=1.0, prodigy_beta3=None, lr_scheduler='constant', allow_tf32=True, mixed_precision='bf16', local_rank=-1, sp_size=1, train_sp_batch_size=1, t2v_ratio=0.0, i2v_ratio=0.0, transition_ratio=0.0, v2v_ratio=0.0, clear_video_ratio=0.0, Semantic_ratio=0.2, bbox_ratio=0.2, background_ratio=0.2, fixed_ratio=0.1, Semantic_expansion_ratio=0.1, fixed_bg_ratio=0.1, min_clear_ratio=0.25, default_text_ratio=0.0, pretrained_transformer_model_path='/home/image_data/captions/vpre_latest_134k/model_ema')
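The `[RANK-0]` Namespace dump records every parsed argument of the run. For readability, a partial reconstruction of the launch flags it implies, assuming the usual argparse dest-to-`--flag` mapping; values are copied verbatim from the dump, and the flag spellings themselves are an assumption:

```python
# Partial, illustrative reconstruction of the launch flags behind the Namespace
# above; assumes standard argparse naming. Not a verbatim command from the log.
launch_args = [
    "--dataset", "inpaint",
    "--data", "scripts/train_data/video_data_debug.txt",
    "--model", "OpenSoraInpaint-L/122",
    "--ae", "WFVAEModel_D8_4x8x8",
    "--text_encoder_name", "google/mt5-xxl",
    "--num_frames", "93", "--max_height", "320", "--max_width", "320",
    "--train_batch_size", "1", "--learning_rate", "1e-05",
    "--mixed_precision", "bf16", "--prediction_type", "v_prediction",
    "--snr_gamma", "5.0",
    "--output_dir", "/home/save_dir/runs/inpaint_93x320x320_stage1_swap",
]
```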
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/deepspeed/comm/comm.py:163: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
+ utils.logger.warn("HCCL backend in DeepSpeed not yet implemented")
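The DeprecationWarning above concerns the logging call itself, not HCCL: `Logger.warn` is a long-deprecated alias of `Logger.warning` in the standard library, so the fix on DeepSpeed's side is a one-word rename:

```python
# Stdlib-level illustration of the deprecation flagged above.
import logging

logger = logging.getLogger("DeepSpeed")
logger.warning("HCCL backend in DeepSpeed not yet implemented")  # preferred
# logger.warn(...) emits the same record plus the DeprecationWarning
```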
+[2024-09-05 19:39:47,954] [WARNING] [comm.py:163:init_deepspeed_backend] HCCL backend in DeepSpeed not yet implemented
+[2024-09-05 19:39:47,954] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-09-05 19:39:47,954] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend hccl
+Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+09/05/2024 19:39:47 - INFO - __main__ - Distributed environment: DEEPSPEED Backend: hccl
+Num processes: 8
+Process index: 0
+Local process index: 0
+Device: npu:0
+
+Mixed precision type: bf16
+ds_config: {'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True}, 'communication_data_type': 'fp32', 'gradient_clipping': 1.0, 'train_micro_batch_size_per_gpu': 'auto', 'train_batch_size': 'auto', 'gradient_accumulation_steps': 'auto', 'zero_optimization': {'stage': 2, 'overlap_comm': True, 'allgather_bucket_size': 536870912, 'contiguous_gradients': True, 'reduce_bucket_size': 536870912}, 'steps_per_print': inf}
+
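The single-line `ds_config` dump above is the effective DeepSpeed configuration shared by every rank. Laid out for readability (content unchanged; the "auto" fields are resolved by accelerate at launch):

```python
# The logged ds_config, reformatted: bf16 training with ZeRO stage 2 and
# 512 MiB communication buckets (536870912 bytes = 512 * 1024 * 1024).
ds_config = {
    "fp16": {"enabled": False, "loss_scale": 0, "loss_scale_window": 1000,
             "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1},
    "bf16": {"enabled": True},
    "communication_data_type": "fp32",   # reduce gradients in fp32 for stability
    "gradient_clipping": 1.0,
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,                      # shard optimizer states and gradients
        "overlap_comm": True,            # overlap reduction with the backward pass
        "allgather_bucket_size": 536870912,
        "contiguous_gradients": True,
        "reduce_bucket_size": 536870912,
    },
    "steps_per_print": float("inf"),     # suppress DeepSpeed's periodic step print
}
```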
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/diffusers/configuration_utils.py:244: FutureWarning: It is deprecated to pass a pretrained model name or path to `from_config`.If you were trying to load a model, please use .load_config(...) followed by .from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary instead. This functionality will be removed in v1.0.0.
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+[... analogous startup output for ranks 1-7 omitted: each prints the same diffusers FutureWarnings, an identical [RANK-N] Namespace, the same HCCL warning, and the same bf16/ZeRO-2 ds_config, differing only in timestamp, process index, and device (npu:1 through npu:7) ...]
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+ return self.fget.__get__(instance, owner)()
+[... the checkpoint-init, key-list, and TypedStorage messages above repeat once per rank; further copies omitted ...]
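The TypedStorage UserWarning, emitted while the VAE weights are materialized, also states its replacement API:

```python
# Forward-compatible storage access per the warning above.
import torch

t = torch.zeros(4)
buf = t.untyped_storage()  # replaces the deprecated t.storage()
print(buf.nbytes())        # byte size of the underlying buffer
```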
+init from /home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL/wfvae.ckpt
+Load from ema model!
+['encoder.wavelet_tranform_3d.h_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_conv.conv.weight', 'encoder.wavelet_tranform_3d.h_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.g_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.hh_v_conv.conv.weight', 'encoder.wavelet_tranform_3d.gh_v_conv.conv.weight', 'encoder.wavelet_tranform_2d.aa', 'encoder.wavelet_tranform_2d.ad', 'encoder.wavelet_tranform_2d.da', 'encoder.wavelet_tranform_2d.dd', 'decoder.inverse_wavelet_tranform_3d.h', 'decoder.inverse_wavelet_tranform_3d.g', 'decoder.inverse_wavelet_tranform_3d.hh', 'decoder.inverse_wavelet_tranform_3d.gh', 'decoder.inverse_wavelet_tranform_3d.h_v', 'decoder.inverse_wavelet_tranform_3d.g_v', 'decoder.inverse_wavelet_tranform_3d.hh_v', 'decoder.inverse_wavelet_tranform_3d.gh_v', 'decoder.inverse_wavelet_tranform_2d.aa', 'decoder.inverse_wavelet_tranform_2d.ad', 'decoder.inverse_wavelet_tranform_2d.da', 'decoder.inverse_wavelet_tranform_2d.dd'] []
+Loading OpenSoraInpaint pretrained weights...
+Loading pretrained model from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors...
+missing_keys 4 ['pos_embed_mask.0.proj.weight', 'pos_embed_mask.0.proj.bias', 'pos_embed_mask.1.weight', 'pos_embed_masked_hidden_states.1.weight'], unexpected_keys 0
+Successfully load 695/699 keys from /home/image_data/captions/vpre_latest_134k/model_ema/diffusion_pytorch_model.safetensors!
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+[interleaved tqdm progress bars from the 8 ranks elided: each rank scans the 478625-entry dataset index at ~32k it/s, finishing in ~15 s (~21 s per step of the outer 1-step bar); per-rank repeats of the "Loading OpenSoraInpaint pretrained weights..." / "Successfully load 695/699 keys" messages above are also elided]
+time 21.760502099990845
+n_elements: 474899
+data length: 474899
+time 21.66859006881714
+n_elements: 474899
+data length: 474899
+09/05/2024 19:41:57 - INFO - __main__ - optimizer: AdamW (
+Parameter Group 0
+ amsgrad: False
+ betas: (0.9, 0.999)
+ capturable: False
+ differentiable: False
+ eps: 1e-08
+ foreach: False
+ fused: None
+ lr: 1e-05
+ maximize: False
+ weight_decay: 0.01
+)
+You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
0%| | 0/1 [00:00, ?it/s]09/05/2024 19:42:05 - INFO - opensora.dataset.t2v_datasets - Building /home/image_data/captions/TV01_clips_final_478625_llavanext_217405_aes478625.json...
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
0%| | 0/478625 [00:00, ?it/s][A
+
1%| | 3313/478625 [00:00<00:14, 33125.06it/s][A
+
1%|▏ | 6626/478625 [00:00<00:14, 32474.06it/s][A
0%| | 0/1 [00:00, ?it/s]
+
2%|▏ | 9996/478625 [00:00<00:14, 33024.73it/s][A
+
3%|▎ | 13366/478625 [00:00<00:13, 33288.44it/s][A
+
3%|▎ | 16696/478625 [00:00<00:14, 32443.20it/s][A
+
4%|▍ | 20075/478625 [00:00<00:13, 32888.71it/s][A
+
5%|▍ | 23368/478625 [00:00<00:14, 32372.55it/s][A
0%| | 0/1 [00:00, ?it/s]
+
6%|▌ | 26754/478625 [00:00<00:13, 32833.92it/s][A
0%| | 0/1 [00:00, ?it/s]
+
6%|▋ | 30131/478625 [00:00<00:13, 33120.14it/s][A
+
7%|▋ | 33446/478625 [00:01<00:13, 32451.60it/s][A
0%| | 0/1 [00:00, ?it/s]
+
8%|▊ | 36829/478625 [00:01<00:13, 32862.46it/s][A
+
8%|▊ | 40119/478625 [00:01<00:13, 32267.72it/s][A
+
9%|▉ | 43487/478625 [00:01<00:13, 32684.39it/s][A/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.py:550: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
+ warnings.warn(
+
+
10%|▉ | 46850/478625 [00:01<00:13, 32964.82it/s][A
+
10%|█ | 50150/478625 [00:01<00:13, 32440.52it/s][A
+
11%|█ | 53530/478625 [00:01<00:12, 32840.00it/s][A
+
12%|█▏ | 56818/478625 [00:01<00:13, 32285.96it/s][A
+
13%|█▎ | 60198/478625 [00:01<00:12, 32727.35it/s][A
+
13%|█▎ | 63578/478625 [00:01<00:12, 33042.05it/s][A
+
14%|█▍ | 66886/478625 [00:02<00:12, 32491.49it/s][A
+
15%|█▍ | 70247/478625 [00:02<00:12, 32818.80it/s][A
+
15%|█▌ | 73533/478625 [00:02<00:12, 32251.40it/s][A
+
16%|█▌ | 76929/478625 [00:02<00:12, 32751.55it/s][A
+
17%|█▋ | 80278/478625 [00:02<00:12, 32967.39it/s][A
+
17%|█▋ | 83578/478625 [00:02<00:12, 32447.99it/s][A
+
18%|█▊ | 86956/478625 [00:02<00:11, 32837.12it/s][A
+
19%|█▉ | 90339/478625 [00:02<00:11, 33127.77it/s][A
+
20%|█▉ | 93655/478625 [00:02<00:11, 32466.89it/s][A
0%| | 0/1 [00:00, ?it/s]
+
20%|██ | 97021/478625 [00:02<00:11, 32816.55it/s][A
+
21%|██ | 100307/478625 [00:03<00:11, 32305.66it/s][A
+
22%|██▏ | 103682/478625 [00:03<00:11, 32726.50it/s][A
+
22%|██▏ | 107056/478625 [00:03<00:11, 33024.41it/s][A
+
23%|██▎ | 110362/478625 [00:03<00:11, 32433.74it/s][A
+
24%|██▍ | 113691/478625 [00:03<00:11, 32684.62it/s][A
+
24%|██▍ | 116963/478625 [00:03<00:11, 32287.75it/s][A
+
25%|██▌ | 120336/478625 [00:03<00:10, 32711.11it/s][A
+
26%|██▌ | 123710/478625 [00:03<00:10, 33012.32it/s][A
+
27%|██▋ | 127014/478625 [00:03<00:10, 32414.60it/s][A
+
27%|██▋ | 130396/478625 [00:03<00:10, 32825.32it/s][A
+
28%|██▊ | 133682/478625 [00:04<00:10, 32364.12it/s][A
+
29%|██▊ | 137054/478625 [00:04<00:10, 32760.03it/s][A
+
29%|██▉ | 140434/478625 [00:04<00:10, 33066.54it/s][A
+
30%|███ | 143744/478625 [00:04<00:10, 32396.58it/s][A
+
31%|███ | 147094/478625 [00:04<00:10, 32719.67it/s][A
+
31%|███▏ | 150370/478625 [00:04<00:10, 32226.21it/s][A
+
32%|███▏ | 153734/478625 [00:04<00:09, 32640.75it/s][A
+
33%|███▎ | 157134/478625 [00:04<00:09, 33041.79it/s][A
+
34%|███▎ | 160442/478625 [00:04<00:09, 32386.85it/s][A
+
34%|███▍ | 163783/478625 [00:05<00:09, 32685.56it/s][A
+
35%|███▍ | 167056/478625 [00:05<00:09, 32298.42it/s][A
+
36%|███▌ | 170418/478625 [00:05<00:09, 32685.64it/s][A
+
36%|███▋ | 173789/478625 [00:05<00:09, 32987.82it/s][A
+
37%|███▋ | 177091/478625 [00:05<00:09, 32446.91it/s][A
+
38%|███▊ | 180441/478625 [00:05<00:09, 32732.21it/s][A
+
38%|███▊ | 183718/478625 [00:05<00:09, 32150.71it/s][A
+
39%|███▉ | 187058/478625 [00:05<00:08, 32515.34it/s][A
+
40%|███▉ | 190460/478625 [00:05<00:08, 32959.27it/s][A
+
40%|████ | 193759/478625 [00:05<00:08, 32456.05it/s][A
+
41%|████ | 197123/478625 [00:06<00:08, 32801.59it/s][A
+
42%|████▏ | 200407/478625 [00:06<00:08, 32281.21it/s][A
+
43%|████▎ | 203768/478625 [00:06<00:08, 32669.04it/s][A
+
43%|████▎ | 207109/478625 [00:06<00:08, 32887.17it/s][A
+
44%|████▍ | 210401/478625 [00:06<00:08, 32344.12it/s][A
+
45%|████▍ | 213763/478625 [00:06<00:08, 32718.69it/s][A
+
45%|████▌ | 217038/478625 [00:06<00:08, 32240.71it/s][A
+
46%|████▌ | 220430/478625 [00:06<00:07, 32733.52it/s][A
+
47%|████▋ | 223820/478625 [00:06<00:07, 33077.23it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
47%|████▋ | 227131/478625 [00:06<00:07, 32449.12it/s][A
+
1%| | 3324/478625 [00:00<00:14, 33236.94it/s][A
+
48%|████▊ | 230424/478625 [00:07<00:07, 32587.43it/s][A
+
1%|▏ | 6648/478625 [00:00<00:14, 32531.88it/s][A
+
49%|████▉ | 233802/478625 [00:07<00:07, 32938.23it/s][A
+
2%|▏ | 9903/478625 [00:00<00:14, 32191.73it/s][A
+
50%|████▉ | 237099/478625 [00:07<00:07, 32398.99it/s][A
+
3%|▎ | 13181/478625 [00:00<00:14, 32417.46it/s][A
+
50%|█████ | 240468/478625 [00:07<00:07, 32777.68it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
3%|▎ | 16424/478625 [00:00<00:14, 31346.46it/s][A
+
51%|█████ | 243749/478625 [00:07<00:07, 32220.33it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
1%| | 3175/478625 [00:00<00:14, 31740.80it/s][A
+
4%|▍ | 19672/478625 [00:00<00:14, 31719.72it/s][A
+
52%|█████▏ | 247123/478625 [00:07<00:07, 32665.55it/s][A
+
1%| | 3300/478625 [00:00<00:14, 32993.74it/s][A
+
5%|▍ | 22878/478625 [00:00<00:14, 31435.92it/s][A
+
1%|▏ | 6350/478625 [00:00<00:15, 29866.97it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
52%|█████▏ | 250483/478625 [00:07<00:06, 32938.62it/s][A
+
1%|▏ | 6600/478625 [00:00<00:14, 32493.47it/s][A
+
5%|▌ | 26177/478625 [00:00<00:14, 31919.06it/s][A
+
2%|▏ | 9597/478625 [00:00<00:15, 31010.92it/s][A
+
1%| | 3214/478625 [00:00<00:14, 32136.20it/s][A
+
53%|█████▎ | 253780/478625 [00:07<00:06, 32368.76it/s][A
+
2%|▏ | 9980/478625 [00:00<00:14, 33083.20it/s][A
+
6%|▌ | 29557/478625 [00:00<00:13, 32496.93it/s][A
+
3%|▎ | 12896/478625 [00:00<00:14, 31774.66it/s][A
+
1%|▏ | 6428/478625 [00:00<00:14, 31674.22it/s][A
+
54%|█████▎ | 257123/478625 [00:07<00:06, 32680.39it/s][A
+
3%|▎ | 13334/478625 [00:00<00:13, 33261.10it/s][A
+
7%|▋ | 32810/478625 [00:01<00:13, 32059.10it/s][A
+
3%|▎ | 16080/478625 [00:00<00:14, 31165.62it/s][A
+
2%|▏ | 9739/478625 [00:00<00:14, 32323.00it/s][A
+
54%|█████▍ | 260395/478625 [00:07<00:06, 32059.46it/s][A
+
3%|▎ | 16661/478625 [00:00<00:14, 32430.21it/s][A
+
8%|▊ | 36213/478625 [00:01<00:13, 32649.60it/s][A
+
4%|▍ | 19371/478625 [00:00<00:14, 31744.71it/s][A
+
3%|▎ | 13062/478625 [00:00<00:14, 32678.80it/s][A
+
55%|█████▌ | 263736/478625 [00:08<00:06, 32452.83it/s][A
+
4%|▍ | 20054/478625 [00:00<00:13, 32926.57it/s][A
+
8%|▊ | 39482/478625 [00:01<00:13, 32557.33it/s][A
+
5%|▍ | 22663/478625 [00:00<00:14, 32120.40it/s][A
+
3%|▎ | 16331/478625 [00:00<00:14, 31820.29it/s][A
+
56%|█████▌ | 267099/478625 [00:08<00:06, 32798.12it/s][A
+
5%|▍ | 23351/478625 [00:00<00:14, 32323.88it/s][A
+
9%|▉ | 42740/478625 [00:01<00:13, 32090.21it/s][A
+
5%|▌ | 25879/478625 [00:00<00:14, 31498.58it/s][A
+
4%|▍ | 19636/478625 [00:00<00:14, 32227.85it/s][A
+
56%|█████▋ | 270383/478625 [00:08<00:06, 32274.68it/s][A
+
6%|▌ | 26725/478625 [00:00<00:13, 32764.01it/s][A
+
10%|▉ | 46103/478625 [00:01<00:13, 32520.35it/s][A
+
6%|▌ | 29173/478625 [00:00<00:14, 31938.19it/s][A
+
5%|▍ | 22882/478625 [00:00<00:14, 31677.91it/s][A
+
57%|█████▋ | 273742/478625 [00:08<00:06, 32659.21it/s][A
+
6%|▋ | 30108/478625 [00:00<00:13, 33089.10it/s][A
+
10%|█ | 49358/478625 [00:01<00:13, 31642.28it/s][A
+
7%|▋ | 32371/478625 [00:01<00:14, 31420.22it/s][A
+
5%|▌ | 26226/478625 [00:00<00:14, 32224.66it/s][A
+
58%|█████▊ | 277012/478625 [00:08<00:06, 32139.80it/s][A
+
7%|▋ | 33420/478625 [00:01<00:13, 32546.24it/s][A
+
11%|█ | 52686/478625 [00:01<00:13, 32119.76it/s][A
+
7%|▋ | 35659/478625 [00:01<00:13, 31854.20it/s][A
+
6%|▌ | 29530/478625 [00:00<00:13, 32475.23it/s][A
+
59%|█████▊ | 280371/478625 [00:08<00:06, 32565.25it/s][A
+
8%|▊ | 36795/478625 [00:01<00:13, 32904.56it/s][A
+
12%|█▏ | 55911/478625 [00:01<00:13, 32157.36it/s][A
+
8%|▊ | 38952/478625 [00:01<00:13, 32174.05it/s][A
+
7%|▋ | 32781/478625 [00:01<00:13, 31852.81it/s][A
+
59%|█████▉ | 283773/478625 [00:08<00:05, 32994.74it/s][A
+
8%|▊ | 40089/478625 [00:01<00:13, 32294.93it/s][A
+
12%|█▏ | 59131/478625 [00:01<00:13, 31705.04it/s][A
+
9%|▉ | 42173/478625 [00:01<00:13, 31534.32it/s][A
+
8%|▊ | 36099/478625 [00:01<00:13, 32249.95it/s][A
+
60%|█████▉ | 287076/478625 [00:08<00:05, 32442.34it/s][A
+
9%|▉ | 43460/478625 [00:01<00:13, 32713.45it/s][A
+
13%|█▎ | 62467/478625 [00:01<00:12, 32190.21it/s][A
+
9%|▉ | 45448/478625 [00:01<00:13, 31891.12it/s][A
+
8%|▊ | 39390/478625 [00:01<00:13, 32445.75it/s][A
+
61%|██████ | 290448/478625 [00:08<00:05, 32817.40it/s][A
+
10%|▉ | 46825/478625 [00:01<00:13, 32990.29it/s][A
+
10%|█ | 48641/478625 [00:01<00:13, 31484.20it/s][A
+
14%|█▎ | 65690/478625 [00:02<00:13, 31465.72it/s][A
+
9%|▉ | 42638/478625 [00:01<00:13, 31815.36it/s][A
+
61%|██████▏ | 293734/478625 [00:09<00:05, 32087.36it/s][A
+
10%|█ | 50128/478625 [00:01<00:13, 32444.84it/s][A
+
11%|█ | 51911/478625 [00:01<00:13, 31840.63it/s][A
+
14%|█▍ | 69009/478625 [00:02<00:12, 31966.84it/s][A
+
10%|▉ | 45969/478625 [00:01<00:13, 32255.36it/s][A
+
62%|██████▏ | 297134/478625 [00:09<00:05, 32644.87it/s][A
+
11%|█ | 53480/478625 [00:01<00:12, 32759.25it/s][A
+
12%|█▏ | 55215/478625 [00:01<00:13, 32193.32it/s][A
+
15%|█▌ | 72272/478625 [00:02<00:12, 32161.32it/s][A
+
10%|█ | 49199/478625 [00:01<00:13, 31739.62it/s][A
+
63%|██████▎ | 300511/478625 [00:09<00:05, 32975.40it/s][A
+
12%|█▏ | 56760/478625 [00:01<00:13, 32223.37it/s][A
+
12%|█▏ | 58437/478625 [00:01<00:13, 31613.49it/s][A
+
16%|█▌ | 75492/478625 [00:02<00:12, 31439.15it/s][A
+
11%|█ | 52518/478625 [00:01<00:13, 32165.41it/s][A
+
63%|██████▎ | 303813/478625 [00:09<00:05, 32365.29it/s][A
+
13%|█▎ | 60157/478625 [00:01<00:12, 32736.43it/s][A
+
13%|█▎ | 61734/478625 [00:01<00:13, 32011.79it/s][A
+
16%|█▋ | 78727/478625 [00:02<00:12, 31704.71it/s][A
+
12%|█▏ | 55865/478625 [00:01<00:12, 32550.71it/s][A
+
64%|██████▍ | 307176/478625 [00:09<00:05, 32734.05it/s][A
+
13%|█▎ | 63541/478625 [00:01<00:12, 33060.70it/s][A
+
14%|█▎ | 65027/478625 [00:02<00:12, 32281.30it/s][A
+
17%|█▋ | 81982/478625 [00:02<00:12, 31195.84it/s][A
+
12%|█▏ | 59124/478625 [00:01<00:13, 31888.16it/s][A
+
0%| | 0/478625 [00:00, ?it/s][A
+
65%|██████▍ | 310454/478625 [00:09<00:05, 32283.78it/s][A
+
14%|█▍ | 66851/478625 [00:02<00:12, 32486.24it/s][A
+
14%|█▍ | 68258/478625 [00:02<00:12, 31701.61it/s][A
+
18%|█▊ | 85273/478625 [00:02<00:12, 31694.08it/s][A
+
13%|█▎ | 62459/478625 [00:01<00:12, 32316.94it/s][A
+
1%| | 3224/478625 [00:00<00:14, 32231.96it/s][A
+
66%|██████▌ | 313821/478625 [00:09<00:05, 32688.97it/s][A
+
15%|█▍ | 70237/478625 [00:02<00:12, 32889.71it/s][A
+
15%|█▍ | 71536/478625 [00:02<00:12, 32018.49it/s][A
+
19%|█▊ | 88603/478625 [00:02<00:12, 32163.68it/s][A
+
14%|█▎ | 65695/478625 [00:02<00:12, 31830.35it/s][A
+
1%|▏ | 6448/478625 [00:00<00:14, 31629.49it/s][A
+
66%|██████▋ | 317180/478625 [00:09<00:04, 32951.65it/s][A
+
15%|█▌ | 73530/478625 [00:02<00:12, 32371.61it/s][A
+
16%|█▌ | 74742/478625 [00:02<00:12, 31512.16it/s][A
+
19%|█▉ | 91824/478625 [00:02<00:12, 30741.50it/s][A
+
14%|█▍ | 69018/478625 [00:02<00:12, 32238.01it/s][A
+
2%|▏ | 9743/478625 [00:00<00:14, 32222.68it/s][A
+
67%|██████▋ | 320478/478625 [00:09<00:04, 32415.84it/s][A
+
16%|█▌ | 76915/478625 [00:02<00:12, 32805.31it/s][A
+
16%|█▋ | 77897/478625 [00:02<00:12, 31321.54it/s][A
+
20%|█▉ | 95061/478625 [00:02<00:12, 31208.86it/s][A
+
15%|█▌ | 72283/478625 [00:02<00:12, 32357.86it/s][A
+
3%|▎ | 13040/478625 [00:00<00:14, 32512.17it/s][A
+
68%|██████▊ | 323869/478625 [00:09<00:04, 32853.09it/s][A
+
17%|█▋ | 80272/478625 [00:02<00:12, 33030.19it/s][A
+
17%|█▋ | 81195/478625 [00:02<00:12, 31807.64it/s][A
+
21%|██ | 98212/478625 [00:03<00:12, 31295.65it/s][A
+
16%|█▌ | 75522/478625 [00:02<00:12, 31855.41it/s][A
+
3%|▎ | 16293/478625 [00:00<00:14, 31676.00it/s][A
+
68%|██████▊ | 327158/478625 [00:10<00:04, 32218.00it/s][A
+
17%|█▋ | 83578/478625 [00:02<00:12, 32467.33it/s][A
+
18%|█▊ | 84379/478625 [00:02<00:12, 31330.03it/s][A
+
21%|██ | 101351/478625 [00:03<00:12, 30987.48it/s][A
+
16%|█▋ | 78818/478625 [00:02<00:12, 32179.64it/s][A
+
4%|▍ | 19586/478625 [00:00<00:14, 32091.79it/s][A
+
69%|██████▉ | 330435/478625 [00:10<00:04, 32378.12it/s][A
+
18%|█▊ | 86934/478625 [00:02<00:11, 32786.53it/s][A
+
18%|█▊ | 87646/478625 [00:02<00:12, 31722.04it/s][A
+
22%|██▏ | 104631/478625 [00:03<00:11, 31517.73it/s][A
+
17%|█▋ | 82039/478625 [00:02<00:12, 31678.11it/s][A
+
5%|▍ | 22872/478625 [00:00<00:14, 32336.40it/s][A
+
70%|██████▉ | 333802/478625 [00:10<00:04, 32758.04it/s][A
+
19%|█▉ | 90216/478625 [00:02<00:12, 32313.43it/s][A
+
19%|█▉ | 90821/478625 [00:02<00:12, 31272.39it/s][A
+
23%|██▎ | 107789/478625 [00:03<00:11, 31258.31it/s][A
+
18%|█▊ | 85368/478625 [00:02<00:12, 32149.54it/s][A
+
5%|▌ | 26109/478625 [00:00<00:14, 31555.97it/s][A
+
+
70%|███████ | 337081/478625 [00:10<00:04, 32251.34it/s][A
20%|█▉ | 93536/478625 [00:02<00:11, 32572.58it/s][A
+
20%|█▉ | 94085/478625 [00:02<00:12, 31673.98it/s][A
+
23%|██▎ | 111058/478625 [00:03<00:11, 31678.36it/s][A
+
19%|█▊ | 88698/478625 [00:02<00:12, 32488.60it/s][A
+
6%|▌ | 29390/478625 [00:00<00:14, 31935.65it/s][A
+
+
71%|███████ | 340411/478625 [00:10<00:04, 32557.94it/s][A
20%|██ | 96893/478625 [00:02<00:11, 32864.78it/s][A
+
20%|██ | 97256/478625 [00:03<00:12, 31515.76it/s][A
+
24%|██▍ | 114232/478625 [00:03<00:11, 31694.13it/s][A
+
19%|█▉ | 91950/478625 [00:02<00:12, 31781.55it/s][A
+
7%|▋ | 32588/478625 [00:01<00:14, 31223.25it/s][A
+
72%|███████▏ | 343670/478625 [00:10<00:04, 32044.66it/s][A
+
21%|██ | 100182/478625 [00:03<00:11, 32331.40it/s][A
+
21%|██ | 100410/478625 [00:03<00:12, 31144.20it/s][A
+
25%|██▍ | 117404/478625 [00:03<00:11, 31095.57it/s][A
+
20%|█▉ | 95267/478625 [00:02<00:11, 32188.04it/s][A
+
7%|▋ | 35790/478625 [00:01<00:14, 31457.40it/s][A
+
73%|███████▎ | 347044/478625 [00:10<00:04, 32541.77it/s]
+[A
22%|██▏ | 103516/478625 [00:03<00:11, 32627.98it/s][A
+
22%|██▏ | 103685/478625 [00:03<00:11, 31616.34it/s][A
+
25%|██▌ | 120687/478625 [00:03<00:11, 31602.89it/s][A
+
21%|██ | 98594/478625 [00:03<00:11, 32505.37it/s][A
+
8%|▊ | 39075/478625 [00:01<00:13, 31839.91it/s][A
+
73%|███████▎ | 350409/478625 [00:10<00:03, 32867.45it/s][A
+
22%|██▏ | 106782/478625 [00:03<00:11, 32063.14it/s][A
+
22%|██▏ | 106945/478625 [00:03<00:11, 31906.82it/s][A
+
26%|██▌ | 124012/478625 [00:03<00:11, 32088.32it/s][A
+
21%|██▏ | 101849/478625 [00:03<00:11, 31816.53it/s][A
+
9%|▉ | 42263/478625 [00:01<00:13, 31312.76it/s][A
+
74%|███████▍ | 353699/478625 [00:10<00:03, 32295.19it/s]
+[A
23%|██▎ | 110135/478625 [00:03<00:11, 32490.16it/s][A
+
23%|██▎ | 110138/478625 [00:03<00:11, 31397.64it/s][A
+
27%|██▋ | 127225/478625 [00:04<00:11, 31232.08it/s][A
+
22%|██▏ | 105147/478625 [00:03<00:11, 32156.64it/s][A
+
10%|▉ | 45513/478625 [00:01<00:13, 31661.99it/s][A
+
75%|███████▍ | 357063/478625 [00:10<00:03, 32689.59it/s][A
+
24%|██▎ | 113508/478625 [00:03<00:11, 32854.15it/s][A
+
24%|██▎ | 113438/478625 [00:03<00:11, 31869.42it/s][A
+
27%|██▋ | 130491/478625 [00:04<00:11, 31641.33it/s][A
+
23%|██▎ | 108367/478625 [00:03<00:11, 31608.82it/s][A
+
10%|█ | 48683/478625 [00:01<00:13, 31284.27it/s][A
+
75%|███████▌ | 360449/478625 [00:11<00:03, 33032.50it/s][A
+
24%|██▍ | 116797/478625 [00:03<00:11, 32264.93it/s][A
+
24%|██▍ | 116628/478625 [00:03<00:11, 31083.19it/s][A
+
28%|██▊ | 133661/478625 [00:04<00:11, 30799.89it/s][A
+
23%|██▎ | 111662/478625 [00:03<00:11, 32000.81it/s][A
+
11%|█ | 51956/478625 [00:01<00:13, 31708.28it/s][A
+
76%|███████▌ | 363756/478625 [00:11<00:03, 32470.23it/s][A
+
25%|██▌ | 120157/478625 [00:03<00:10, 32655.67it/s][A
+
25%|██▌ | 119918/478625 [00:03<00:11, 31611.42it/s][A
+
29%|██▊ | 136974/478625 [00:04<00:10, 31474.48it/s][A
+
24%|██▍ | 114977/478625 [00:03<00:11, 32337.35it/s][A
+
12%|█▏ | 55248/478625 [00:01<00:13, 32064.05it/s][A
+
77%|███████▋ | 367090/478625 [00:11<00:03, 32725.40it/s][A
+
26%|██▌ | 123444/478625 [00:03<00:11, 32169.72it/s][A
+
26%|██▌ | 123187/478625 [00:03<00:11, 31927.36it/s][A
+
29%|██▉ | 140255/478625 [00:04<00:10, 31864.52it/s][A
+
25%|██▍ | 118215/478625 [00:03<00:11, 31713.74it/s][A
+
12%|█▏ | 58457/478625 [00:01<00:13, 31427.84it/s][A
+
26%|██▋ | 126808/478625 [00:03<00:10, 32598.46it/s][A
+
77%|███████▋ | 370366/478625 [00:11<00:03, 31956.41it/s][A
+
26%|██▋ | 126384/478625 [00:03<00:11, 31395.68it/s][A
+
30%|██▉ | 143448/478625 [00:04<00:10, 31387.56it/s][A
+
25%|██▌ | 121538/478625 [00:03<00:11, 32158.17it/s][A
+
13%|█▎ | 61687/478625 [00:01<00:13, 31683.93it/s][A
+
27%|██▋ | 130148/478625 [00:03<00:10, 32831.78it/s][A
+
78%|███████▊ | 373750/478625 [00:11<00:03, 32504.40it/s][A
+
27%|██▋ | 129666/478625 [00:04<00:10, 31811.55it/s][A
+
31%|███ | 146631/478625 [00:04<00:10, 31514.21it/s][A
+
26%|██▌ | 124758/478625 [00:03<00:11, 31651.47it/s][A
+
14%|█▎ | 64959/478625 [00:02<00:12, 31987.57it/s][A
+
79%|███████▉ | 377113/478625 [00:11<00:03, 32834.58it/s][A
+
28%|██▊ | 133434/478625 [00:04<00:10, 32332.13it/s][A
+
28%|██▊ | 132852/478625 [00:04<00:11, 31389.35it/s][A
+
31%|███▏ | 149787/478625 [00:04<00:10, 31144.64it/s][A
+
27%|██▋ | 128023/478625 [00:03<00:10, 31941.70it/s][A
+
14%|█▍ | 68161/478625 [00:02<00:13, 31470.57it/s][A
+
29%|██▊ | 136812/478625 [00:04<00:10, 32718.20it/s][A
+
79%|███████▉ | 380401/478625 [00:11<00:03, 32208.27it/s][A
+
28%|██▊ | 136100/478625 [00:04<00:10, 31677.88it/s][A
+
32%|███▏ | 153087/478625 [00:04<00:10, 31689.99it/s][A
+
27%|██▋ | 131357/478625 [00:04<00:10, 32351.76it/s][A
+
15%|█▍ | 71377/478625 [00:02<00:12, 31672.82it/s][A
+
29%|██▉ | 140183/478625 [00:04<00:10, 33009.71it/s][A
+
80%|████████ | 383768/478625 [00:11<00:02, 32634.65it/s][A
+
29%|██▉ | 139388/478625 [00:04<00:10, 32030.08it/s][A
+
33%|███▎ | 156426/478625 [00:04<00:10, 32191.86it/s][A
+
28%|██▊ | 134596/478625 [00:04<00:10, 31784.11it/s][A
+
16%|█▌ | 74548/478625 [00:02<00:12, 31236.56it/s][A
+
30%|██▉ | 143487/478625 [00:04<00:10, 32403.59it/s][A
+
81%|████████ | 387036/478625 [00:11<00:02, 32223.70it/s][A
+
30%|██▉ | 142594/478625 [00:04<00:10, 31474.37it/s][A
+
33%|███▎ | 159649/478625 [00:05<00:10, 31761.61it/s][A
+
29%|██▉ | 137924/478625 [00:04<00:10, 32220.35it/s][A
+
16%|█▋ | 77851/478625 [00:02<00:12, 31760.94it/s][A
+
31%|███ | 146831/478625 [00:04<00:10, 32707.38it/s][A
+
82%|████████▏ | 390401/478625 [00:11<00:02, 32641.74it/s][A
+
30%|███ | 145870/478625 [00:04<00:10, 31850.98it/s][A
+
34%|███▍ | 162941/478625 [00:05<00:09, 32101.45it/s][A
+
29%|██▉ | 141150/478625 [00:04<00:10, 31694.18it/s][A
+
17%|█▋ | 81143/478625 [00:02<00:12, 32100.66it/s][A
+
82%|████████▏ | 393765/478625 [00:12<00:02, 32936.17it/s][A
+
31%|███▏ | 150105/478625 [00:04<00:10, 32231.96it/s][A
+
31%|███ | 149151/478625 [00:04<00:10, 32132.31it/s][A
+
35%|███▍ | 166154/478625 [00:05<00:09, 31824.32it/s][A
+
30%|███ | 144404/478625 [00:04<00:10, 31941.37it/s][A
+
18%|█▊ | 84356/478625 [00:02<00:12, 31494.69it/s][A
+
32%|███▏ | 153466/478625 [00:04<00:09, 32635.02it/s][A
+
83%|████████▎ | 397062/478625 [00:12<00:02, 32359.55it/s][A
+
32%|███▏ | 152367/478625 [00:04<00:10, 31478.80it/s][A
+
35%|███▌ | 169339/478625 [00:05<00:09, 31326.70it/s][A
+
31%|███ | 147723/478625 [00:04<00:10, 32307.41it/s][A
+
18%|█▊ | 87631/478625 [00:02<00:12, 31861.73it/s][A
+
33%|███▎ | 156850/478625 [00:04<00:09, 32990.42it/s][A
+
84%|████████▎ | 400420/478625 [00:12<00:02, 32715.93it/s][A
+
33%|███▎ | 155668/478625 [00:04<00:10, 31925.34it/s][A
+
36%|███▌ | 172602/478625 [00:05<00:09, 31708.73it/s][A
+
32%|███▏ | 150957/478625 [00:04<00:10, 31705.75it/s][A
+
19%|█▉ | 90821/478625 [00:02<00:12, 31355.97it/s][A
+
33%|███▎ | 160152/478625 [00:04<00:09, 32389.13it/s][A
+
84%|████████▍ | 403695/478625 [00:12<00:02, 32222.93it/s][A
+
33%|███▎ | 158865/478625 [00:05<00:10, 31471.44it/s][A
+
37%|███▋ | 175776/478625 [00:05<00:09, 31382.79it/s][A
+
32%|███▏ | 154274/478625 [00:04<00:10, 32133.18it/s][A
+
20%|█▉ | 94104/478625 [00:02<00:12, 31786.06it/s][A
+
34%|███▍ | 163495/478625 [00:05<00:09, 32694.20it/s][A
+
85%|████████▌ | 407054/478625 [00:12<00:02, 32623.28it/s][A
+
34%|███▍ | 162116/478625 [00:05<00:09, 31773.50it/s][A
+
37%|███▋ | 179043/478625 [00:05<00:09, 31760.25it/s][A
+
33%|███▎ | 157595/478625 [00:04<00:09, 32447.55it/s][A
+
20%|██ | 97322/478625 [00:03<00:11, 31901.46it/s][A
+
86%|████████▌ | 410421/478625 [00:12<00:02, 32931.86it/s][A
+
35%|███▍ | 166768/478625 [00:05<00:09, 32251.59it/s][A
+
35%|███▍ | 165403/478625 [00:05<00:09, 32095.96it/s][A
+
38%|███▊ | 182222/478625 [00:05<00:09, 31375.59it/s][A
+
34%|███▎ | 160843/478625 [00:05<00:10, 31761.43it/s][A
+
21%|██ | 100515/478625 [00:03<00:12, 31444.02it/s][A
+
36%|███▌ | 170104/478625 [00:05<00:09, 32575.84it/s][A
+
86%|████████▋ | 413717/478625 [00:12<00:02, 32246.46it/s][A
+
35%|███▌ | 168616/478625 [00:05<00:09, 31468.88it/s][A
+
39%|███▊ | 185362/478625 [00:05<00:09, 31104.32it/s][A
+
34%|███▍ | 164138/478625 [00:05<00:09, 32107.89it/s][A
+
22%|██▏ | 103777/478625 [00:03<00:11, 31745.90it/s][A
+
36%|███▌ | 173468/478625 [00:05<00:09, 32888.52it/s][A
+
87%|████████▋ | 416947/478625 [00:12<00:01, 32247.54it/s][A
+
36%|███▌ | 171900/478625 [00:05<00:09, 31869.08it/s][A
+
39%|███▉ | 188671/478625 [00:05<00:09, 31685.54it/s][A
+
35%|███▍ | 167353/478625 [00:05<00:09, 31666.68it/s][A
+
22%|██▏ | 107075/478625 [00:03<00:11, 32109.45it/s][A
+
37%|███▋ | 176760/478625 [00:05<00:09, 32392.82it/s][A
+
88%|████████▊ | 420175/478625 [00:12<00:01, 31957.87it/s][A
+
37%|███▋ | 175091/478625 [00:05<00:09, 31371.84it/s][A
+
40%|████ | 191842/478625 [00:06<00:09, 30549.39it/s][A
+
36%|███▌ | 170644/478625 [00:05<00:09, 32029.95it/s][A
+
23%|██▎ | 110289/478625 [00:03<00:11, 31544.40it/s][A
+
38%|███▊ | 180111/478625 [00:05<00:09, 32720.45it/s][A
+
88%|████████▊ | 423547/478625 [00:12<00:01, 32475.52it/s][A
+
37%|███▋ | 178377/478625 [00:05<00:09, 31804.64it/s][A
+
41%|████ | 195061/478625 [00:06<00:09, 31023.82it/s][A
+
36%|███▋ | 173976/478625 [00:05<00:09, 32408.36it/s][A
+
24%|██▎ | 113545/478625 [00:03<00:11, 31841.33it/s][A
+
89%|████████▉ | 426904/478625 [00:13<00:01, 32799.11it/s][A
+
38%|███▊ | 183386/478625 [00:05<00:09, 32044.64it/s][A
+
38%|███▊ | 181649/478625 [00:05<00:09, 32072.27it/s][A
+
41%|████▏ | 198176/478625 [00:06<00:09, 31059.69it/s][A
+
37%|███▋ | 177220/478625 [00:05<00:09, 31809.78it/s][A
+
24%|██▍ | 116733/478625 [00:03<00:11, 31365.82it/s][A
+
90%|████████▉ | 430187/478625 [00:13<00:01, 32300.82it/s][A
+
39%|███▉ | 186758/478625 [00:05<00:08, 32534.22it/s][A
+
39%|███▊ | 184860/478625 [00:05<00:09, 31306.49it/s][A
+
42%|████▏ | 201288/478625 [00:06<00:08, 30917.45it/s][A
+
38%|███▊ | 180488/478625 [00:05<00:09, 32063.84it/s][A
+
25%|██▌ | 120023/478625 [00:03<00:11, 31813.66it/s][A
+
91%|█████████ | 433571/478625 [00:13<00:01, 32751.90it/s][A
+
+[interleaved tqdm progress bars from the eight dataloader ranks elided throughout this dataset-loading phase: each rank scans the 478625-entry index in ~15 s at ~32k it/s]
+09/05/2024 19:42:26 - INFO - opensora.dataset.t2v_datasets - no_cap: 0, too_long: 3711, too_short: 2, no_resolution: 0, resolution_mismatch: 0, Counter(sample_size): Counter({'93x160x320': 84930, '29x160x320': 73201, '45x160x320': 68295, '61x160x320': 44578, '77x160x320': 38630, '93x128x320': 17805, '29x128x320': 16948, '93x224x320': 16403, '93x192x320': 15259, '45x128x320': 14788, '61x128x320': 9795, '29x224x320': 8615, '29x192x320': 8528, '45x224x320': 8477, '45x192x320': 8309, '77x128x320': 7730, '61x224x320': 6211, '61x192x320': 5983, '77x224x320': 5788, '77x192x320': 5268, '93x256x320': 3164, '45x256x320': 1510, '29x256x320': 1480, '61x256x320': 1152, '77x256x320': 1090, '93x96x320': 282, '45x96x320': 200, '29x96x320': 169, '61x96x320': 163, '77x96x320': 148}), cnt_movie: 0, cnt_img: 0, before filter: 478625, after filter: 474899
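The `Counter(sample_size)` line above tallies how many clips land in each `frames x height x width` bucket after caption and resolution filtering. A minimal sketch of that bookkeeping, with hypothetical field names standing in for the real logic in `opensora.dataset.t2v_datasets`:

```python
from collections import Counter

# Hypothetical metadata entries; the real index carries caption, resolution
# and frame-count fields per clip.
dataset_index = [
    {"cap": "a dog runs", "num_frames": 93, "height": 160, "width": 320},
    {"cap": "",           "num_frames": 29, "height": 160, "width": 320},  # dropped: no_cap
    {"cap": "a cat sits", "num_frames": 29, "height": 160, "width": 320},
]

def bucket_key(num_frames, height, width):
    # Keys look like '93x160x320': frames x height x width.
    return f"{num_frames}x{height}x{width}"

sample_size = Counter()
filtered = []
for item in dataset_index:
    if not item["cap"]:            # counted as no_cap in the log line above
        continue
    filtered.append(item)
    sample_size[bucket_key(item["num_frames"], item["height"], item["width"])] += 1

print(f"before filter: {len(dataset_index)}, after filter: {len(filtered)}")
print(sample_size)                 # Counter({'93x160x320': 1, '29x160x320': 1})
```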
+09/05/2024 19:42:26 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | motion_score: 474899, cnt_no_motion: 13 | 192077 > 0.95, 0.7 > 65730 Mean: 0.8593367888417824, Var: 0.03075349223473551, Std: 0.17536673639757203, Min: -0.0717548280954361, Max: 1.0
+09/05/2024 19:42:27 - INFO - opensora.dataset.t2v_datasets - before filter: 478625, after filter: 474899 | aesthetic_score: 478625, cnt_no_aesthetic: 0 | 14374 > 5.75, 4.5 > 113830 Mean: 4.846693657797633, Var: 0.24147353645946146, Std: 0.4913995690468821, Min: 2.685077953338623, Max: 6.742257436116536
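Both lines report the same summary over per-clip scores: counts above and below two thresholds (0.95/0.7 for motion, 5.75/4.5 for aesthetics, taken from the log) plus mean/variance/std/min/max. A sketch of producing such a report; the function itself is an assumption, not the repo's code:

```python
import statistics

def score_report(scores, hi, lo):
    """Summarize scores the way the t2v_datasets log lines do (a sketch)."""
    n_hi = sum(s > hi for s in scores)
    n_lo = sum(s < lo for s in scores)
    mean = statistics.fmean(scores)
    var = statistics.pvariance(scores)   # population variance; note Std**2 == Var
    return (f"{n_hi} > {hi}, {lo} > {n_lo} "
            f"Mean: {mean}, Var: {var}, Std: {var ** 0.5}, "
            f"Min: {min(scores)}, Max: {max(scores)}")

print(score_report([0.2, 0.8, 0.96, 0.99], hi=0.95, lo=0.7))   # motion thresholds
```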
+time 21.951740264892578
+n_elements: 474899
+data length: 474899
+09/05/2024 19:42:27 - INFO - __main__ - after train_dataloader
+09/05/2024 19:42:27 - INFO - __main__ - before accelerator.prepare
+[2024-09-05 19:42:27,082] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.6, git-hash=unknown, git-branch=unknown
+[tqdm progress bars from the remaining ranks elided; every rank completes its 478625-entry scan in ~15 s and its 1/1 outer loop in 21-22 s, landing on the same filtered dataset]
+time 22.34464454650879
+n_elements: 474899
+data length: 474899
+time 21.991446256637573
+n_elements: 474899
+data length: 474899
+time 22.311201095581055
+n_elements: 474899
+data length: 474899
+time 22.154942750930786
+n_elements: 474899
+data length: 474899
+time 22.4544620513916
+n_elements: 474899
+data length: 474899
+[2024-09-05 19:42:54,936] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2024-09-05 19:42:54,944] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2024-09-05 19:42:54,944] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+[2024-09-05 19:42:55,093] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
+[2024-09-05 19:42:55,093] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class '...NewCls'>
+[2024-09-05 19:42:55,093] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
+[2024-09-05 19:42:55,093] [INFO] [stage_1_and_2.py:173:__init__] Reduce bucket size 536870912
+[2024-09-05 19:42:55,093] [INFO] [stage_1_and_2.py:174:__init__] Allgather bucket size 536870912
+[2024-09-05 19:42:55,093] [INFO] [stage_1_and_2.py:175:__init__] CPU Offload: False
+[2024-09-05 19:42:55,093] [INFO] [stage_1_and_2.py:176:__init__] Round robin gradient partitioning: False
+zp rank is 0, zp_size=8
+zp rank is 5, zp_size=8
+zp rank is 6, zp_size=8
+zp rank is 1, zp_size=8
+zp rank is 4, zp_size=8
+zp rank is 7, zp_size=8
+zp rank is 2, zp_size=8
+zp rank is 3, zp_size=8
+Traceback (most recent call last):
+  File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 935, in <module>
+    main(args)
+  File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 766, in main
+    train_one_epoch()
+  File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 739, in train_one_epoch
+    for step, data_item in enumerate(train_dataloader):
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 449, in __iter__
+    dataloader_iter = super().__iter__()
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 438, in __iter__
+    return self._get_iterator()
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 386, in _get_iterator
+    return _MultiProcessingDataLoaderIter(self)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/utils/module.py", line 351, in mpdl_iter_init
+    origin_mpdl_iter_init(self, *args, **kwargs)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1039, in __init__
+    w.start()
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 121, in start
+    self._popen = self._Popen(self)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/context.py", line 224, in _Popen
+    return _default_context.get_context().Process._Popen(process_obj)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/context.py", line 284, in _Popen
+    return Popen(process_obj)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__
+    super().__init__(process_obj)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__
+    self._launch(process_obj)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch
+    reduction.dump(process_obj, fp)
+  File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/reduction.py", line 60, in dump
+    ForkingPickler(file, protocol).dump(obj)
+_pickle.PicklingError: Can't pickle <function <lambda> at 0xffff1ce4d550>: attribute lookup <lambda> on opensora.models.causalvideovae failed
+[six more identical PicklingError tracebacks, raised as the remaining ranks spawn their dataloader workers, elided; only the lambda addresses differ (0xfffefb366a60, 0xfffef4c2b820, 0xffff00b67ca0, 0xffff060eb820, 0xfffef0e4e5e0, 0xffff240ae790)]
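The failure mode is the classic "lambda in a Dataset under spawn" one: `torch_npu` patches `_MultiProcessingDataLoaderIter.__init__` (the `mpdl_iter_init` frame), the workers are started via `popen_spawn_posix`, spawn pickles the dataset object, and a lambda pickles by qualified name, which is the unresolvable `<lambda>`. A minimal reproduction and the usual fix, with a hypothetical transform standing in for the real lambda in `opensora.models.causalvideovae`:

```python
import pickle

# Lambdas pickle by reference: pickle records (module, __qualname__), and a
# lambda's __qualname__ is "<lambda>", which cannot be looked up on import.
normalize = lambda x: 2.0 * x - 1.0          # hypothetical stand-in transform
try:
    pickle.dumps(normalize)
except pickle.PicklingError as err:
    print(err)   # Can't pickle <function <lambda> ...>: attribute lookup <lambda> on __main__ failed

# Fix: give the callable an importable name (a functools.partial over a named
# function also works), so every spawned worker can resolve it.
def normalize_fn(x):
    return 2.0 * x - 1.0

restored = pickle.loads(pickle.dumps(normalize_fn))
assert restored(1.0) == 1.0
```

Setting `num_workers=0` also sidesteps the spawn/pickle path entirely, at the cost of loading data in the main process.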
+[2024-09-05 19:43:00,342] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
+[2024-09-05 19:43:00,343] [INFO] [utils.py:792:see_memory_usage] MA 17.67 GB Max_MA 18.33 GB CA 18.7 GB Max_CA 19 GB
+[2024-09-05 19:43:00,344] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 195.08 GB, percent = 12.9%
+[2024-09-05 19:43:02,243] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
+[2024-09-05 19:43:02,245] [INFO] [utils.py:792:see_memory_usage] MA 20.3 GB Max_MA 24.24 GB CA 25.27 GB Max_CA 25 GB
+[2024-09-05 19:43:02,245] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 148.04 GB, percent = 9.8%
+[2024-09-05 19:43:02,245] [INFO] [stage_1_and_2.py:552:__init__] optimizer state initialized
+[2024-09-05 19:43:04,101] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
+[2024-09-05 19:43:04,102] [INFO] [utils.py:792:see_memory_usage] MA 20.3 GB Max_MA 20.3 GB CA 25.27 GB Max_CA 25 GB
+[2024-09-05 19:43:04,102] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 101.25 GB, percent = 6.7%
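A back-of-envelope reading of the memory jump above (MA 17.67 GB before optimizer-state init, 20.3 GB after), assuming AdamW keeps fp32 master weights plus two fp32 moments (12 bytes/param) partitioned over the 8 ZeRO-2 ranks; this is a rough order-of-magnitude sketch, not a figure from the log:

```python
ranks = 8
delta_ma_gb = 20.3 - 17.67                 # MA growth attributed to optimizer states
bytes_per_param_per_rank = 12 / ranks      # 1.5 bytes/param under ZeRO stage 2
params_estimate = delta_ma_gb * 1024**3 / bytes_per_param_per_rank
print(f"~{params_estimate / 1e9:.1f}B parameters (rough estimate)")
```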
+[2024-09-05 19:43:04,110] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
+[2024-09-05 19:43:04,111] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
+[2024-09-05 19:43:04,111] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
+[2024-09-05 19:43:04,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[(0.9, 0.999)]
+[2024-09-05 19:43:04,114] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
+[2024-09-05 19:43:04,114] [INFO] [config.py:988:print] activation_checkpointing_config {
+ "partition_activations": false,
+ "contiguous_memory_optimization": false,
+ "cpu_checkpointing": false,
+ "number_checkpoints": null,
+ "synchronize_checkpoint_boundary": false,
+ "profile": false
+}
+[2024-09-05 19:43:04,114] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2024-09-05 19:43:04,114] [INFO] [config.py:988:print] amp_enabled .................. False
+[2024-09-05 19:43:04,114] [INFO] [config.py:988:print] amp_params ................... False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] autotuning_config ............ {
+ "enabled": false,
+ "start_step": null,
+ "end_step": null,
+ "metric_path": null,
+ "arg_mappings": null,
+ "metric": "throughput",
+ "model_info": null,
+ "results_dir": "autotuning_results",
+ "exps_dir": "autotuning_exps",
+ "overwrite": true,
+ "fast": true,
+ "start_profile_step": 3,
+ "end_profile_step": 5,
+ "tuner_type": "gridsearch",
+ "tuner_early_stopping": 5,
+ "tuner_num_trials": 50,
+ "model_info_path": null,
+ "mp_size": 1,
+ "max_train_batch_size": null,
+ "min_train_batch_size": 1,
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+ "min_train_micro_batch_size_per_gpu": 1,
+ "num_tuning_micro_batch_sizes": 3
+}
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] bfloat16_enabled ............. True
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] comms_config .................
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] communication_data_type ...... torch.float32
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] curriculum_params_legacy ..... False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] data_efficiency_enabled ...... False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] dataloader_drop_last ......... False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] disable_allgather ............ False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] dump_state ................... False
+[2024-09-05 19:43:04,115] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_enabled ........... False
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] eigenvalue_verbose ........... False
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] elasticity_enabled ........... False
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] flops_profiler_config ........ {
+ "enabled": false,
+ "recompute_fwd_factor": 0.0,
+ "profile_step": 1,
+ "module_depth": -1,
+ "top_modules": 1,
+ "detailed": true,
+ "output_file": null
+}
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] fp16_auto_cast ............... None
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] fp16_enabled ................. False
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] global_rank .................. 0
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] grad_accum_dtype ............. None
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] gradient_accumulation_steps .. 1
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] gradient_clipping ............ 1.0
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] graph_harvesting ............. False
+[2024-09-05 19:43:04,116] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] load_universal_checkpoint .... False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] loss_scale ................... 1.0
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] memory_breakdown ............. False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] mics_hierarchial_params_gather False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] mics_shard_size .............. -1
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] nebula_config ................ {
+ "enabled": false,
+ "persistent_storage_path": null,
+ "persistent_time_interval": 100,
+ "num_of_version_in_retention": 2,
+ "enable_nebula_load": true,
+ "load_path": null
+}
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] optimizer_name ............... None
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] optimizer_params ............. None
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] pld_enabled .................. False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] pld_params ................... False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] prescale_gradients ........... False
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] scheduler_name ............... None
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] scheduler_params ............. None
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32
+[2024-09-05 19:43:04,117] [INFO] [config.py:988:print] sparse_attention ............. None
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] steps_per_print .............. inf
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] train_batch_size ............. 8
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 1
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] use_node_local_storage ....... False
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] wall_clock_breakdown ......... False
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] weight_quantization_config ... None
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] world_size ................... 8
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] zero_allow_untested_optimizer True
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=536870912 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=536870912 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] zero_enabled ................. True
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True
+[2024-09-05 19:43:04,118] [INFO] [config.py:988:print] zero_optimization_stage ...... 2
+[2024-09-05 19:43:04,118] [INFO] [config.py:974:print_user_config] json = {
+ "fp16": {
+ "enabled": false,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": true
+ },
+ "communication_data_type": "fp32",
+ "gradient_clipping": 1.0,
+ "train_micro_batch_size_per_gpu": 1,
+ "train_batch_size": 8,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {
+ "stage": 2,
+ "overlap_comm": true,
+ "allgather_bucket_size": 5.368709e+08,
+ "contiguous_gradients": true,
+ "reduce_bucket_size": 5.368709e+08
+ },
+ "steps_per_print": inf,
+ "zero_allow_untested_optimizer": true
+}
+09/05/2024 19:43:04 - INFO - __main__ - after accelerator.prepare
+09/05/2024 19:43:04 - INFO - __main__ - init trackers...
+wandb: Currently logged in as: pkuhxy (pkuhxy-Peking University). Use `wandb login --relogin` to force relogin
+wandb: - Waiting for wandb.init()...
+wandb: \ Waiting for wandb.init()...
+wandb: wandb version 0.17.8 is available! To upgrade, please run:
+wandb: $ pip install wandb --upgrade
+wandb: Tracking run with wandb version 0.16.3
+wandb: Run data is saved locally in /home/image_data/hxy/Open-Sora-Plan/wandb/run-20240905_194309-hhdazhf4
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run exalted-morning-5
+wandb: ⭐️ View project at https://wandb.ai/pkuhxy-Peking%20University/inpaint_93x320x320_stage1_swap
+wandb: 🚀 View run at https://wandb.ai/pkuhxy-Peking%20University/inpaint_93x320x320_stage1_swap/runs/hhdazhf4
+09/05/2024 19:43:11 - INFO - __main__ - ***** Running training *****
+09/05/2024 19:43:11 - INFO - __main__ - Model = DeepSpeedEngine(
+ (module): OpenSoraInpaint(
+ (pos_embed): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (transformer_blocks): ModuleList(
+ (0-31): 32 x BasicTransformerBlock(
+ (norm1): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn1): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (norm2): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (attn2): Attention(
+ (to_q): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_k): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_v): Linear(in_features=2304, out_features=2304, bias=True)
+ (to_out): ModuleList(
+ (0): Linear(in_features=2304, out_features=2304, bias=True)
+ (1): Dropout(p=0.0, inplace=False)
+ )
+ )
+ (ff): FeedForward(
+ (net): ModuleList(
+ (0): GELU(
+ (proj): Linear(in_features=2304, out_features=9216, bias=True)
+ )
+ (1): Dropout(p=0.0, inplace=False)
+ (2): Linear(in_features=9216, out_features=2304, bias=True)
+ )
+ )
+ )
+ )
+ (norm_out): LayerNorm((2304,), eps=1e-06, elementwise_affine=False)
+ (proj_out): Linear(in_features=2304, out_features=32, bias=True)
+ (adaln_single): AdaLayerNormSingle(
+ (emb): PixArtAlphaCombinedTimestepSizeEmbeddings(
+ (time_proj): Timesteps()
+ (timestep_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (caption_projection): PixArtAlphaTextProjection(
+ (linear_1): Linear(in_features=4096, out_features=2304, bias=True)
+ (act_1): GELU(approximate='tanh')
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ (motion_projection): MotionAdaLayerNormSingle(
+ (emb): MotionEmbeddings(
+ (motion_proj): Timesteps()
+ (motion_embedder): TimestepEmbedding(
+ (linear_1): Linear(in_features=256, out_features=2304, bias=True)
+ (act): SiLU()
+ (linear_2): Linear(in_features=2304, out_features=2304, bias=True)
+ )
+ )
+ (silu): SiLU()
+ (linear): Linear(in_features=2304, out_features=13824, bias=True)
+ )
+ (pos_embed_mask): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(4, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ (pos_embed_masked_hidden_states): ModuleList(
+ (0): PatchEmbed2D(
+ (proj): Conv2d(8, 2304, kernel_size=(2, 2), stride=(2, 2))
+ )
+ (1): Linear(in_features=2304, out_features=2304, bias=False)
+ )
+ )
+)
+09/05/2024 19:43:11 - INFO - __main__ - Num examples = 474899
+09/05/2024 19:43:11 - INFO - __main__ - Num Epochs = 17
+09/05/2024 19:43:11 - INFO - __main__ - Instantaneous batch size per device = 1
+09/05/2024 19:43:11 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8
+09/05/2024 19:43:11 - INFO - __main__ - Gradient Accumulation steps = 1
+09/05/2024 19:43:11 - INFO - __main__ - Total optimization steps = 1000000
+09/05/2024 19:43:11 - INFO - __main__ - Total optimization steps (num_update_steps_per_epoch) = 59362
+09/05/2024 19:43:11 - INFO - __main__ - Total trainable parameters = 2.8204808 B
+
+Steps: 0%| | 0/1000000 [00:00<?, ?it/s]
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 935, in <module>
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 766, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 739, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 449, in __iter__
+ dataloader_iter = super().__iter__()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 438, in __iter__
+ return self._get_iterator()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 386, in _get_iterator
+ return _MultiProcessingDataLoaderIter(self)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/utils/module.py", line 351, in mpdl_iter_init
+ origin_mpdl_iter_init(self, *args, **kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1039, in __init__
+ w.start()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 121, in start
+ self._popen = self._Popen(self)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/context.py", line 224, in _Popen
+ return _default_context.get_context().Process._Popen(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/context.py", line 284, in _Popen
+ return Popen(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__
+ super().__init__(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__
+ self._launch(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch
+ reduction.dump(process_obj, fp)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/reduction.py", line 60, in dump
+ ForkingPickler(file, protocol).dump(obj)
+_pickle.PicklingError: Can't pickle <function <lambda> at 0xffff303c7e50>: attribute lookup <lambda> on opensora.models.causalvideovae failed
+Traceback (most recent call last):
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 935, in
+ main(args)
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 766, in main
+ train_one_epoch()
+ File "/home/image_data/hxy/Open-Sora-Plan/opensora/train/train_inpaint.py", line 739, in train_one_epoch
+ for step, data_item in enumerate(train_dataloader):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/data_loader.py", line 449, in __iter__
+ dataloader_iter = super().__iter__()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 438, in __iter__
+ return self._get_iterator()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 386, in _get_iterator
+ return _MultiProcessingDataLoaderIter(self)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch_npu/utils/module.py", line 351, in mpdl_iter_init
+ origin_mpdl_iter_init(self, *args, **kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1039, in __init__
+ w.start()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 121, in start
+ self._popen = self._Popen(self)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/context.py", line 224, in _Popen
+ return _default_context.get_context().Process._Popen(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/context.py", line 284, in _Popen
+ return Popen(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__
+ super().__init__(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__
+ self._launch(process_obj)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch
+ reduction.dump(process_obj, fp)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/reduction.py", line 60, in dump
+ ForkingPickler(file, protocol).dump(obj)
+_pickle.PicklingError: Can't pickle <function <lambda> at 0xffff303c7e50>: attribute lookup <lambda> on opensora.models.causalvideovae failed
+wandb: - 0.004 MB of 0.004 MB uploaded
+wandb: \ 0.004 MB of 0.004 MB uploaded
+wandb: | 0.004 MB of 0.004 MB uploaded
+wandb: / 0.004 MB of 0.004 MB uploaded
+wandb: - 0.004 MB of 0.004 MB uploaded
+wandb: \ 0.004 MB of 0.004 MB uploaded
+[2024-09-05 19:43:18,905] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 398 closing signal SIGTERM
+[2024-09-05 19:43:18,905] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 400 closing signal SIGTERM
+[2024-09-05 19:43:18,905] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 402 closing signal SIGTERM
+wandb: | 0.004 MB of 0.004 MB uploaded
+[2024-09-05 19:43:19,870] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 1 (pid: 399) of binary: /home/ma-user/anaconda3/envs/PyTorch-2.1.0/bin/python3.9
+Traceback (most recent call last):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/bin/accelerate", line 8, in
+ sys.exit(main())
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 46, in main
+ args.func(args)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/commands/launch.py", line 1042, in launch_command
+ deepspeed_launcher(args)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/accelerate/commands/launch.py", line 754, in deepspeed_launcher
+ distrib_run.run(args)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/distributed/run.py", line 797, in run
+ elastic_launch(
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
+ raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+opensora/train/train_inpaint.py FAILED
+------------------------------------------------------------
+Failures:
+[1]:
+ time : 2024-09-05_19:43:18
+ host : bms-sora-910b-0003
+ rank : 3 (local_rank: 3)
+ exitcode : 1 (pid: 401)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[2]:
+ time : 2024-09-05_19:43:18
+ host : bms-sora-910b-0003
+ rank : 5 (local_rank: 5)
+ exitcode : 1 (pid: 403)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[3]:
+ time : 2024-09-05_19:43:18
+ host : bms-sora-910b-0003
+ rank : 6 (local_rank: 6)
+ exitcode : 1 (pid: 404)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[4]:
+ time : 2024-09-05_19:43:18
+ host : bms-sora-910b-0003
+ rank : 7 (local_rank: 7)
+ exitcode : 1 (pid: 405)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+ time : 2024-09-05_19:43:18
+ host : bms-sora-910b-0003
+ rank : 1 (local_rank: 1)
+ exitcode : 1 (pid: 399)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+Process ForkServerProcess-7:
+Process ForkServerProcess-8:
+Process ForkServerProcess-4:
+Process ForkServerProcess-9:
+Process ForkServerProcess-3:
+Process ForkServerProcess-2:
+Process ForkServerProcess-6:
+Process ForkServerProcess-5:
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+Traceback (most recent call last):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+Traceback (most recent call last):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "", line 2, in get
+ File "", line 2, in get
+ File "", line 2, in get
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "", line 2, in get
+EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+EOFError
+EOFError
+EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+EOFError
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+EOFError
+Traceback (most recent call last):
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
+ self.run()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/process.py", line 108, in run
+ self._target(*self._args, **self._kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
+ raise exp
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
+ func(*args, **kwargs)
+ File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
+ key, func_name, detail = resource_proxy[TASK_QUEUE].get()
+ File "", line 2, in get
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
+ kind, result = conn.recv()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 255, in recv
+ buf = self._recv_bytes()
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
+ buf = self._recv(4)
+ File "/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/connection.py", line 388, in _recv
+ raise EOFError
+EOFError
+wandb: / 0.020 MB of 0.039 MB uploaded (0.011 MB deduped)
+/home/ma-user/anaconda3/envs/PyTorch-2.1.0/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 42 leaked semaphore objects to clean up at shutdown
+ warnings.warn('resource_tracker: There appear to be %d '
+wandb: - 0.029 MB of 0.039 MB uploaded (0.011 MB deduped)
+wandb: \ 0.039 MB of 0.039 MB uploaded (0.011 MB deduped)
+wandb: 🚀 View run exalted-morning-5 at: https://wandb.ai/pkuhxy-Peking%20University/inpaint_93x320x320_stage1_swap/runs/hhdazhf4
+wandb: ️⚡ View job at https://wandb.ai/pkuhxy-Peking%20University/inpaint_93x320x320_stage1_swap/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjQyMDM4Nzg0MA==/version_details/v1
+wandb: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)
+wandb: Find logs at: ./wandb/run-20240905_194309-hhdazhf4/logs
diff --git a/new_validation/prompt.txt b/new_validation/prompt.txt
new file mode 100644
index 000000000..f576ff203
--- /dev/null
+++ b/new_validation/prompt.txt
@@ -0,0 +1,16 @@
+A data scientist sits in front of a computer, with the screen showing the training process of a machine learning model. A whiteboard next to them is covered with mathematical formulas and flowcharts. Several books on artificial intelligence are scattered on the desk. The setting is a bright, modern office.
+In a high-tech production workshop, automated assembly lines operate in an orderly manner. Multiple robotic arms are neatly arranged along the production line, performing various precise operations. On one side of the workshop, high-tech display screens show real-time production data and monitoring charts, glowing with blue and orange lights. The entire workshop is brightly lit, with a clean, reflective floor that mirrors the lights and shadows of the machines. Yellow pipes are arranged along the ceiling, enhancing the industrial and technological atmosphere.
+A futuristic control room with large screens displaying real-time monitoring data and analytical charts of the power grid. The control consoles are filled with various high-tech devices, and technicians are busily operating and monitoring the system. The room is bathed in blue light, emphasizing a high-tech environment.
+Under the night sky, a rocket ignites on the launch pad, with intense flames and thick smoke erupting from its base, illuminating the surrounding launch structure and sky. The launch towers on either side are clearly visible, and the night sky is brightened by the rocket's fiery glow and billowing smoke. Ground vegetation stands out in the light of the flames. The entire scene is filled with a sense of awe-inspiring power and technological grandeur.
+An astronaut stands at the observation window of a spaceship, gazing out at Earth. The astronaut, dressed in a full spacesuit, is facing away from the camera, looking out at the stunning view of Earth. The blue oceans, white clouds, and continents are vividly visible, illuminated by the sunlight. The dark and mysterious space outside is dotted with countless stars, while the distant sun shines brightly, casting a holy glow through the window into the spaceship. The interior of the spaceship is filled with high-tech equipment and control panels, creating a scene rich in exploration and futuristic atmosphere.
+A powerful fleet sails across the vast ocean, led by a massive aircraft carrier in the foreground, cutting through the waves with dynamic force. The setting sun casts a golden glow on the carrier's hull, reflecting a brilliant light. The carrier's deck is lined with several fighter jets, and the water on either side churns into white foam, emphasizing the motion. In the background, multiple escort ships, including frigates and destroyers, follow in a precise formation. The sky is filled with vibrant clouds, with the sun slowly dipping below the horizon, adding to the sense of grandeur and movement.
+A high-speed train speeds through a cityscape, its sleek, silver body gliding smoothly along the tracks. The train's streamlined design highlights its incredible speed. In the background, the modern city skyline features towering skyscrapers, with the iconic Oriental Pearl Tower standing out prominently under the glow of the setting sun. The sun casts a golden hue over the buildings and the train, adding a warm tone to the scene. The tracks blur due to the train's velocity, emphasizing the sense of motion and dynamic energy as it races forward.
+In a high-tech laboratory, an Asian woman in a blue lab coat, hairnet, and gloves is intently focused on a high-tech screen in front of her. The screen displays complex charts and data, glowing faintly blue. The woman gently blinks, showing her meticulous attention to detail. In the background, another lab technician is also busy operating and monitoring equipment. The cool-toned lighting and advanced equipment in the lab create a futuristic and technological ambiance.
+In a high-tech production workshop, several robotic arms are methodically operating along the production line. The robotic arms are precisely assembling electronic components, moving with fluid and coordinated motions. The workstations are filled with various intricate devices. In the background, more robotic arms can be seen busily performing their tasks. The workshop is illuminated with cool blue lighting, enhancing the futuristic and efficient atmosphere. The robotic arms move seamlessly, demonstrating advanced automation and technology in action.
+A rocket lifts off from the launch pad in a spectacular display. Intense flames and thick smoke erupt from the rocket's base, illuminating the surrounding launch structure. As the rocket ascends, dense clouds of smoke quickly spread out. The backdrop features the soft hues of sunset, with the sky transitioning from orange to purple. The launch towers and support structures are clearly visible, adding depth and a sense of advanced technology to the scene. The powerful liftoff captures the dynamic energy and awe-inspiring moment of space exploration.
+A close-up shot of an astronaut in space, wearing a full spacesuit. The transparent visor of the helmet is clearly visible, reflecting the vastness of space and the Earth below. The astronaut's expression is focused and filled with awe, eyes glimmering with curiosity and a spirit of exploration. The background features the stunning view of Earth, with its blue surface dotted with orange city lights. Distant stars twinkle in the dark expanse of space, adding a sense of mystery and grandeur to the scene.
+The video depicts a modern train station where two high-speed trains are parked side by side at the platform. The station's design is contemporary, featuring a transparent arched roof that allows natural light to flood the spacious area. The trains have sleek, streamlined exteriors with shiny silver bodies, showcasing a sense of speed and futurism. The platform is clean and orderly, with the trains reflecting each other, creating a symmetrical aesthetic. In the background, more trains and passengers can be seen, enhancing the technological and modern transportation atmosphere of the scene.
+The video showcases a vast outdoor scene, with a prominent large satellite dish as the main element. The satellite dish stands tall on a green meadow, against a backdrop of a clear blue sky filled with scattered white clouds. The intricate structure of the satellite dish reflects sunlight, making it stand out. In the distance, rows of blue solar panels add to the technological feel. Further on the horizon, a skyline of modern skyscrapers can be seen, creating a blend of nature and technology. The entire scene exudes a sense of harmony between natural beauty and advanced scientific progress.
+The video depicts a satellite orbiting the Earth. The satellite's intricate details are clearly visible, including large solar panels and various antennas. The background showcases a breathtaking view of Earth, with its blue oceans, white clouds, and the twinkling lights of cities at night. The distant starry sky adds depth and a sense of mystery to the scene. The entire image exudes a futuristic and technological atmosphere, highlighting humanity's achievements in space exploration.
+The video depicts a futuristic and high-tech power transmission scene. Multiple geometrically arranged power towers emit a blue glow and are connected by orange power lines, forming an extensive grid. The ground features a grid pattern of glowing lines, complementing the light from the power towers and enhancing the technological and futuristic feel of the scene. The background is a deep blue night sky, making the illuminated power towers and lines stand out prominently.
+The video depicts a spectacular scene of three fighter jets flying in formation over the ocean. The jets are arranged in a V-formation, with the central jet leading and the two side jets slightly behind. The fighter jets have sleek, silver-gray bodies, highlighting their powerful performance and sense of speed. The ocean below glistens with sunlight, and the jets leave white contrails in their wake, enhancing the dynamic feel of the scene. Sunlight reflects off the jets and the water, creating a dazzling effect. The overall image exudes a sense of power and intensity.
\ No newline at end of file
diff --git a/opensora/acceleration/__init__.py b/opensora/acceleration/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/acceleration/communications.py b/opensora/acceleration/communications.py
new file mode 100644
index 000000000..6f69a73a0
--- /dev/null
+++ b/opensora/acceleration/communications.py
@@ -0,0 +1,133 @@
+import torch
+import torch.distributed as dist
+from einops import rearrange
+from opensora.acceleration.parallel_states import hccl_info, lccl_info, enable_LCCL
+try:
+ from lcalib.functional import lcal_all2allvc
+except ImportError:
+ lcal_all2allvc = None
+
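+# Note: ranks are assumed to be grouped into contiguous blocks of sp_size, so
+# the first rank of the local sequence-parallel group is rank // sp_size * sp_size;
+# broadcast() sends `input_` from that rank to the rest of its group.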
+def broadcast(input_: torch.Tensor):
+ sp_size = hccl_info.world_size
+ src = hccl_info.rank // sp_size * sp_size
+ dist.broadcast(input_, src=src, group=hccl_info.group)
+
+_COUNT = 0
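+# _all_to_all splits `input_` into sp_size chunks along scatter_dim, exchanges
+# one chunk with every rank in the group, and concatenates the received chunks
+# along gather_dim. Shape sketch (hypothetical sizes): with sp_size=2,
+# scatter_dim=2 and gather_dim=1, a (B, S, H, D) input on each rank becomes
+# (B, 2*S, H/2, D).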
+def _all_to_all(
+ input_: torch.Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+):
+ group = hccl_info.group
+ sp_size = hccl_info.world_size
+ input_list = [t.contiguous() for t in torch.tensor_split(input_, sp_size, scatter_dim)]
+ output_list = [torch.empty_like(input_list[0]) for _ in range(sp_size)]
+ dist.all_to_all(output_list, input_list, group=group)
+ return torch.cat(output_list, dim=gather_dim).contiguous()
+
+def _single_all_to_all(
+ input_: torch.Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+ enable_HCCL=False,
+):
+ if enable_LCCL:
+ sp_size = lccl_info.world_size
+ else:
+ sp_size = hccl_info.world_size
+ inp_shape = list(input_.shape)
+ inp_shape[scatter_dim] = inp_shape[scatter_dim] // sp_size
+ if scatter_dim < 1:
+ input_t = input_.reshape(
+ [sp_size, inp_shape[scatter_dim]] + \
+ inp_shape[scatter_dim + 1:]
+ )
+ else:
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
+ input_t = input_.reshape(
+ [-1, sp_size, inp_shape[scatter_dim]] + \
+ inp_shape[scatter_dim + 1:]
+ ).transpose(0, 1).contiguous()
+
+ output = torch.empty_like(input_t)
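+ # lcal_all2allvc appears to take an explicit per-pair element-count matrix;
+ # the uniform matrix built here makes it equivalent to the
+ # dist.all_to_all_single fallback below.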
+ if enable_LCCL and not enable_HCCL:
+ matrix_count = torch.ones([sp_size, sp_size], dtype=torch.int64, device=input_t.device) * (
+ input_t.numel() // sp_size)
+ lcal_all2allvc(input_t, output, matrix_count, lccl_info.group)
+ else:
+ dist.all_to_all_single(output, input_t, group=hccl_info.group)
+ # if scattering the seq-dim, transpose the heads back to the original dimension
+ if scatter_dim < 1:
+ output = output.transpose(0, 1).contiguous()
+
+ return output.reshape(
+ inp_shape[: gather_dim] + [inp_shape[gather_dim] * sp_size, ] + inp_shape[gather_dim + 1:])
+
+
+class _AllToAll(torch.autograd.Function):
+ """All-to-all communication.
+
+ Args:
+ input_: input tensor
+ scatter_dim: dimension to scatter along
+ gather_dim: dimension to gather along
+ all_to_all_func: all-to-all implementation to apply (e.g. _all_to_all or _single_all_to_all)
+ """
+
+ @staticmethod
+ def forward(ctx, input_, scatter_dim, gather_dim, all_to_all_func):
+ ctx.scatter_dim = scatter_dim
+ ctx.gather_dim = gather_dim
+ ctx.all_to_all = all_to_all_func
+ output = ctx.all_to_all(input_, scatter_dim, gather_dim)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad_output = ctx.all_to_all(
+ grad_output,
+ ctx.gather_dim,
+ ctx.scatter_dim,
+ )
+ return (
+ grad_output,
+ None,
+ None,
+ None,
+ )
+
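+# Autograd wrappers: the backward pass of an all-to-all is another all-to-all
+# with the scatter and gather dimensions swapped, which is exactly what
+# _AllToAll.backward does, so gradients flow through these collectives.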
+def all_to_all_SBH(
+ input_: torch.Tensor,
+ scatter_dim: int = 1,
+ gather_dim: int = 0,
+):
+ return _AllToAll.apply(input_, scatter_dim, gather_dim, _single_all_to_all)
+
+def all_to_all_BSND(
+ input_: torch.Tensor,
+ scatter_dim: int = 2,
+ gather_dim: int = 1,
+):
+ return _AllToAll.apply(input_, scatter_dim, gather_dim, _all_to_all)
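+# Usage sketch (assumed shapes, DeepSpeed-Ulysses style): with q/k/v laid out as
+# (batch, seq_local, heads, head_dim), all_to_all_BSND yields
+# (batch, seq_full, heads/sp_size, head_dim) before the attention kernel, and a
+# second call with the dims swapped restores the original layout afterwards.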
+
+
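+# prepare_parallel_data reshuffles a batch for sequence parallelism: video
+# latents are scattered along the frame dim and gathered along the batch dim
+# (each rank ends up with sp_size times the batch but 1/sp_size of the frames),
+# while the attention masks are repeated sp_size times before the same exchange.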
+def prepare_parallel_data(hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask, use_image_num):
+ def all_to_all(hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask):
+ hidden_states = _single_all_to_all(hidden_states, scatter_dim=2, gather_dim=0, enable_HCCL=True)
+ encoder_hidden_states = _single_all_to_all(encoder_hidden_states, scatter_dim=1, gather_dim=0, enable_HCCL=True)
+ attention_mask = _single_all_to_all(attention_mask, scatter_dim=1, gather_dim=0, enable_HCCL=True)
+ encoder_attention_mask = _single_all_to_all(encoder_attention_mask, scatter_dim=1, gather_dim=0, enable_HCCL=True)
+ return hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask
+
+ sp_size = hccl_info.world_size
+ frame = hidden_states.shape[2]
+ assert frame % sp_size == 0, "frame should be a multiple of sp_size"
+
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b 1 (n x) h -> b n x h',
+ n=sp_size, x=encoder_hidden_states.shape[2]//sp_size).contiguous()
+ hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask = all_to_all(hidden_states,
+ encoder_hidden_states,
+ attention_mask.repeat(1, sp_size, 1, 1),
+ encoder_attention_mask.repeat(1, sp_size, 1))
+
+ return hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask, use_image_num
\ No newline at end of file
diff --git a/opensora/acceleration/parallel_states.py b/opensora/acceleration/parallel_states.py
new file mode 100644
index 000000000..e3f96e121
--- /dev/null
+++ b/opensora/acceleration/parallel_states.py
@@ -0,0 +1,57 @@
+import torch
+import torch_npu
+import torch.distributed as dist
+import os
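+
+# LCCL is optional; when lcalib is not importable we fall back to standard HCCL
+# collectives via torch.distributed.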
+try:
+ from lcalib.functional import lcal_initialize
+ enable_LCCL = True
+except ImportError:
+ lcal_initialize = None
+ enable_LCCL = False
+
+
+class COMM_INFO:
+ def __init__(self):
+ self.group = None
+ self.world_size = 0
+ self.rank = -1
+
+lccl_info = COMM_INFO()
+hccl_info = COMM_INFO()
+_SEQUENCE_PARALLEL_STATE = False
+
+
+def initialize_sequence_parallel_state(sequence_parallel_size):
+ global _SEQUENCE_PARALLEL_STATE
+ if sequence_parallel_size > 1:
+ _SEQUENCE_PARALLEL_STATE = True
+ initialize_sequence_parallel_group(sequence_parallel_size)
+
+def set_sequence_parallel_state(state):
+ global _SEQUENCE_PARALLEL_STATE
+ _SEQUENCE_PARALLEL_STATE = state
+
+def get_sequence_parallel_state():
+ return _SEQUENCE_PARALLEL_STATE
+
+def initialize_sequence_parallel_group(sequence_parallel_size):
+ """Initialize the sequence parallel group."""
+ rank = int(os.getenv('RANK', '0'))
+ world_size = int(os.getenv("WORLD_SIZE", '1'))
+ assert world_size % sequence_parallel_size == 0, "world_size must be divisible by sequence_parallel_size"
+ # hccl
+ hccl_info.world_size = sequence_parallel_size
+ hccl_info.rank = rank
+ num_sequence_parallel_groups: int = world_size // sequence_parallel_size
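+ # NOTE: dist.new_group is a collective call, so every rank must enter it for
+ # every subgroup, even ranks that are not members; only member ranks keep the
+ # resulting group handle.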
+ for i in range(num_sequence_parallel_groups):
+ ranks = range(i * sequence_parallel_size, (i + 1) * sequence_parallel_size)
+ group = dist.new_group(ranks)
+ if rank in ranks:
+ hccl_info.group = group
+
+ if enable_LCCL:
+ assert sequence_parallel_size == 8, "sequence_parallel_size should be 8 when enable_LCCL is True"
+ rank %= sequence_parallel_size
+ lccl_info.world_size = sequence_parallel_size
+ lccl_info.group = lcal_initialize(rank, sequence_parallel_size)
+ lccl_info.rank = rank
+
+def destroy_sequence_parallel_group():
+ """Destroy the sequence parallel group."""
+ dist.destroy_process_group()
diff --git a/opensora/adaptor/__init__.py b/opensora/adaptor/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/adaptor/bf16_optimizer.py b/opensora/adaptor/bf16_optimizer.py
new file mode 100644
index 000000000..e0401c0b3
--- /dev/null
+++ b/opensora/adaptor/bf16_optimizer.py
@@ -0,0 +1,458 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from collections import OrderedDict
+import torch
+import sys
+import os
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from deepspeed import comm as dist
+from deepspeed.runtime.constants import PIPE_REPLICATED
+from deepspeed.runtime import ZeROOptimizer
+from packaging import version as pkg_version
+
+from deepspeed.git_version_info import version
+from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, DummyOptim,
+ align_dense_tensors, all_gather_dp_groups, bwc_tensor_model_parallel_rank,
+ is_model_parallel_parameter, see_memory_usage, graph_process)
+
+from deepspeed.utils import link_hp_params, fragment_address
+from deepspeed.checkpoint import enable_universal_checkpoint
+from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE,
+ SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, GROUP_PADDINGS,
+ PARAM_SLICE_MAPPINGS)
+
+setattr(sys.modules[__name__], 'fragment_address', fragment_address)
+
+
+def contiguous_flatten(tensors):
+ return _flatten_dense_tensors([tensor.contiguous() for tensor in tensors])
+
+
+class BF16_Optimizer(ZeROOptimizer):
+
+ def __init__(self,
+ init_optimizer,
+ param_names,
+ mpu=None,
+ clip_grad=0.0,
+ norm_type=2,
+ allgather_bucket_size=5000000000,
+ dp_process_group=None,
+ timers=None,
+ grad_acc_dtype=None,
+ graph_harvesting=False):
+ # super().__init__()
+ # base_class = ZeROOptimizer.__bases__[0]
+ # # Call the base class __init__ directly
+ # base_class.__init__()
+ see_memory_usage('begin bf16_optimizer', force=True)
+ self.timers = timers
+ self.optimizer = init_optimizer
+ self.param_names = param_names
+ self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim)
+
+ assert grad_acc_dtype in [torch.float32, torch.bfloat16
+ ], f"BF16Optimizer: Unsupported gradient accumulation data type: {grad_acc_dtype}"
+ self.grad_acc_dtype = grad_acc_dtype
+
+ self.clip_grad = clip_grad
+ self.norm_type = norm_type
+ self.mpu = mpu
+ self.allgather_bucket_size = int(allgather_bucket_size)
+ self.dp_process_group = dp_process_group
+ self.dp_rank = dist.get_rank(group=self.dp_process_group)
+ self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))]
+
+ # Use torch (un)flatten ops
+ self.flatten = contiguous_flatten
+ self.unflatten = _unflatten_dense_tensors
+
+ # Align nccl all-gather send buffers to a 16-element boundary
+ self.nccl_start_alignment_factor = 16
+
+ # Build BF16/FP32 groups
+ self.bf16_groups = []
+ self.bf16_groups_flat = []
+ self.bf16_partitioned_groups = []
+
+ self.fp32_groups_flat_partition = []
+
+ # Maintain different fp32 gradient views for convenience
+ self.fp32_groups_gradients = []
+ self.fp32_groups_gradient_dict = {}
+ self.fp32_groups_gradients_flat = []
+ self.fp32_groups_actual_gradients_flat = []
+ self.fp32_groups_gradient_flat_partition = []
+ self.fp32_groups_has_gradients = []
+
+ self.group_paddings = []
+ self.graph_harvesting = graph_harvesting
+ if self.using_real_optimizer:
+ self._setup_for_real_optimizer()
+
+ see_memory_usage('end bf16_optimizer', force=True)
+
+ def _setup_for_real_optimizer(self):
+ dp_world_size = dist.get_world_size(group=self.dp_process_group)
+ self.partition_count = [dp_world_size for i in range(len(self.optimizer.param_groups))]
+
+ for i, param_group in enumerate(self.optimizer.param_groups):
+ see_memory_usage(f'before initializing group {i}', force=True)
+
+ partition_id = dist.get_rank(group=self.real_dp_process_group[i])
+
+ # grab the original list
+ trainable_parameters = [param for param in param_group['params'] if param.requires_grad]
+ self.bf16_groups.append(trainable_parameters)
+
+ # create flat bf16 params
+ self.bf16_groups_flat.append(
+ self._flatten_dense_tensors_aligned(self.bf16_groups[i],
+ self.nccl_start_alignment_factor * dp_world_size))
+
+ # Make bf16 params point to flat tensor storage
+ self._update_storage_to_flattened_tensor(tensor_list=self.bf16_groups[i],
+ flat_tensor=self.bf16_groups_flat[i])
+
+ # divide flat weights into equal sized partitions
+ partition_size = self.bf16_groups_flat[i].numel() // dp_world_size
+ bf16_dp_partitions = [
+ self.bf16_groups_flat[i].narrow(0, dp_index * partition_size, partition_size)
+ for dp_index in range(dp_world_size)
+ ]
+ self.bf16_partitioned_groups.append(bf16_dp_partitions)
+
+ # create fp32 params partition
+ self.fp32_groups_flat_partition.append(bf16_dp_partitions[partition_id].clone().float().detach())
+ self.fp32_groups_flat_partition[i].requires_grad = True
+
+ num_elem_list = [t.numel() for t in self.bf16_groups[i]]
+
+ # create fp32 gradients
+ self.fp32_groups_gradients_flat.append(
+ torch.zeros_like(self.bf16_groups_flat[i], dtype=self.grad_acc_dtype))
+
+ # track individual fp32 gradients for entire model
+ fp32_gradients = self._split_flat_tensor(flat_tensor=self.fp32_groups_gradients_flat[i],
+ num_elem_list=num_elem_list)
+ self.fp32_groups_gradients.append(fp32_gradients)
+ self.fp32_groups_gradient_dict[i] = fp32_gradients
+
+ # flat tensor corresponding to actual fp32 gradients (i.e., minus alignment padding)
+ length_without_padding = sum(num_elem_list)
+ self.fp32_groups_actual_gradients_flat.append(
+ torch.narrow(self.fp32_groups_gradients_flat[i], 0, 0, length_without_padding))
+
+ # flat tensor corresponding to gradient partition
+ self.fp32_groups_gradient_flat_partition.append(
+ torch.narrow(self.fp32_groups_gradients_flat[i], 0, partition_id * partition_size, partition_size))
+
+ # track fp32 gradient updates
+ self.fp32_groups_has_gradients.append([False] * len(self.bf16_groups[i]))
+
+ # Record padding required for alignment
+ if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1:
+ padding = self.bf16_groups_flat[i].numel() - length_without_padding
+ else:
+ padding = 0
+
+ self.group_paddings.append(padding)
+
+ # update optimizer param groups to reference fp32 params partition
+ param_group['params'] = [self.fp32_groups_flat_partition[i]]
+
+ see_memory_usage(f'after initializing group {i}', force=True)
+
+ see_memory_usage('before initialize_optimizer', force=True)
+ self.initialize_optimizer_states()
+ see_memory_usage('end initialize_optimizer', force=True)
+
+ # Need optimizer states initialized before linking lp to optimizer state
+ self._link_all_hp_params()
+ self._enable_universal_checkpoint()
+ self._param_slice_mappings = self._create_param_mapping()
+
+ def _enable_universal_checkpoint(self):
+ for lp_param_group in self.bf16_groups:
+ enable_universal_checkpoint(param_list=lp_param_group)
+
+ def _create_param_mapping(self):
+ param_mapping = []
+ for i, _ in enumerate(self.optimizer.param_groups):
+ param_mapping_per_group = OrderedDict()
+ for lp in self.bf16_groups[i]:
+ if lp._hp_mapping is not None:
+ lp_name = self.param_names[lp]
+ param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address()
+ param_mapping.append(param_mapping_per_group)
+
+ return param_mapping
+
+ def _link_all_hp_params(self):
+ dp_world_size = dist.get_world_size(group=self.dp_process_group)
+ for i, _ in enumerate(self.optimizer.param_groups):
+ # Link bf16 and fp32 params in partition
+ partition_id = dist.get_rank(group=self.real_dp_process_group[i])
+ partition_size = self.bf16_groups_flat[i].numel() // dp_world_size
+ flat_hp_partition = self.fp32_groups_flat_partition[i]
+ link_hp_params(lp_param_list=self.bf16_groups[i],
+ flat_hp_partition=flat_hp_partition,
+ gradient_dict=self.fp32_groups_gradient_dict,
+ offload_gradient_dict=None,
+ use_offload=False,
+ param_group_index=i,
+ partition_start=partition_id * partition_size,
+ partition_size=partition_size,
+ partition_optimizer_state=self.optimizer.state[flat_hp_partition],
+ dp_group=self.real_dp_process_group[i])
+
+ def initialize_optimizer_states(self):
+ """Take an optimizer step with zero-valued gradients to allocate internal
+ optimizer state.
+
+ This helps prevent memory fragmentation by allocating optimizer state at the
+ beginning of training instead of after activations have been allocated.
+ """
+ for param_partition, grad_partition in zip(self.fp32_groups_flat_partition,
+ self.fp32_groups_gradient_flat_partition):
+ # If the grad accumulation dtype differs from the fp32 params, cast the grads up to the param dtype.
+ param_partition.grad = grad_partition.to(
+ param_partition.dtype) if grad_partition.dtype != param_partition.dtype else grad_partition
+
+ self.optimizer.step()
+
+ if self.grad_acc_dtype is not torch.float32:
+ for param_partition in self.fp32_groups_flat_partition:
+ param_partition.grad = None
+
+ self.clear_hp_grads()
+
+ def _split_flat_tensor(self, flat_tensor, num_elem_list):
+ assert sum(num_elem_list) <= flat_tensor.numel()
+ tensor_list = []
+ offset = 0
+ for num_elem in num_elem_list:
+ dense_tensor = torch.narrow(flat_tensor, 0, offset, num_elem)
+ tensor_list.append(dense_tensor)
+ offset += num_elem
+
+ return tensor_list
+
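+ # _update_storage_to_flattened_tensor re-points each param's .data at its
+ # slice of the flat buffer, so the params and the flattened tensor share
+ # storage from here on.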
+ def _update_storage_to_flattened_tensor(self, tensor_list, flat_tensor):
+ updated_params = self.unflatten(flat_tensor, tensor_list)
+ for p, q in zip(tensor_list, updated_params):
+ p.data = q.data
+
+ def _flatten_dense_tensors_aligned(self, tensor_list, alignment):
+ return self.flatten(align_dense_tensors(tensor_list, alignment))
+
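+ # step(): compute the global grad norm over the local fp32 shards, optionally
+ # clip, step the wrapped optimizer on the fp32 partition, then copy the result
+ # back into the bf16 params and all-gather them (update_lp_params).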
+ @torch.no_grad()
+ def step(self, closure=None):
+ if closure is not None:
+ raise NotImplementedError(f'{self.__class__} does not support closure.')
+
+ all_groups_norm = get_global_norm_of_tensors(input_tensors=self.get_grads_for_norm(),
+ mpu=self.mpu,
+ norm_type=self.norm_type,
+ use_graph=self.graph_harvesting)
+ self._global_grad_norm = all_groups_norm
+
+ assert all_groups_norm > 0.
+ if self.clip_grad > 0.:
+ clip_tensors_by_global_norm(input_tensors=self.get_grads_for_norm(for_clipping=True),
+ max_norm=self.clip_grad,
+ global_norm=all_groups_norm,
+ mpu=self.mpu,
+ use_graph=self.graph_harvesting)
+
+ self.optimizer.step()
+
+ self.update_lp_params()
+
+ self.clear_hp_grads()
+
+ def backward(self, loss, update_hp_grads=True, clear_lp_grads=False, **bwd_kwargs):
+ """Perform a backward pass and copy the low-precision gradients to the
+ high-precision copy.
+
+ We copy/accumulate to the high-precision grads now to prevent accumulating in the
+ bf16 grads after successive backward() calls (i.e., grad accumulation steps > 1)
+
+ The low-precision grads are deallocated during this procedure.
+ """
+ self.clear_lp_grads()
+ loss.backward(**bwd_kwargs)
+
+ if update_hp_grads:
+ self.update_hp_grads(clear_lp_grads=clear_lp_grads)
+
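+ # Accumulate each bf16 grad into its fp32 shadow copy; when graph_harvesting
+ # is enabled, the accumulation loop is replayed through graph capture.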
+ @torch.no_grad()
+ def update_hp_grads(self, clear_lp_grads=False):
+
+ def _update_hp_grads_func(clear_lp_grads=False):
+ for i, group in enumerate(self.bf16_groups):
+ for j, lp in enumerate(group):
+ if lp.grad is None:
+ continue
+ hp_grad = self.fp32_groups_gradients[i][j]
+ assert hp_grad is not None, \
+ f'high precision param has no gradient, lp param_id = {id(lp)} group_info = [{i}][{j}]'
+ hp_grad.data.add_(lp.grad.data.to(hp_grad.dtype).view(hp_grad.shape))
+ lp._hp_grad = hp_grad
+ self.fp32_groups_has_gradients[i][j] = True
+ # clear gradients
+ if clear_lp_grads:
+ lp.grad.zero_()
+
+ if self.graph_harvesting:
+ graph_process(False, _update_hp_grads_func, clear_lp_grads)
+ else:
+ _update_hp_grads_func(clear_lp_grads)
+ # CPU-side bookkeeping: record which fp32 grads were populated this step
+ for i, group in enumerate(self.bf16_groups):
+ for j, lp in enumerate(group):
+ if lp.grad is None:
+ continue
+ self.fp32_groups_has_gradients[i][j] = True
+
+ @torch.no_grad()
+ def get_grads_for_reduction(self):
+ return self.fp32_groups_gradients_flat
+
+ @torch.no_grad()
+ def get_grads_for_norm(self, for_clipping=False):
+ grads = []
+ tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu)
+ for i, group in enumerate(self.bf16_groups):
+ for j, lp in enumerate(group):
+ if not for_clipping:
+ if hasattr(lp, PIPE_REPLICATED) and lp.ds_pipe_replicated:
+ continue
+
+ if not (tensor_mp_rank == 0 or is_model_parallel_parameter(lp)):
+ continue
+
+ if not self.fp32_groups_has_gradients[i][j]:
+ continue
+
+ grads.append(self.fp32_groups_gradients[i][j])
+
+ return grads
+
+ @torch.no_grad()
+ def update_lp_params(self):
+ for i, (bf16_partitions,
+ fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)):
+ partition_id = dist.get_rank(group=self.real_dp_process_group[i])
+ bf16_partitions[partition_id].data.copy_(fp32_partition.data)
+ # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True)
+ # if i == 0:
+ # print_rank_0(f'{fp32_partition[:10]=}', force=True)
+
+ all_gather_dp_groups(groups_flat=self.bf16_groups_flat,
+ partitioned_param_groups=self.bf16_partitioned_groups,
+ dp_process_group=self.real_dp_process_group,
+ start_alignment_factor=self.nccl_start_alignment_factor,
+ allgather_bucket_size=self.allgather_bucket_size)
+
+ def clear_hp_grads(self):
+ for flat_gradients in self.fp32_groups_gradients_flat:
+ flat_gradients.zero_()
+
+ for i, group in enumerate(self.fp32_groups_gradients):
+ self.fp32_groups_has_gradients[i] = [False] * len(group)
+
+ def clear_lp_grads(self):
+ for group in self.bf16_groups:
+ for param in group:
+ if param.grad is not None:
+ # Using zero_() fixed memory address for graph replay
+ param.grad.zero_()
+
+ def state_dict(self):
+ state_dict = {}
+ state_dict[CLIP_GRAD] = self.clip_grad
+ state_dict[BASE_OPTIMIZER_STATE] = self.optimizer.state_dict()
+ state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = self.fp32_groups_flat_partition
+ state_dict[GROUP_PADDINGS] = self.group_paddings
+ state_dict[PARTITION_COUNT] = self.partition_count
+ state_dict[DS_VERSION] = version
+ state_dict[PARAM_SLICE_MAPPINGS] = self._param_slice_mappings
+
+ return state_dict
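+
+    # The checkpoint carries the grad-clip setting, the wrapped fp32 optimizer
+    # state, the fp32 flat partitions, group paddings, partition counts, the
+    # DeepSpeed version, and the param-slice mappings used by universal
+    # checkpointing.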
+
+    # Restore base optimizer fp32 weights from bfloat16 weights
+ def _restore_from_bit16_weights(self):
+ for i, group in enumerate(self.bf16_groups):
+ partition_id = dist.get_rank(group=self.real_dp_process_group[i])
+ for bf16_partitions, fp32_partition in zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition):
+ fp32_partition.data.copy_(bf16_partitions[partition_id].data)
+
+ def refresh_fp32_params(self):
+ self._restore_from_bit16_weights()
+
+ def load_state_dict(self,
+ state_dict_list,
+ checkpoint_folder,
+ load_optimizer_states=True,
+ load_from_fp32_weights=False,
+ load_serial=None):
+ if checkpoint_folder:
+ self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights)
+ else:
+ self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights)
+
+ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False):
+
+ dp_rank = dist.get_rank(group=self.dp_process_group)
+ current_rank_sd = state_dict_list[dp_rank]
+
+ ckpt_version = current_rank_sd.get(DS_VERSION, False)
+        assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
+ ckpt_version = pkg_version.parse(ckpt_version)
+
+ self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad)
+
+ if load_optimizer_states:
+ self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE])
+
+ if load_from_fp32_weights:
+ for current, saved in zip(self.fp32_groups_flat_partition,
+ current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]):
+ src_tensor = _get_padded_tensor(saved, current.numel())
+ current.data.copy_(src_tensor.data)
+
+ if load_optimizer_states:
+ self._link_all_hp_params()
+
+ def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights):
+ self._load_hp_checkpoint_state(checkpoint_folder)
+
+ @property
+ def param_groups(self):
+ """Forward the wrapped optimizer's parameters."""
+ return self.optimizer.param_groups
+
+ def _load_hp_checkpoint_state(self, checkpoint_dir):
+ checkpoint_dir = os.path.join(checkpoint_dir, "zero")
+ tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu)
+ tp_world_size = self.mpu.get_slice_parallel_world_size()
+
+ for i, _ in enumerate(self.optimizer.param_groups):
+ for lp in self.bf16_groups[i]:
+ if lp._hp_mapping is not None:
+ #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}")
+ lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank,
+ tp_world_size)
+
+
+def _get_padded_tensor(src_tensor, size):
+ if src_tensor.numel() >= size:
+ return src_tensor
+ padded_tensor = torch.zeros(size, dtype=src_tensor.dtype, device=src_tensor.device)
+ slice_tensor = torch.narrow(padded_tensor, 0, 0, src_tensor.numel())
+ slice_tensor.data.copy_(src_tensor.data)
+ return padded_tensor
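+
+# Example (illustrative): _get_padded_tensor(torch.ones(3), 5) returns
+# tensor([1., 1., 1., 0., 0.]); a saved partition is zero-padded on the right
+# so it can be copied into the current (aligned, possibly larger) buffer.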
diff --git a/opensora/adaptor/engine.py b/opensora/adaptor/engine.py
new file mode 100644
index 000000000..5208a9e41
--- /dev/null
+++ b/opensora/adaptor/engine.py
@@ -0,0 +1,3568 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import os
+import re
+import stat
+import torch
+import hashlib
+from collections import defaultdict, OrderedDict, deque
+from shutil import copyfile
+import gc
+
+from torch.nn.modules import Module
+from torch.nn.parameter import Parameter
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+from typing import Callable, Dict, Union, Iterable
+
+import deepspeed
+
+from deepspeed import comm as dist
+from deepspeed.runtime.utils import see_memory_usage, DummyOptim
+from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
+from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
+from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+from deepspeed.runtime.zero.utils import is_zero_supported_optimizer, ZeRORuntimeException
+from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload
+from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION
+
+from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
+from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
+from deepspeed.runtime.bf16_optimizer import BF16_Optimizer
+
+from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \
+ ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \
+ TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, \
+ MUSGD_OPTIMIZER, LION_OPTIMIZER
+
+from deepspeed.runtime.dataloader import DeepSpeedDataLoader
+from deepspeed.runtime.constants import \
+ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \
+ PLD_THETA, PLD_GAMMA, BFLOAT16, FP16, AMP, GRADIENT_ACCUMULATION_STEPS, \
+ DATA_PARALLEL_GROUP, GLOBAL_RANK
+from deepspeed.runtime.zero.config import ZeroStageEnum
+from deepspeed.compression import compression_scheduler
+from deepspeed.compression.constants import \
+ WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, \
+ WEIGHT_QUANTIZATION, SHARED_PARAMETERS, \
+ WEIGHT_QUANTIZE_ENABLED, \
+ WEIGHT_QUANTIZE_GROUPS, \
+ WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE, \
+ WEIGHT_QUANTIZE_CHANGE_RATIO, \
+ WEIGHT_QUANTIZE_TYPE, \
+ WEIGHT_QUANTIZE_ROUNDING, \
+ WEIGHT_QUANTIZE_VERBOSE, \
+ WEIGHT_QUANTIZE_KERNEL
+from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FROZEN_PARAM_FRAGMENTS
+from deepspeed.runtime.sparse_tensor import SparseTensor
+
+from deepspeed.runtime import lr_schedules
+from deepspeed.utils import groups
+from deepspeed.utils import logger, log_dist, instrument_w_nvtx
+from deepspeed.utils.timer import NoopTimer, ThroughputTimer, SynchronizedWallClockTimer, \
+ FORWARD_MICRO_TIMER, BACKWARD_MICRO_TIMER, BACKWARD_INNER_MICRO_TIMER, BACKWARD_REDUCE_MICRO_TIMER, \
+ STEP_MICRO_TIMER, \
+ FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_GLOBAL_TIMER, BACKWARD_REDUCE_GLOBAL_TIMER, \
+ STEP_GLOBAL_TIMER
+from deepspeed.utils.debug import debug_extract_module_and_param_names
+from deepspeed.monitor.monitor import MonitorMaster
+from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
+from deepspeed.runtime.utils import clip_grad_norm_
+from deepspeed.runtime.eigenvalue import Eigenvalue
+from deepspeed.runtime.data_pipeline.constants import DATA_SAMPLING, \
+ DATA_ROUTING, DATA_SAMPLING_ENABLED, CURRICULUM_LEARNING, \
+ CURRICULUM_LEARNING_ENABLED, DATA_SAMPLING_NUM_WORKERS, RANDOM_LTD, \
+ RANDOM_LTD_ENABLED, RANDOM_LTD_LAYER_ID, RANDOM_LTD_LAYER_NUM, \
+ RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE, RANDOM_LTD_LAYER_TOKEN_LR_ENABLED, \
+ RANDOM_LTD_GLOBAL_BATCH_SIZE, RANDOM_LTD_MICRO_BATCH_SIZE, DATA_EFFICIENCY
+from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler
+from deepspeed.runtime.data_pipeline.data_routing.scheduler import RandomLTDScheduler
+from deepspeed.runtime.data_pipeline.data_routing.helper import remove_random_ltd_state_dict
+from deepspeed.runtime.data_pipeline.data_routing.basic_layer import RandomLayerTokenDrop
+
+from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine
+from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+
+from deepspeed.runtime.pipe.module import PipelineModule
+from deepspeed.runtime.utils import get_ma_status
+from deepspeed.ops.adam import FusedAdam
+from deepspeed.moe.sharded_moe import TopKGate, MOELayer
+from deepspeed.moe.layer import MoE
+from deepspeed.moe.utils import is_moe_param
+from deepspeed.git_version_info import version
+
+from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler
+from deepspeed.utils.logging import print_json_dist, print_configuration
+
+from deepspeed.accelerator import get_accelerator
+
+from deepspeed.runtime.config import DtypeEnum
+
+from opensora.adaptor.zp_manager import zp_manager
+
+MEMORY_OPT_ALLREDUCE_SIZE = 500000000
+
+DeepSpeedOptimizerCallable = \
+ Callable[[Union[Iterable[Parameter], Dict[str, Iterable]]], Optimizer]
+DeepSpeedSchedulerCallable = Callable[[Optimizer], _LRScheduler]
+
+try:
+ import apex
+ from apex import amp
+ APEX_INSTALLED = True
+except ImportError:
+ # Fail silently so we don't spam logs unnecessarily if user isn't using amp
+ APEX_INSTALLED = False
+
+
+def split_half_float_double_sparse(tensors):
+ device_type = get_accelerator().device_name()
+ supported_types = [
+ "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type),
+ "torch.{}.DoubleTensor".format(device_type), "torch.{}.BFloat16Tensor".format(device_type),
+ SparseTensor.type()
+ ]
+
+ for t in tensors:
+ assert t.type() in supported_types, f"attempting to reduce an unsupported grad type: {t.type()}"
+
+ buckets = []
+ for i, dtype in enumerate(supported_types):
+ bucket = [t for t in tensors if t.type() == dtype]
+ if bucket:
+ buckets.append((dtype, bucket))
+ return buckets
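+
+# Example (illustrative, on a CUDA accelerator): given a mix of fp32 and bf16
+# grads, this returns [("torch.cuda.FloatTensor", [...]),
+# ("torch.cuda.BFloat16Tensor", [...])] so each bucket can be reduced with a
+# single dtype-uniform collective.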
+
+
+class EngineTimers(object):
+ r"""Wallclock timers for DeepSpeedEngine"""
+
+ def __init__(self, enable_micro_timers, enable_global_timers):
+ self.forward_timers = []
+ self.backward_timers = []
+ self.backward_inner_timers = []
+ self.backward_reduce_timers = []
+ self.step_timers = []
+ self.global_timers = []
+ self.micro_timers = []
+
+ if enable_micro_timers:
+ self.forward_timers += [FORWARD_MICRO_TIMER]
+ self.backward_timers += [BACKWARD_MICRO_TIMER]
+ self.backward_inner_timers += [BACKWARD_INNER_MICRO_TIMER]
+ self.backward_reduce_timers += [BACKWARD_REDUCE_MICRO_TIMER]
+ self.step_timers += [STEP_MICRO_TIMER]
+ self.micro_timers += [
+ FORWARD_MICRO_TIMER, BACKWARD_MICRO_TIMER, BACKWARD_INNER_MICRO_TIMER, BACKWARD_REDUCE_MICRO_TIMER,
+ STEP_MICRO_TIMER
+ ]
+
+ if enable_global_timers:
+ self.forward_timers += [FORWARD_GLOBAL_TIMER]
+ self.backward_timers += [BACKWARD_GLOBAL_TIMER]
+ self.backward_inner_timers += [BACKWARD_INNER_GLOBAL_TIMER]
+ self.backward_reduce_timers += [BACKWARD_REDUCE_GLOBAL_TIMER]
+ self.step_timers += [STEP_GLOBAL_TIMER]
+ self.global_timers += [
+ FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_GLOBAL_TIMER, BACKWARD_REDUCE_GLOBAL_TIMER,
+ STEP_GLOBAL_TIMER
+ ]
+
+
+class DeepSpeedEngine(Module):
+ r"""DeepSpeed engine for training."""
+
+ def __init__(
+ self,
+ args,
+ model,
+ optimizer=None,
+ model_parameters=None,
+ training_data=None,
+ lr_scheduler=None,
+ mpu=None,
+ dist_init_required=None,
+ collate_fn=None,
+ config=None,
+ config_class=None,
+ dont_change_device=False,
+ ):
+ super(DeepSpeedEngine, self).__init__()
+ self.dont_change_device = dont_change_device
+ self.client_optimizer = optimizer
+ self.client_lr_scheduler = lr_scheduler
+ self.training_data = training_data
+ self.collate_fn = collate_fn
+ self.mpu = mpu
+ self.all_to_all_group = None
+ self.data_parallel_group = None
+ self.global_steps = 0
+ self.global_samples = 0
+ self.micro_steps = 0
+ self.skipped_steps = 0
+ self.gradient_average = True
+ self.warn_unscaled_loss = True
+ self.config = config
+ self._config = config_class
+ self.loaded_checkpoint_mp_world_size = None
+ self.loaded_checkpoint_dp_world_size = None
+ self.loaded_checkpoint_zp_world_size = None
+ self.enable_backward_allreduce = True
+ self.progressive_layer_drop = None
+ self.eigenvalue = None
+ self.block_eigenvalue = None
+ self.gas_boundary_ctr = 0
+ self.dist_backend = get_accelerator().communication_backend_name()
+ self.has_moe_layers = False
+ self.num_experts = []
+ self.gate_modules = []
+ self.moe_layers = []
+ self._step_applied = False
+ self._global_grad_norm = None
+ self.use_ds_comm = False # False --> Use torch.dist, True --> Use ds.comm backend.
+
+ self.checkpoint_engine = None
+
+ self._is_gradient_accumulation_boundary = None
+ self.scale_wrt_gas = None
+ self.losses = 0.0
+
+ # for debug purposes - can then debug print: debug_get_module_name(module)
+ debug_extract_module_and_param_names(model)
+
+ self._do_args_sanity_check(args)
+ self._configure_with_arguments(args, mpu)
+ self._do_sanity_check()
+ see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown())
+ if mpu is not None:
+ if self.elasticity_enabled():
+ if not self.is_elastic_model_parallel_supported():
+ assert not self.elasticity_enabled(), ("Elasticity is not currently supported"
+ " with model parallelism.")
+
+ self._set_distributed_vars(args)
+
+ dist.configure(self._config)
+
+ self.monitor = MonitorMaster(self._config.monitor_config)
+
+ see_memory_usage(
+ f"DeepSpeed Engine: Before configure distributed model",
+ force=self.memory_breakdown(),
+ )
+
+ self.pipeline_parallelism = isinstance(model, PipelineModule)
+
+ # Configure distributed model
+ self._configure_distributed_model(model)
+
+ # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict
+ self.param_names = {param: name for name, param in model.named_parameters()}
+
+ self._get_model_parameters()
+
+ see_memory_usage(f"DeepSpeed Engine: After configure distributed model")
+
+ # Configure wall clock timers
+ self.timers = SynchronizedWallClockTimer()
+ # Throughput timer
+ self.tput_timer = ThroughputTimer(
+ batch_size=self.train_batch_size(),
+ steps_per_output=self.steps_per_print(),
+ monitor_memory=False,
+ )
+
+ log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", ranks=[0])
+
+ if self.flops_profiler_enabled():
+ self.flops_profiler = FlopsProfiler(self.module, self, self.flops_profiler_recompute_fwd_factor())
+
+ if training_data:
+ self.training_dataloader = self.deepspeed_io(training_data)
+ else:
+ self.training_dataloader = None
+
+ # Configure optimizer and scheduler
+ self.optimizer = None
+ self.basic_optimizer = None
+ self.lr_scheduler = None
+ has_optimizer = False
+
+ if optimizer or self.optimizer_name():
+ has_optimizer = True
+ # If no parameters given by init default to module parameters
+ if model_parameters is None:
+ model_parameters = self.module.parameters()
+
+ # Convert model parameters from generator to list
+ if not isinstance(model_parameters, list):
+ model_parameters = list(model_parameters)
+
+ if has_optimizer:
+ self._configure_optimizer(optimizer, model_parameters)
+ self._configure_lr_scheduler(lr_scheduler)
+ self._report_progress(0)
+ elif self.zero_optimization():
+ # no optim selected but zero is enabled
+ self.optimizer = self._configure_zero_optimizer(optimizer=None)
+ elif self.bfloat16_enabled():
+ self.optimizer = self._configure_bf16_optimizer(optimizer=None)
+
+ # Hook optimizer for snip_momentum pruning
+ if hasattr(model, 'pruners'):
+ from deepspeed.compression.helper import rewrite_optimizer_step
+ self.optimizer.pruners = model.pruners
+ rewrite_optimizer_step(self.optimizer)
+
+ # Bookkeeping for sparse support
+ self.sparse_tensor_module_names = set()
+ # if self.sparse_gradients_enabled():
+ for name, module in self.module.named_modules():
+ if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)) and self.sparse_gradients_enabled():
+ self.sparse_tensor_module_names.add(name + ".weight")
+ logger.info("Will convert {} to sparse tensor during training".format(name))
+
+ self.save_non_zero_checkpoint = False
+ self.save_zero_checkpoint = False
+ if not isinstance(self.optimizer, DeepSpeedZeRoOffload):
+ self._configure_checkpointing(dist_init_required)
+
+ if self.eigenvalue_enabled():
+ self.eigenvalue = self._configure_eigenvalue()
+
+ if self.pld_enabled():
+ self.progressive_layer_drop = self._configure_progressive_layer_drop()
+
+ if self.curriculum_enabled_legacy():
+ self.curriculum_scheduler_legacy = self._configure_curriculum_scheduler_legacy()
+
+ if self.random_ltd_enabled():
+ random_ltd_config = self.random_ltd_config()
+ random_ltd_config[RANDOM_LTD_GLOBAL_BATCH_SIZE] = self.train_batch_size()
+ random_ltd_config[RANDOM_LTD_MICRO_BATCH_SIZE] = self.train_micro_batch_size_per_gpu()
+ self.random_ltd_scheduler = self._configure_random_ltd_scheduler(random_ltd_config)
+
+ # Engine timers
+
+ self.engine_timers = EngineTimers(enable_micro_timers=self.wall_clock_breakdown(),
+ enable_global_timers=self.wall_clock_breakdown()
+ or self.flops_profiler_enabled())
+
+ if self.global_rank == 0:
+ self._config.print("DeepSpeedEngine configuration")
+ if self.dump_state():
+ print_configuration(self, "DeepSpeedEngine")
+
+ # Use torch (un)flatten ops
+ self.flatten = _flatten_dense_tensors
+ self.unflatten = _unflatten_dense_tensors
+
+ def destroy(self):
+ if self.optimizer is not None and hasattr(self.optimizer, 'destroy'):
+ self.optimizer.destroy()
+
+ def _get_model_parameters(self):
+ if self.autotuning_profile_model_info():
+ self.autotuning_model_info = {}
+ num_params = 0
+ trainable_num_params = 0
+
+ for p in self.module.parameters():
+            # since user code might call deepspeed.zero.Init() before deepspeed.initialize(),
+            # check this attribute to see whether the parameter has already been partitioned by ZeRO stage 3
+ n = 0
+ if hasattr(p, "ds_tensor"): # if the parameter is partitioned in zero 3
+ n += p.ds_numel
+ else: # if the parameter is not partitioned in zero 3 yet
+ n += p.numel()
+ num_params += n
+ if p.requires_grad:
+ trainable_num_params += n
+ if self.global_rank == 0:
+ self.autotuning_model_info["num_params"] = num_params * self.mp_world_size
+ self.autotuning_model_info["trainable_num_params"] = trainable_num_params * self.mp_world_size
+
+ logger.info(f"model parameter = {num_params}")
+
+ def get_batch_info(self):
+ """Get all training batch related settings.
+ Returns:
+            train_batch_size (int): The effective training batch size. This is the number of data
+                samples that leads to one model update step.
+ train_micro_batch_size_per_gpu (int): Batch size to be processed by one GPU in one
+ step (without gradient accumulation).
+ gradient_accumulation_steps (int): Number of training steps to accumulate gradients
+ before averaging and applying them.
+ """
+        return (
+            self.train_batch_size(),
+            self.train_micro_batch_size_per_gpu(),
+            self.gradient_accumulation_steps(),
+        )
+
+ def set_train_batch_size(self, train_batch_size):
+ """Adjust the global batch size by increasing or decreasing the number of
+ micro-batches (i.e., gradient accumulation steps). The size of each micro-batch
+ (i.e., ``train_micro_batch_size_per_gpu``) is not changed.
+ Args:
+ train_batch_size (int): The new global batch size for training.
+ Raises:
+ ValueError: if ``train_batch_size`` is not divisible by the
+ configured micro-batch size and data parallelism.
+ """
+ if train_batch_size % (self.train_micro_batch_size_per_gpu() * self.dp_world_size) != 0:
+ #print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}')
+            raise ValueError('Train batch size must be divisible by micro-batch size * data parallelism')
+ new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * self.dp_world_size)
+ # overwrite config
+ self._config.train_batch_size = train_batch_size
+ self._config.gradient_accumulation_steps = new_gas
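+
+        # Example (illustrative): with train_micro_batch_size_per_gpu=4 and
+        # dp_world_size=4, set_train_batch_size(64) keeps the micro-batch
+        # fixed and sets gradient_accumulation_steps = 64 // (4 * 4) = 4.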
+
+ def set_train_micro_batch_size(self, micro_batch_size):
+        """Adjust the micro-batch size (i.e., the batch size processed by each data-parallel
+        rank) while keeping the gradient accumulation steps the same.
+ Args:
+ micro_batch_size (int): The new micro batch size for training.
+ """
+ # overwrite config
+ new_global_batch_size = micro_batch_size * self._config.gradient_accumulation_steps * self.dp_world_size
+ self._config.train_batch_size = new_global_batch_size
+ self._config.train_micro_batch_size_per_gpu = micro_batch_size
+
+ def set_data_post_process_func(self, post_process_func):
+ if self.training_dataloader is not None:
+ self.training_dataloader.post_process_func = post_process_func
+
+ def set_custom_curriculum_learning_schedule(self, schedule_func_dict):
+ if self.training_dataloader is not None and self.curriculum_learning_enabled():
+ self.training_dataloader.data_sampler.set_custom_curriculum_learning_schedule(schedule_func_dict)
+
+ def get_global_grad_norm(self) -> float:
+ """Return the 2-norm of all gradients. If there is model parallelism,
+ the norm will be global.
+ The computed norm will be cached and reused until the next step() pass.
+ .. note::
+ In the presence of model parallelism, this is a collective call
+ and acts as a barrier among ``mpu.get_model_parallel_group()``.
+ Returns:
+ float: norm
+ """
+ return self._global_grad_norm
+
+ def __getattr__(self, name):
+ """
+ Pass through attributes defined in the model if they are not overridden by ds-engine.
+ """
+
+ _module = {}
+ if "module" in self.__dict__:
+ _module = self.__dict__['module']
+ if name in dir(self):
+ return getattr(self, name)
+ elif name in dir(_module):
+ return getattr(_module, name)
+ else:
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+
+ def checkpoint_tag_validation_enabled(self):
+ return self._config.checkpoint_tag_validation_enabled
+
+ def checkpoint_tag_validation_fail(self):
+ return self._config.checkpoint_tag_validation_fail
+
+ def elasticity_enabled(self):
+ return self._config.elasticity_enabled
+
+ def is_elastic_model_parallel_supported(self):
+ if self.elasticity_enabled():
+ # Add code for finding number of GPUs per node automatically
+ if self._config.num_gpus_per_node % self._config.elastic_model_parallel_size == 0:
+ return True
+ else:
+ return False
+
+ def pld_enabled(self):
+ return self._config.pld_enabled
+
+ def pld_params(self):
+ return self._config.pld_params
+
+ def pld_theta(self):
+ return self.pld_params()[PLD_THETA]
+
+ def pld_gamma(self):
+ return self.pld_params()[PLD_GAMMA]
+
+ def eigenvalue_enabled(self):
+ return self._config.eigenvalue_enabled
+
+ def eigenvalue_verbose(self):
+ return self._config.eigenvalue_verbose
+
+ def eigenvalue_max_iter(self):
+ return self._config.eigenvalue_max_iter
+
+ def eigenvalue_tol(self):
+ return self._config.eigenvalue_tol
+
+ def eigenvalue_stability(self):
+ return self._config.eigenvalue_stability
+
+ def eigenvalue_gas_boundary_resolution(self):
+ return self._config.eigenvalue_gas_boundary_resolution
+
+ def eigenvalue_layer_name(self):
+ return self._config.eigenvalue_layer_name
+
+ def eigenvalue_layer_num(self):
+ return self._config.eigenvalue_layer_num
+
+ def curriculum_enabled_legacy(self):
+ return self._config.curriculum_enabled_legacy
+
+ def curriculum_params_legacy(self):
+ return self._config.curriculum_params_legacy
+
+ def data_efficiency_enabled(self):
+ return self._config.data_efficiency_enabled
+
+ def data_efficiency_config(self):
+ return self._config.data_efficiency_config
+
+ def data_sampling_enabled(self):
+ return self._config.data_efficiency_config[DATA_SAMPLING][DATA_SAMPLING_ENABLED]
+
+ def data_sampling_config(self):
+ return self._config.data_efficiency_config[DATA_SAMPLING]
+
+ def curriculum_learning_enabled(self):
+ return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED]
+
+ def curriculum_learning_config(self):
+ return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING]
+
+ def random_ltd_enabled(self):
+ return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD][RANDOM_LTD_ENABLED]
+
+ def random_ltd_config(self):
+ return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD]
+
+ def random_ltd_initialize(self):
+ assert self.random_ltd_enabled()
+ random_ltd_config = self.random_ltd_config()
+ random_ltd_queue = deque([x for x in sorted(random_ltd_config[RANDOM_LTD_LAYER_ID])])
+ count = 0
+ for name, layer in self.module.named_modules():
+ if isinstance(layer, RandomLayerTokenDrop):
+ if len(random_ltd_queue) != 0 and str(random_ltd_queue[0]) in name: ###[1,2,3]
+ layer.init_config(random_ltd_config, self.random_ltd_scheduler, count)
+ random_ltd_queue.popleft()
+ count += 1
+
+ if random_ltd_config[RANDOM_LTD_LAYER_NUM] != count:
+            raise ValueError(f'random_ltd_layer_num {random_ltd_config[RANDOM_LTD_LAYER_NUM]} must \
+                equal the number of matched random_ltd layers ({count})')
+
+ if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]:
+ assert self.client_lr_scheduler is None
+            raise ValueError('layer token LR schedule is not yet supported')
+ #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler)
+
+ def wall_clock_breakdown(self):
+ return self._config.wall_clock_breakdown
+
+ def flops_profiler_enabled(self):
+ return self._config.flops_profiler_config.enabled or self.autotuning_enabled()
+
+ def flops_profiler_recompute_fwd_factor(self):
+ return self._config.flops_profiler_config.recompute_fwd_factor
+
+ def flops_profiler_profile_step(self):
+ step = self._config.flops_profiler_config.profile_step
+ if self._config.autotuning_config.enabled:
+ step = self.autotuning_start_profile_step()
+ return step
+
+ def flops_profiler_module_depth(self):
+ return self._config.flops_profiler_config.module_depth
+
+ def flops_profiler_top_modules(self):
+ return self._config.flops_profiler_config.top_modules
+
+ def flops_profiler_detailed(self):
+ if self._config.autotuning_config.enabled:
+ return False
+ return self._config.flops_profiler_config.detailed
+
+ def flops_profiler_output_file(self):
+ return self._config.flops_profiler_config.output_file
+
+ def memory_breakdown(self):
+ return self._config.memory_breakdown
+
+ def autotuning_enabled(self):
+ return self._config.autotuning_config.enabled
+
+ def autotuning_start_profile_step(self):
+ return self._config.autotuning_config.start_profile_step
+
+ def autotuning_end_profile_step(self):
+ return self._config.autotuning_config.end_profile_step
+
+ def autotuning_metric_path(self):
+ path = self._config.autotuning_config.metric_path
+ if not path:
+ path = os.path.join(os.getcwd(), "autotuning_metric.json")
+ return path
+
+ def autotuning_model_info_path(self):
+ path = self._config.autotuning_config.model_info_path
+ if not path:
+ path = os.path.join(os.getcwd(), "autotuning_model_info.json")
+ return path
+
+ def autotuning_metric(self):
+ return self._config.autotuning_config.metric
+
+ def autotuning_profile_model_info(self):
+ return self.autotuning_enabled(
+ ) and self._config.autotuning_config.model_info and self._config.autotuning_config.model_info.get(
+ "profile", False)
+
+ def sparse_gradients_enabled(self):
+ return self._config.sparse_gradients_enabled
+
+ def train_batch_size(self):
+ return self._config.train_batch_size
+
+ def train_micro_batch_size_per_gpu(self):
+ return self._config.train_micro_batch_size_per_gpu
+
+ def optimizer_name(self):
+ return (self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name)
+
+ def optimizer_params(self):
+ return self._config.optimizer_params
+
+ def optimizer_legacy_fusion(self):
+ return self._config.optimizer_legacy_fusion
+
+ def scheduler_name(self):
+ return self._config.scheduler_name
+
+ def scheduler_params(self):
+ return self._config.scheduler_params
+
+ def quantize_training(self):
+ return (
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS]
+ [WEIGHT_QUANTIZE_IN_FORWARD_ENABLED],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_GROUPS],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS]
+ [WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_CHANGE_RATIO],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_TYPE],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_ROUNDING],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_VERBOSE],
+ self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_KERNEL],
+ )
+
+ def zero_optimization(self):
+ return self._config.zero_enabled
+
+ def zero_allow_untested_optimizer(self):
+ return self._config.zero_allow_untested_optimizer
+
+ def zero_force_ds_cpu_optimizer(self):
+ return self._config.zero_force_ds_cpu_optimizer
+
+ def zero_reduce_scatter(self):
+ return self._config.zero_config.reduce_scatter
+
+ def zero_overlap_comm(self):
+ return self._config.zero_config.overlap_comm
+
+ def zero_offload_optimizer(self):
+ return self._config.zero_config.offload_optimizer
+
+ def zero_offload_param(self):
+ return self._config.zero_config.offload_param
+
+ def zero_use_cpu_optimizer(self):
+ if self._config.zero_config.offload_optimizer is not None:
+ return self._config.zero_config.offload_optimizer.device in [OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme]
+ return False
+
+ def zero_cpu_offload(self):
+ if self._config.zero_config.offload_optimizer is not None:
+ return self._config.zero_config.offload_optimizer.device == OffloadDeviceEnum.cpu
+ return False
+
+ def zero_partial_offload(self):
+ return getattr(self._config.zero_config.offload_optimizer, "ratio", 1.0)
+
+ def zero_sub_group_size(self):
+ return self._config.zero_config.sub_group_size
+
+ def zero_optimization_stage(self):
+ return self._config.zero_optimization_stage
+
+ def mics_shard_size(self):
+ return self._config.mics_shard_size
+
+ def zero_reduce_bucket_size(self):
+ return self._config.zero_config.reduce_bucket_size
+
+ def zero_multi_rank_bucket_allreduce(self):
+ return self._config.zero_config.use_multi_rank_bucket_allreduce
+
+ def zero_allgather_bucket_size(self):
+ return self._config.zero_config.allgather_bucket_size
+
+ def zero_optimization_partition_gradients(self):
+ return self.zero_optimization_stage() >= ZeroStageEnum.gradients
+
+ def zero_optimization_partition_weights(self):
+ return self.zero_optimization_stage() >= ZeroStageEnum.weights
+
+ def is_first_weights_partition_group(self):
+        ret = self.mics_shard_size() < 0 and self.zero_optimization_partition_weights()
+ if self.mics_shard_size() > 0 and self.global_rank < self.mics_shard_size():
+ ret = True
+ return ret
+
+ def zero_contiguous_gradients(self):
+ return self._config.zero_config.contiguous_gradients
+
+ def zero_load_from_fp32_weights(self):
+ return self._config.zero_config.load_from_fp32_weights
+
+ def zero_elastic_checkpoint(self):
+ return self._config.zero_config.elastic_checkpoint
+
+ def zero_max_live_parameters(self):
+ return self._config.zero_config.max_live_parameters
+
+ def zero_max_reuse_distance(self):
+ return self._config.zero_config.max_reuse_distance
+
+ def zero_prefetch_bucket_size(self):
+ return self._config.zero_config.prefetch_bucket_size
+
+ def zero_param_persistence_threshold(self):
+ return self._config.zero_config.param_persistence_threshold
+
+ def zero_model_persistence_threshold(self):
+ return self._config.zero_config.model_persistence_threshold
+
+ def zero_gather_16bit_weights_on_model_save(self):
+ return self._config.zero_config.gather_16bit_weights_on_model_save
+
+ def zero_grad_hooks(self):
+ return self._config.zero_config.grad_hooks
+
+ def zero_legacy_stage1(self):
+ return self._config.zero_config.legacy_stage1
+
+ def zero_ignore_unused_parameters(self):
+ return self._config.zero_config.ignore_unused_parameters
+
+ def graph_harvesting(self):
+ return self._config.graph_harvesting
+
+ def fp16_enabled(self):
+ return self._config.fp16_enabled
+
+ def bfloat16_enabled(self):
+ return self._config.bfloat16_enabled
+
+ def fp16_master_weights_and_gradients(self):
+ return self._config.fp16_master_weights_and_gradients
+
+ def amp_enabled(self):
+ return self._config.amp_enabled
+
+ def amp_params(self):
+ return self._config.amp_params
+
+ def fp16_auto_cast(self):
+ return self._config.fp16_auto_cast
+
+ def loss_scale(self):
+ return self._config.loss_scale
+
+ def gradient_accumulation_steps(self):
+ return self._config.gradient_accumulation_steps
+
+ def use_node_local_storage(self):
+ return self._config.use_node_local_storage
+
+ def load_universal_checkpoint(self):
+ return self._config.load_universal_checkpoint
+
+ @property
+ def communication_data_type(self):
+ res = self._config.communication_data_type
+ if res is not None:
+ return res
+
+ if self.fp16_enabled():
+ return torch.float16
+
+ if self.bfloat16_enabled():
+ return torch.bfloat16
+
+ return torch.float32
+
+ @communication_data_type.setter
+ def communication_data_type(self, value):
+ self._config.communication_data_type = value
+
+ def postscale_gradients(self):
+ return not self._config.prescale_gradients
+
+ def gradient_predivide_factor(self):
+ return self._config.gradient_predivide_factor
+
+ def steps_per_print(self):
+ return self._config.steps_per_print
+
+ def zero_allgather_partitions(self):
+ return self._config.zero_config.allgather_partitions
+
+ def zero_round_robin_gradients(self):
+ return self._config.zero_config.round_robin_gradients
+
+ def zero_hpz_partition_size(self):
+ return self._config.zero_config.zero_hpz_partition_size
+
+ def zero_quantized_weights(self):
+ return self._config.zero_config.zero_quantized_weights
+
+ def zero_quantized_nontrainable_weights(self):
+ return self._config.zero_config.zero_quantized_nontrainable_weights
+
+ def zero_quantized_gradients(self):
+ return self._config.zero_config.zero_quantized_gradients
+
+ def dump_state(self):
+ return self._config.dump_state
+
+ def gradient_clipping(self):
+ return self._config.gradient_clipping
+
+ def dynamic_loss_scale(self):
+ return self._config.loss_scale == 0
+
+ def initial_dynamic_scale(self):
+ return self._config.initial_dynamic_scale
+
+ def dynamic_loss_scale_args(self):
+ return self._config.dynamic_loss_scale_args
+
+ def swap_tensor_config(self):
+ return self._config.swap_tensor_config
+
+ def aio_config(self):
+ return self._config.aio_config
+
+ def get_data_types(self):
+ model_dtype = torch.float32
+ if self.fp16_enabled():
+ model_dtype = torch.float16
+ elif self.bfloat16_enabled():
+ model_dtype = torch.bfloat16
+
+ if self._config.grad_accum_dtype is None:
+ if model_dtype == torch.bfloat16 and not self.zero_optimization():
+ grad_accum_dtype = torch.float32
+ else:
+ grad_accum_dtype = model_dtype
+ else:
+ grad_accum_dtype = DtypeEnum(self._config.grad_accum_dtype).value
+
+ return (model_dtype, grad_accum_dtype)
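+
+    # Resolution summary (when grad_accum_dtype is not set explicitly):
+    # fp16 -> (fp16, fp16); bf16 with ZeRO -> (bf16, bf16);
+    # bf16 without ZeRO -> (bf16, fp32); fp32 -> (fp32, fp32).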
+
+ def _optimizer_has_ckpt_event_prologue(self):
+ return self.optimizer is not None and hasattr(self.optimizer, 'checkpoint_event_prologue')
+
+ def _optimizer_has_ckpt_event_epilogue(self):
+ return self.optimizer is not None and hasattr(self.optimizer, 'checkpoint_event_epilogue')
+
+ def _configure_lr_scheduler(self, client_lr_scheduler):
+ # First check for scheduler in json configuration
+ lr_scheduler = self._scheduler_from_config(self.optimizer)
+ if lr_scheduler:
+ log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0])
+ self.lr_scheduler = lr_scheduler
+ else:
+ if isinstance(client_lr_scheduler, Callable):
+ log_dist('DeepSpeed using client callable to create LR scheduler', ranks=[0])
+ self.lr_scheduler = client_lr_scheduler(self.basic_optimizer)
+ else:
+ log_dist('DeepSpeed using client LR scheduler', ranks=[0])
+ self.lr_scheduler = client_lr_scheduler
+
+ log_dist(f'DeepSpeed LR Scheduler = {self.lr_scheduler}', ranks=[0])
+
+ def _configure_checkpointing(self, dist_init_required):
+ self.checkpoint_engine = TorchCheckpointEngine()
+
+ if self._config is not None and self._config.nebula_config.enabled:
+ try:
+ from deepspeed.runtime.checkpoint_engine.nebula_checkpoint_engine import \
+ NebulaCheckpointEngine
+ self.checkpoint_engine = NebulaCheckpointEngine(config_params=self._config.nebula_config)
+ except ImportError as err:
+ logger.error(f"No torch_nebula was found! Will fall back to torch.save. Details: {err}")
+ self.checkpoint_engine = TorchCheckpointEngine()
+
+ dp_rank = groups._get_sequence_data_parallel_rank()
+
+ rank = self.local_rank if self.use_node_local_storage() else dp_rank
+
+ # only the first data parallel process needs to store the model checkpoint
+ # if you want to use node local storage this must be done by rank 0 on each
+ # node
+ self.save_non_zero_checkpoint = (rank == 0) or (self.zero_optimization_partition_weights()
+ and self.is_first_weights_partition_group())
+
+ if self.zero_optimization() or self.bfloat16_enabled():
+ param_rank = dist.get_rank(group=self.optimizer.zp_process_group)
+
+ # Only the first parameter parallel process needs to store the
+ # optimizer state checkpoints for zero
+ self.save_zero_checkpoint = param_rank == dp_rank
+
+ def _scheduler_from_config(self, optimizer):
+ scheduler_name = self.scheduler_name()
+ if scheduler_name is not None:
+ if hasattr(lr_schedules, scheduler_name):
+ scheduler = getattr(lr_schedules, scheduler_name)
+ else:
+ assert hasattr(torch.optim.lr_scheduler,
+ scheduler_name), f"DeepSpeed does not recognize LR scheduler {scheduler_name}"
+
+ scheduler = getattr(torch.optim.lr_scheduler, scheduler_name)
+
+ scheduler_params = self.scheduler_params()
+ instantiated_scheduler = scheduler(optimizer, **scheduler_params)
+ return instantiated_scheduler
+ else:
+ return None
+
+ def _set_distributed_vars(self, args):
+ device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank
+ if device_rank >= 0:
+ get_accelerator().set_device(device_rank)
+ self.device = torch.device(get_accelerator().device_name(), device_rank)
+ self.world_size = dist.get_world_size()
+ self.global_rank = dist.get_rank()
+ else:
+ self.world_size = 1
+ self.global_rank = 0
+ self.device = torch.device(get_accelerator().device_name())
+
+ # Configure based on command line arguments
+ def _configure_with_arguments(self, args, mpu):
+ # After the distributed backend is initialized we are guaranteed the LOCAL_RANK
+ # environment variable is set. We must align args.local_rank to this value for
+ # backwards compatibility with scripts relying on [args|self].local_rank containing
+ # the correct local rank info. _do_args_sanity_check will ensure this is the case.
+
+ if "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ:
+ ompi_local_rank = os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+ local_rank = os.environ.get('LOCAL_RANK', ompi_local_rank)
+ assert ompi_local_rank == local_rank, f"LOCAL_RANK ({local_rank}) != OMPI_COMM_WORLD_LOCAL_RANK ({ompi_local_rank}), " \
+ "not sure how to proceed as we're seeing conflicting local rank info."
+ os.environ['LOCAL_RANK'] = local_rank
+
+ self.local_rank = int(os.environ['LOCAL_RANK'])
+ if hasattr(args, 'local_rank'):
+ args.local_rank = self.local_rank
+
+ # Validate command line arguments
+ def _do_args_sanity_check(self, args):
+ assert "LOCAL_RANK" in os.environ or "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ, "DeepSpeed requires the LOCAL_RANK environment " \
+ "variable, it is set by the deepspeed launcher, deepspeed.init_distributed, or the torch's launcher. If using a " \
+ "different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed."
+
+ if hasattr(args, 'local_rank') and args.local_rank is not None:
+ assert isinstance(args.local_rank,
+ int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}"
+ if args.local_rank >= 0:
+ env_local_rank = int(os.environ.get("LOCAL_RANK"))
+ assert (
+ env_local_rank == args.local_rank
+ ), f"Mismatch in local rank setting, args.local_rank={args.local_rank} but env['LOCAL_RANK']={env_local_rank}."
+
+ def _is_supported_optimizer(self, optimizer_name):
+ return (optimizer_name in DEEPSPEED_OPTIMIZERS or getattr(torch.optim, optimizer_name, None) is not None)
+
+ def _supported_optims(self):
+ FairseqOptimizer = None
+ try:
+ from fairseq.optim.fairseq_optimizer import FairseqOptimizer
+ except ImportError:
+ pass
+
+ expected_optim_types = [Optimizer]
+ if FairseqOptimizer:
+ # fairseq optims are not torch.optim objects
+ expected_optim_types.append(FairseqOptimizer)
+ return expected_optim_types
+
+ # Validate configuration based on command line arguments
+ def _do_sanity_check(self):
+ expected_optim_types = self._supported_optims()
+ expected_optim_types += [type(None), Callable]
+ assert isinstance(self.client_optimizer, tuple(expected_optim_types)), \
+ f'Client Optimizer is of unexpected type {type(self.client_optimizer)}'
+
+ if not self.client_optimizer:
+ if self.optimizer_name() is not None:
+ assert self._is_supported_optimizer(
+ self.optimizer_name()), "{} is not a supported DeepSpeed Optimizer".format(self.optimizer_name())
+
+ if (self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER):
+ assert (self.dynamic_loss_scale()), "DeepSpeed {} optimizer requires dynamic loss scaling".format(
+ self.optimizer_name())
+
+ # Detect invalid combinations of client optimizer and client scheduler
+ if isinstance(self.client_lr_scheduler, _LRScheduler):
+ assert isinstance(self.client_optimizer, Optimizer), \
+                f'Client Optimizer (type = {type(self.client_optimizer)}) is not instantiated but Client LR Scheduler is instantiated'
+
+ def _broadcast_model(self):
+
+ def is_replicated(p):
+ if hasattr(p, "ds_status") and p.ds_status is not ZeroParamStatus.AVAILABLE:
+ return False
+ return True
+
+ for p in self.module.parameters():
+ # Broadcast the model for different parameters
+ if is_moe_param(p):
+ if torch.is_tensor(p) and is_replicated(p):
+ dist.broadcast(p,
+ groups._get_expert_broadcast_src_rank(p.group_name),
+ group=self.expert_data_parallel_group[p.group_name])
+ else:
+ if torch.is_tensor(p) and is_replicated(p):
+ dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
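+
+    # Broadcast layout: MoE (expert) params are broadcast within their expert
+    # data-parallel group from the expert source rank; dense params are
+    # broadcast over the sequence-data-parallel group. ZeRO-3 partitioned
+    # (non-AVAILABLE) params are skipped.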
+
+ @staticmethod
+ def __check_params(model: Module, dtype: torch.dtype) -> None:
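+        # NOTE: the early return below appears to deliberately disable the
+        # dtype validation in this adaptor; the body is kept for reference.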
+ return
+ if not all(param.dtype == dtype for param in model.parameters()) and dist.get_rank() == 0:
+ raise ValueError(f"{dtype} is enabled but the following parameters have dtype that is "
+ f"not {dtype}: "
+ f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}")
+
+ def _set_client_model(self, model):
+ # register client model in _modules so that nn.module methods work correctly
+ modules = self.__dict__.get('_modules')
+ modules['module'] = model
+ # register module attribute in engine but avoid getattr
+ self.__dict__['module'] = model
+
+ def _configure_distributed_model(self, model):
+ self._set_client_model(model)
+ is_zero_init_model = self.zero_optimization_partition_weights() and any(
+ [hasattr(param, "ds_id") for param in self.module.parameters()])
+
+ if self.fp16_enabled():
+ if is_zero_init_model:
+ self.__check_params(self.module, torch.half)
+ self.module.half()
+ elif self.bfloat16_enabled():
+ if is_zero_init_model:
+ self.__check_params(self.module, torch.bfloat16)
+ self.module.bfloat16()
+ else:
+ self.__check_params(self.module, torch.float)
+
+ # zero.Init() handles device placement of model
+ if not (self.dont_change_device or is_zero_init_model):
+ self.module.to(self.device)
+
+ # MoE related initialization
+ for _, module in self.module.named_modules():
+ if isinstance(module, MoE):
+ self.has_moe_layers = True
+ self.num_experts.append(module.num_experts)
+
+ if self.has_moe_layers:
+ for _, module in self.module.named_modules():
+ if isinstance(module, TopKGate):
+ self.gate_modules.append(module)
+ if self.wall_clock_breakdown():
+ module.wall_clock_breakdown = True
+ if isinstance(module, MOELayer):
+ self.moe_layers.append(module)
+ if self.wall_clock_breakdown():
+ module.wall_clock_breakdown = True
+
+ # Pass the mpu from here to groups. For subsequent use, just query groups
+ if self.mpu is not None:
+ groups.mpu = self.mpu
+
+ # Set deepspeed parallelism spec. for the model including expert parallelism
+ for _, module in self.module.named_modules():
+ if hasattr(module, 'set_deepspeed_parallelism'):
+ module.set_deepspeed_parallelism(self._config.use_data_before_expert_parallel_)
+
+ # Query the groups module to get information about various parallel groups
+ self.local_all_to_all_group = None
+ if self.zero_quantized_gradients():
+ log_dist("Using quantized gradients", ranks=[0])
+ self.local_all_to_all_group = groups._get_local_all_to_all_group()
+ self.data_parallel_group = groups._get_data_parallel_group()
+ self.dp_world_size = groups._get_data_parallel_world_size()
+ self.zp_world_size = zp_manager.zp_size
+ self.seq_data_parallel_group = groups._get_sequence_data_parallel_group()
+ self.seq_dp_world_size = groups._get_sequence_data_parallel_world_size()
+ self.mp_world_size = groups._get_model_parallel_world_size()
+ self.expert_parallel_group = groups._get_expert_parallel_group_dict()
+ self.expert_data_parallel_group = groups._get_expert_data_parallel_group_dict()
+ self.sequence_parallel_size = groups._get_sequence_parallel_world_size()
+ if self.sequence_parallel_size > 1:
+ self.communication_data_type = self._config.seq_parallel_communication_data_type
+
+ if not (self.amp_enabled() or is_zero_init_model):
+ self._broadcast_model()
+
+ # check if parameters are duplicated in optimizer param_groups
+ def _check_for_duplicates(self, optimizer):
+ for name, param in self.module.named_parameters():
+ param_id = id(param)
+
+ def ids_list(group):
+ return [id(param) for param in group]
+
+ occurrence = sum([
+ ids_list(group['params']).count(param_id) if param_id in ids_list(group['params']) else 0
+ for group in optimizer.param_groups
+ ])
+ assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behavior."
+
+ def _do_optimizer_sanity_check(self, basic_optimizer):
+ model_dtype, grad_accum_dtype = self.get_data_types()
+ zero_enabled = self.zero_optimization()
+ amp_enabled = self.amp_enabled()
+ # config based assertions
+ assert (
+ not (amp_enabled and zero_enabled)
+        ), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similarly to amp opt_mode=O2"
+ if zero_enabled:
+ if not is_zero_supported_optimizer(basic_optimizer):
+ assert (
+ self.zero_allow_untested_optimizer()
+ ), 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
+
+ if self.global_rank == 0:
+ logger.warning("**** You are using ZeRO with an untested optimizer, proceed with caution *****")
+ if model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32 and self.zero_optimization_stage(
+ ) == 1 and not self.zero_cpu_offload():
+ return BFLOAT16
+ return ZERO_OPTIMIZATION
+ elif amp_enabled:
+ if model_dtype != grad_accum_dtype:
+ raise NotImplementedError(
+ "Model data type and gradient accumulation data type must be equal to use Amp")
+ if model_dtype == torch.bfloat16 or model_dtype == torch.float16:
+                raise NotImplementedError("Cannot enable amp together with (legacy) fp16 or bfloat16 mode")
+ try:
+ logger.info("Initializing Apex amp from: {}".format(amp.__path__))
+ except NameError:
+ # If apex/amp is available it will be imported above
+ raise RuntimeError("Unable to import apex/amp, please make sure it is installed")
+ return AMP
+ # data type checks
+ elif model_dtype == grad_accum_dtype:
+ if model_dtype == torch.bfloat16:
+ if self.pipeline_parallelism:
+ logger.warning(
+                        "**** BF16 gradient accumulation is not numerically safe with a large number of accumulation steps, proceed with caution *****"
+ )
+ return BFLOAT16
+ else:
+ raise NotImplementedError(
+                    "Bfloat16 wrapper must use a gradient accumulation type of fp32; enable ZeRO to use bfloat16 gradient accumulation"
+ )
+ if model_dtype == torch.float16:
+ return FP16
+ # else optimizer_wrapper = None
+ elif model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32:
+ return BFLOAT16
+ else:
+ raise NotImplementedError("unsupported mix of model dtype and gradient accumulation type")
+
+ return None
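+
+    # Decision summary (mirrors the branches above): ZeRO -> ZERO_OPTIMIZATION
+    # (or BFLOAT16 for a bf16 model with fp32 accumulation at stage 1 without
+    # CPU offload); amp -> AMP; fp16 -> FP16; bf16 model with fp32
+    # accumulation -> BFLOAT16; fp32 -> no wrapper (None).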
+
+ # Configure optimizer
+ def _configure_optimizer(self, client_optimizer, model_parameters):
+ if client_optimizer is None:
+ basic_optimizer = self._configure_basic_optimizer(model_parameters)
+ log_dist(f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", ranks=[0])
+ else:
+ if isinstance(client_optimizer, tuple(self._supported_optims())):
+ basic_optimizer = client_optimizer
+ log_dist('Using client Optimizer as basic optimizer', ranks=[0])
+ else:
+ basic_optimizer = client_optimizer(model_parameters)
+ log_dist('Using client callable to create basic optimizer', ranks=[0])
+
+ if self.zero_use_cpu_optimizer() and not isinstance(basic_optimizer, deepspeed.ops.adam.DeepSpeedCPUAdam):
+ if self.zero_force_ds_cpu_optimizer():
+ msg = f'You are using ZeRO-Offload with a client provided optimizer ({type(basic_optimizer)}) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.'
+ raise ZeRORuntimeException(msg)
+
+ basic_optimizer.param_groups[:] = [pg for pg in basic_optimizer.param_groups if len(pg["params"]) != 0]
+ log_dist("Removing param_group that has no 'params' in the basic Optimizer", ranks=[0])
+
+ self._check_for_duplicates(basic_optimizer)
+
+ self.basic_optimizer = basic_optimizer
+ log_dist("DeepSpeed Basic Optimizer = {}".format(basic_optimizer.__class__.__name__), ranks=[0])
+
+ optimizer_wrapper = self._do_optimizer_sanity_check(basic_optimizer)
+
+ if optimizer_wrapper == ZERO_OPTIMIZATION:
+ self.optimizer = self._configure_zero_optimizer(basic_optimizer)
+ elif optimizer_wrapper == AMP:
+ amp_params = self.amp_params()
+ log_dist(f"Initializing AMP with these params: {amp_params}", ranks=[0])
+ model, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params)
+ self._set_client_model(model)
+ self._broadcast_model()
+ # TODO: maybe need to broadcast experts differently?
+ elif optimizer_wrapper == FP16:
+ self.optimizer = self._configure_fp16_optimizer(basic_optimizer)
+ elif optimizer_wrapper == BFLOAT16:
+ self.optimizer = self._configure_bf16_optimizer(basic_optimizer)
+ else:
+ self.optimizer = basic_optimizer
+
+ log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0])
+
+ self.compression_scheduler = self._configure_compression_scheduler()
+ self.quantizer = self._configure_quantization()
+
+ def _configure_basic_optimizer(self, model_parameters):
+ optimizer_parameters = self.optimizer_params()
+ if optimizer_parameters is None:
+ optimizer_parameters = {}
+ # print(optimizer_parameters.keys())
+        if "max_grad_norm" in optimizer_parameters:
+ raise ValueError(
+ "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
+ )
+
+ if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]:
+ torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
+ adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT)
+
+        # The AdamW optimizer name always selects AdamW logic; plain Adam also
+        # defaults to AdamW logic unless adam_w_mode is explicitly set to False
+ effective_adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER or adam_w_mode
+
+ if torch_adam:
+ if not effective_adam_w_mode:
+ optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters)
+ else:
+ optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters)
+ else:
+ if self.zero_use_cpu_optimizer():
+ from deepspeed.ops.adam import DeepSpeedCPUAdam
+ optimizer = DeepSpeedCPUAdam(model_parameters,
+ **optimizer_parameters,
+ adamw_mode=effective_adam_w_mode)
+ else:
+ from deepspeed.ops.adam import FusedAdam
+
+ optimizer = FusedAdam(
+ model_parameters,
+ **optimizer_parameters,
+ adam_w_mode=effective_adam_w_mode,
+ )
+
+ elif self.optimizer_name() == ADAGRAD_OPTIMIZER:
+ if self.zero_use_cpu_optimizer():
+ from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad
+ optimizer = DeepSpeedCPUAdagrad(model_parameters, **optimizer_parameters)
+ else:
+ optimizer = torch.optim.Adagrad(model_parameters, **optimizer_parameters)
+ elif self.optimizer_name() == LAMB_OPTIMIZER:
+ from deepspeed.ops.lamb import FusedLamb
+
+ optimizer = FusedLamb(model_parameters, **optimizer_parameters)
+ elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER:
+ assert not self.zero_optimization(), "1bit-Adam is not compatible with ZeRO"
+ from deepspeed.runtime.fp16.onebit.adam import OnebitAdam
+
+ optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters)
+ if not self.fp16_enabled():
+ logger.warning(f"Currently the convergence of 1-bit Adam is only verified under FP16")
+ elif self.optimizer_name() == ZERO_ONE_ADAM_OPTIMIZER:
+ assert not self.zero_optimization(), "0/1 Adam is not compatible with ZeRO"
+ from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam
+
+ optimizer = ZeroOneAdam(model_parameters, self, **optimizer_parameters)
+ if not self.fp16_enabled():
+ logger.warning(f'Currently the convergence of 0/1 Adam is only verified under FP16')
+ elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER:
+ assert not self.zero_optimization(), "1bit-Lamb is not compatible with ZeRO"
+ from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb
+
+ optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters)
+ if not self.fp16_enabled():
+ logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16")
+ elif self.optimizer_name() == LION_OPTIMIZER:
+ if self.zero_use_cpu_optimizer():
+ from deepspeed.ops.lion import DeepSpeedCPULion
+ optimizer = DeepSpeedCPULion(model_parameters, **optimizer_parameters)
+ else:
+ from deepspeed.ops.lion import FusedLion
+ optimizer = FusedLion(model_parameters, **optimizer_parameters)
+ elif self.optimizer_name() == MUADAM_OPTIMIZER:
+            try:
+                from mup import MuAdam
+            except ImportError:
+                logger.error("Install mup to use MuAdam optimizer")
+                raise
+            optimizer = MuAdam(model_parameters, **optimizer_parameters)
+ elif self.optimizer_name() == MUADAMW_OPTIMIZER:
+            try:
+                from mup import MuAdamW
+            except ImportError:
+                logger.error("Install mup to use MuAdamW optimizer")
+                raise
+            optimizer = MuAdamW(model_parameters, **optimizer_parameters)
+ elif self.optimizer_name() == MUSGD_OPTIMIZER:
+            try:
+                from mup import MuSGD
+            except ImportError:
+                logger.error("Install mup to use MuSGD optimizer")
+                raise
+            optimizer = MuSGD(model_parameters, **optimizer_parameters)
+ else:
+ torch_optimizer = getattr(torch.optim, self.optimizer_name())
+ optimizer = torch_optimizer(model_parameters, **optimizer_parameters)
+ return optimizer
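+
+    # Example "optimizer" section of a ds_config consumed above (illustrative):
+    #   "optimizer": {"type": "AdamW",
+    #                 "params": {"lr": 1e-4, "betas": [0.9, 0.999],
+    #                            "eps": 1e-8, "weight_decay": 0.01}}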
+
+ def _configure_compression_scheduler(self):
+ return compression_scheduler(self.module, self._config.compression_config)
+
+ def _configure_random_ltd_scheduler(self, configs):
+ return RandomLTDScheduler(configs)
+
+ def _configure_quantization(self):
+ (
+ quantize_weight_in_forward,
+ quantize_enabled,
+ q_groups,
+ q_mixed_fp16,
+ q_change_ratio,
+ q_type,
+ q_rounding,
+ q_verbose,
+ use_quantizer_kernel,
+ ) = self.quantize_training()
+ if quantize_enabled and not quantize_weight_in_forward:
+ assert self.fp16_enabled(
+ ), "MoQ (quantize in optimization step) weight quantization is only supported for FP16"
+ quantizer = None
+ if quantize_enabled and not quantize_weight_in_forward:
+ from deepspeed.runtime.quantize import Quantizer
+
+ quantizer = Quantizer(
+ q_groups,
+ q_mixed_fp16,
+ q_change_ratio,
+ q_type,
+ q_rounding,
+ q_verbose,
+ self.eigenvalue_enabled(),
+ use_quantizer_kernel,
+ self.eigenvalue_layer_num() if self.eigenvalue_enabled() else 0,
+ )
+ return quantizer
+
+ def _configure_fp16_optimizer(self, optimizer):
+ initial_dynamic_scale = self.initial_dynamic_scale()
+ dynamic_loss_args = self.dynamic_loss_scale_args()
+ clip_grad = self.gradient_clipping()
+ if APEX_INSTALLED:
+ fused_opts = (apex.optimizers.FusedAdam, FusedAdam)
+ else:
+ fused_opts = FusedAdam
+ if isinstance(optimizer, fused_opts) \
+ or self.optimizer_name() in [ONEBIT_ADAM_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER]:
+ if self.dynamic_loss_scale():
+ log_dist(f'Creating fp16 optimizer with dynamic loss scale', ranks=[0])
+ timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
+ optimizer = FP16_Optimizer(
+ optimizer,
+ deepspeed=self,
+ dynamic_loss_scale=True,
+ initial_dynamic_scale=initial_dynamic_scale,
+ dynamic_loss_args=dynamic_loss_args,
+ mpu=self.mpu,
+ clip_grad=clip_grad,
+ fused_adam_legacy=self.optimizer_legacy_fusion(),
+ timers=timers,
+ has_moe_layers=self.has_moe_layers,
+ )
+ else:
+ log_dist(f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}', ranks=[0])
+ optimizer = FP16_Optimizer(
+ optimizer,
+ deepspeed=self,
+ static_loss_scale=self.loss_scale(),
+ mpu=self.mpu,
+ clip_grad=clip_grad,
+ fused_adam_legacy=self.optimizer_legacy_fusion(),
+ has_moe_layers=self.has_moe_layers,
+ )
+ else:
+ log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0])
+ optimizer = FP16_UnfusedOptimizer(
+ optimizer,
+ deepspeed=self,
+ static_loss_scale=self.loss_scale(),
+ dynamic_loss_scale=self.dynamic_loss_scale(),
+ dynamic_loss_args=dynamic_loss_args,
+ mpu=self.mpu,
+ clip_grad=clip_grad,
+ fused_lamb_legacy=self.optimizer_name() == LAMB_OPTIMIZER,
+ )
+
+ return optimizer
+
+ def _configure_bf16_optimizer(self, optimizer):
+ clip_grad = self.gradient_clipping()
+
+ if optimizer is None:
+ optimizer = DummyOptim(list(self.module.parameters()))
+
+ log_dist('Creating BF16 optimizer', ranks=[0])
+
+ timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
+ optimizer = BF16_Optimizer(optimizer,
+ self.param_names,
+ mpu=self.mpu,
+ clip_grad=clip_grad,
+ allgather_bucket_size=self.zero_allgather_bucket_size(),
+ dp_process_group=self.seq_data_parallel_group,
+ timers=timers,
+ grad_acc_dtype=self.get_data_types()[1],
+ graph_harvesting=self.graph_harvesting())
+
+ return optimizer
+
+ def _configure_zero_optimizer(self, optimizer):
+ zero_stage = self.zero_optimization_stage()
+
+ mics_shard_size = self.mics_shard_size()
+ model_dtype, gradient_accumulation_dtype = self.get_data_types()
+
+ timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
+
+ if optimizer is None:
+ optimizer = DummyOptim(list(self.module.parameters()))
+
+ if self.zero_legacy_stage1():
+ raise Exception(
+ "The deprecated version of ZeRO Stage 1 is not supported in deepspeed >= 0.5.9. Please downgrade to a version less than 0.5.9 if you need to use this deprecated version of ZeRO."
+ )
+
+ if zero_stage <= ZeroStageEnum.gradients:
+ overlap_comm = self.zero_overlap_comm()
+ contiguous_gradients = self.zero_contiguous_gradients()
+ round_robin_gradients = self.zero_round_robin_gradients()
+ assert not isinstance(optimizer, DummyOptim), "zero stage {} requires an optimizer".format(zero_stage)
+
+ log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0])
+ # Overlap and contiguous grads are meaningless in stage 1 and are ignored
+ if zero_stage == ZeroStageEnum.optimizer_states:
+ overlap_comm = False
+ round_robin_gradients = False
+ # Without MoE layers, contiguous gradients must be disabled for stage 1
+ if not self.has_moe_layers:
+ contiguous_gradients = False
+
+ if isinstance(self.module, PipelineModule):
+ if overlap_comm:
+ logger.warning("Pipeline parallelism does not support overlapped communication, will be disabled.")
+ overlap_comm = False
+ optimizer = DeepSpeedZeroOptimizer(
+ optimizer,
+ self.param_names,
+ timers=timers,
+ static_loss_scale=self.loss_scale(),
+ dynamic_loss_scale=self.dynamic_loss_scale(),
+ dynamic_loss_args=self.dynamic_loss_scale_args(),
+ clip_grad=self.gradient_clipping(),
+ contiguous_gradients=contiguous_gradients,
+ reduce_bucket_size=self.zero_reduce_bucket_size(),
+ use_multi_rank_bucket_allreduce=self.zero_multi_rank_bucket_allreduce(),
+ allgather_bucket_size=self.zero_allgather_bucket_size(),
+ dp_process_group=self.seq_data_parallel_group,
+ expert_parallel_group=self.expert_parallel_group if self.has_moe_layers else None,
+ expert_data_parallel_group=self.expert_data_parallel_group if self.has_moe_layers else None,
+ reduce_scatter=self.zero_reduce_scatter(),
+ overlap_comm=overlap_comm,
+ offload_optimizer_config=self.zero_offload_optimizer(),
+ mpu=self.mpu,
+ postscale_gradients=self.postscale_gradients(),
+ gradient_predivide_factor=self.gradient_predivide_factor(),
+ gradient_accumulation_steps=self.gradient_accumulation_steps(),
+ ignore_unused_parameters=self.zero_ignore_unused_parameters(),
+ partition_grads=zero_stage == ZeroStageEnum.gradients,
+ round_robin_gradients=round_robin_gradients,
+ has_moe_layers=self.has_moe_layers,
+ fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients(),
+ gradient_accumulation_dtype=gradient_accumulation_dtype,
+ communication_data_type=self.communication_data_type,
+ elastic_checkpoint=self.zero_elastic_checkpoint())
+
+ elif zero_stage == ZeroStageEnum.weights:
+ assert not self.has_moe_layers, "MoE not supported with Stage 3"
+ if isinstance(optimizer, DummyOptim):
+ log_dist("Creating ZeRO Offload", ranks=[0])
+ zero_param_parallel_group = groups._get_zero_param_intra_parallel_group()
+ if self.zero_hpz_partition_size() > 1 and zero_param_parallel_group is None:
+ self._set_zero_group_parallelism()
+ zero_param_parallel_group = groups._get_zero_param_intra_parallel_group()
+ optimizer = DeepSpeedZeRoOffload(
+ self.module,
+ timers=timers,
+ ds_config=self.config,
+ overlap_comm=self.zero_overlap_comm(),
+ prefetch_bucket_size=self.zero_prefetch_bucket_size(),
+ max_reuse_distance=self.zero_max_reuse_distance(),
+ max_live_parameters=self.zero_max_live_parameters(),
+ param_persistence_threshold=self.zero_param_persistence_threshold(),
+ model_persistence_threshold=self.zero_model_persistence_threshold(),
+ offload_param_config=self.zero_offload_param(),
+ mpu=self.mpu,
+ zero_param_parallel_group=zero_param_parallel_group,
+ zero_quantized_weights=self.zero_quantized_weights(),
+ zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(),
+ )
+ else:
+ log_dist(
+ f'Creating fp16 ZeRO stage {zero_stage} optimizer,'
+ f' MiCS is enabled {mics_shard_size>0},'
+ f' Hierarchical params gather {self._config.mics_hierarchial_params_gather}',
+ ranks=[0])
+ if mics_shard_size > 0:
+ return self._return_mics_optimizer(optimizer, timers)
+
+ log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0])
+ from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
+ optimizer = DeepSpeedZeroOptimizer_Stage3(
+ self.module,
+ optimizer,
+ timers=timers,
+ ds_config=self.config,
+ static_loss_scale=self.loss_scale(),
+ dynamic_loss_scale=self.dynamic_loss_scale(),
+ dynamic_loss_args=self.dynamic_loss_scale_args(),
+ clip_grad=self.gradient_clipping(),
+ contiguous_gradients=self.zero_contiguous_gradients(),
+ reduce_bucket_size=self.zero_reduce_bucket_size(),
+ prefetch_bucket_size=self.zero_prefetch_bucket_size(),
+ max_reuse_distance=self.zero_max_reuse_distance(),
+ max_live_parameters=self.zero_max_live_parameters(),
+ param_persistence_threshold=self.zero_param_persistence_threshold(),
+ model_persistence_threshold=self.zero_model_persistence_threshold(),
+ dp_process_group=self.seq_data_parallel_group,
+ all2all_process_group=self.local_all_to_all_group,
+ reduce_scatter=self.zero_reduce_scatter(),
+ overlap_comm=self.zero_overlap_comm(),
+ offload_optimizer_config=self.zero_offload_optimizer(),
+ offload_param_config=self.zero_offload_param(),
+ sub_group_size=self.zero_sub_group_size(),
+ offload_ratio=self.zero_partial_offload(),
+ mpu=self.mpu,
+ postscale_gradients=self.postscale_gradients(),
+ gradient_predivide_factor=self.gradient_predivide_factor(),
+ gradient_accumulation_steps=self.gradient_accumulation_steps(),
+ aio_config=self.aio_config(),
+ gradient_accumulation_dtype=gradient_accumulation_dtype,
+ communication_data_type=self.communication_data_type,
+ zero_hpz_partition_size=self.zero_hpz_partition_size(),
+ zero_quantized_weights=self.zero_quantized_weights(),
+ zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(),
+ )
+
+ else:
+ raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage))
+
+ return optimizer
+
+ def _return_mics_optimizer(self, basic_optimizer, timers):
+ from deepspeed.runtime.zero.mics import MiCS_Optimizer
+ model_dtype, gradient_accumulation_dtype = self.get_data_types()
+ optimizer = MiCS_Optimizer(self.module,
+ basic_optimizer,
+ timers=timers,
+ ds_config=self.config,
+ static_loss_scale=self.loss_scale(),
+ dynamic_loss_scale=self.dynamic_loss_scale(),
+ dynamic_loss_args=self.dynamic_loss_scale_args(),
+ clip_grad=self.gradient_clipping(),
+ contiguous_gradients=self.zero_contiguous_gradients(),
+ reduce_bucket_size=self.zero_reduce_bucket_size(),
+ prefetch_bucket_size=self.zero_prefetch_bucket_size(),
+ max_reuse_distance=self.zero_max_reuse_distance(),
+ max_live_parameters=self.zero_max_live_parameters(),
+ param_persistence_threshold=self.zero_param_persistence_threshold(),
+ model_persistence_threshold=self.zero_model_persistence_threshold(),
+ dp_process_group=self.seq_data_parallel_group,
+ reduce_scatter=self.zero_reduce_scatter(),
+ overlap_comm=self.zero_overlap_comm(),
+ offload_optimizer_config=self.zero_offload_optimizer(),
+ offload_param_config=self.zero_offload_param(),
+ sub_group_size=self.zero_sub_group_size(),
+ mpu=self.mpu,
+ postscale_gradients=self.postscale_gradients(),
+ gradient_predivide_factor=self.gradient_predivide_factor(),
+ gradient_accumulation_steps=self.gradient_accumulation_steps(),
+ aio_config=self.aio_config(),
+ gradient_accumulation_dtype=gradient_accumulation_dtype,
+ communication_data_type=self.communication_data_type)
+ return optimizer
+
+ def _configure_eigenvalue(self):
+ eigenvalue = Eigenvalue(
+ verbose=self.eigenvalue_verbose(),
+ max_iter=self.eigenvalue_max_iter(),
+ tol=self.eigenvalue_tol(),
+ stability=self.eigenvalue_stability(),
+ gas_boundary_resolution=self.eigenvalue_gas_boundary_resolution(),
+ layer_name=self.eigenvalue_layer_name(),
+ layer_num=self.eigenvalue_layer_num(),
+ )
+
+ return eigenvalue
+
+ def _configure_progressive_layer_drop(self):
+ pld = ProgressiveLayerDrop(theta=self.pld_theta(), gamma=self.pld_gamma())
+
+ return pld
+
+ def _configure_curriculum_scheduler_legacy(self):
+ scheduler = CurriculumScheduler(self.curriculum_params_legacy())
+ return scheduler
+
+ @staticmethod
+ def is_map_style_dataset(obj):
+ return hasattr(obj, "__getitem__") and hasattr(obj, "__len__")
+
+ @staticmethod
+ def is_iterable_style_dataset(obj):
+ return isinstance(obj, torch.utils.data.IterableDataset) # hasattr(obj, "__iter__") should work as well
+
+ def dataloader_drop_last(self):
+ return self._config.dataloader_drop_last
+
+ def was_step_applied(self) -> bool:
+ """Returns True if the latest ``step()`` produced in parameter updates.
+ Note that a ``False`` return is not an error condition. Steps are frequently
+ no-ops, such as between gradient accumulation boundaries or when overflows
+ occur.
+ Returns:
+ bool: Whether the latest ``step()`` modified model parameters.
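+
+ A minimal sketch of one common use, counting skipped steps (assumes an initialized ``engine``):
+
+ .. code-block:: python
+
+ engine.backward(loss)
+ engine.step()
+ if not engine.was_step_applied():
+ skipped += 1 # e.g. loss-scale overflow or a mid-accumulation micro step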
+ """
+ return self._step_applied
+
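+ # Usage sketch for deepspeed_io() below (``my_dataset`` is an assumed torch Dataset,
+ # not part of this file):
+ #
+ # loader = engine.deepspeed_io(my_dataset) # batch size defaults to
+ # for micro_batch in loader: # train_micro_batch_size_per_gpu()
+ # ...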
+ def deepspeed_io(self,
+ dataset,
+ batch_size=None,
+ route=ROUTE_TRAIN,
+ pin_memory=True,
+ data_sampler=None,
+ collate_fn=None,
+ num_local_io_workers=None):
+ if not (self.is_map_style_dataset(dataset) or self.is_iterable_style_dataset(dataset)):
+ raise ValueError("Training data must be a torch Dataset")
+
+ if batch_size is None:
+ batch_size = self.train_micro_batch_size_per_gpu()
+
+ if collate_fn is None:
+ collate_fn = self.collate_fn
+
+ # Currently we only use timer in train route
+ deepspeed_io_timer = None
+ if route == ROUTE_TRAIN:
+ deepspeed_io_timer = self.tput_timer
+
+ # If mpu is provided, forward world size and parallel rank to sampler.
+ data_parallel_world_size = self.dp_world_size
+ data_parallel_rank = self.global_rank
+ if self.mpu is not None:
+ data_parallel_world_size = self.mpu.get_data_parallel_world_size()
+ data_parallel_rank = self.mpu.get_data_parallel_rank()
+
+ if data_sampler is None and (route == ROUTE_PREDICT or route == ROUTE_EVAL):
+ data_sampler = torch.utils.data.DistributedSampler(
+ dataset,
+ num_replicas=data_parallel_world_size,
+ rank=data_parallel_rank,
+ shuffle=False,
+ )
+
+ deepspeed_dataloader_config = {}
+ if self.curriculum_learning_enabled():
+ deepspeed_dataloader_config = {
+ CURRICULUM_LEARNING: self.curriculum_learning_enabled(),
+ DATA_EFFICIENCY: self.data_efficiency_config(),
+ DATA_PARALLEL_GROUP: self.data_parallel_group,
+ GRADIENT_ACCUMULATION_STEPS: self.gradient_accumulation_steps(),
+ GLOBAL_RANK: self.global_rank,
+ DATA_SAMPLING_NUM_WORKERS: self.data_sampling_config()[DATA_SAMPLING_NUM_WORKERS]
+ }
+
+ return DeepSpeedDataLoader(dataset=dataset,
+ batch_size=batch_size,
+ pin_memory=pin_memory,
+ collate_fn=collate_fn,
+ local_rank=self.local_rank,
+ tput_timer=deepspeed_io_timer,
+ num_local_io_workers=num_local_io_workers,
+ data_sampler=data_sampler,
+ data_parallel_world_size=data_parallel_world_size,
+ data_parallel_rank=data_parallel_rank,
+ dataloader_drop_last=self.dataloader_drop_last(),
+ deepspeed_dataloader_config=deepspeed_dataloader_config)
+
+ def train(self, mode=True):
+ r""""""
+
+ self.warn_unscaled_loss = True
+ self.module.train(mode)
+
+ def eval(self):
+ r""""""
+
+ self.warn_unscaled_loss = True
+ self.module.train(False)
+
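+ # Worked example for _scale_loss_by_gas(): with gradient_accumulation_steps() == 4,
+ # a micro-batch loss of 2.0 is scaled to 0.5, so the four accumulated backward
+ # passes contribute the same gradient magnitude as one full-batch pass.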
+ def _scale_loss_by_gas(self, prescaled_loss):
+ if isinstance(prescaled_loss, torch.Tensor):
+ scaled_loss = prescaled_loss / self.gradient_accumulation_steps()
+ elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list):
+ scaled_loss = []
+ for l in prescaled_loss:
+ if isinstance(l, torch.Tensor):
+ scaled_loss.append(l / self.gradient_accumulation_steps())
+ else:
+ scaled_loss.append(l)
+ else:
+ scaled_loss = prescaled_loss
+ if self.warn_unscaled_loss:
+ logger.warning(f"DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}")
+ self.warn_unscaled_loss = False
+
+ return scaled_loss
+
+ @instrument_w_nvtx
+ def forward(self, *inputs, **kwargs):
+ r"""Execute forward propagation
+ Arguments:
+ *inputs: Variable length input list
+ **kwargs: variable length keyword arguments
+ """
+
+ if self.autotuning_profile_model_info():
+ ma = get_ma_status()
+ else:
+ see_memory_usage("Engine before forward", force=self.memory_breakdown())
+
+ flops_profiler_active = (self.flops_profiler_enabled()
+ and self.global_steps == self.flops_profiler_profile_step() and self.global_rank == 0)
+
+ # Apply the compression schedule check so that quantization happens at step 0.
+ if self.global_steps == 0 and hasattr(self, "compression_scheduler"):
+ self.compression_scheduler.step(step_zero_check=True)
+ if self.quantizer:
+ tensor_to_quantize = self.optimizer.bit16_groups if self.zero_optimization_stage(
+ ) == 2 else self.optimizer.fp16_groups
+ if self.compression_scheduler.weight_quantization_enabled:
+ self.quantizer.quantize(
+ tensor_to_quantize,
+ (self.optimizer.overflow if self.fp16_enabled() else False),
+ self.eigenvalue_enabled(),
+ None,
+ )
+
+ if flops_profiler_active:
+ self.flops_profiler.start_profile(ignore_list=None)
+
+ if self.module.training:
+ if self.progressive_layer_drop:
+ kwargs.update(self.progressive_layer_drop.get_state())
+
+ if self.__class__.__name__ != "PipelineEngine":
+ # TODO: The above if condition is a HACK since for PipelineEngine
+ # it is difficult to inject arguments into the forward pass.
+ if self.module.training and self.curriculum_enabled_legacy():
+ self.curriculum_scheduler_legacy.update_difficulty(self.global_steps + 1)
+ if self.curriculum_params_legacy()["curriculum_type"] == "seqlen":
+ kwargs.update({"curriculum_seqlen": self.curriculum_scheduler_legacy.get_current_difficulty()})
+
+ if self.module.training and self.random_ltd_enabled():
+ self.random_ltd_scheduler.update_seq(self.global_steps)
+
+ if self.zero_optimization_partition_weights():
+ # Enable automated discovery of external parameters by indicating that
+ # we are in a forward pass.
+ for module in self.module.modules():
+ module._parameters._in_forward = True
+
+ self._start_timers(self.engine_timers.forward_timers)
+
+ if self.training_dataloader is None:
+ self.tput_timer.start()
+
+ if self.fp16_auto_cast():
+ inputs = self._cast_inputs_half(inputs)
+ # print(f"RANK[{self.global_rank}] self.fp16_auto_cast() is {self.fp16_auto_cast()}")
+
+ loss = self.module(*inputs, **kwargs)
+
+ # print(f"RANK[{self.global_rank}]'s loss is {loss}")
+
+ if self.zero_optimization_partition_weights():
+ # Disable automated discovery of external parameters
+ for module in self.module.modules():
+ module._parameters._in_forward = False
+
+ self._stop_timers(self.engine_timers.forward_timers)
+
+ if flops_profiler_active:
+ self.flops_profiler.stop_profile()
+
+ if self.autotuning_profile_model_info():
+ activation_mem = get_ma_status() - ma
+ self.autotuning_model_info["activation_mem_per_gpu"] = activation_mem
+ print_json_dist(self.autotuning_model_info, [0], path=self.autotuning_model_info_path())
+ exit()
+ else:
+ see_memory_usage("Engine after forward", force=self.memory_breakdown())
+ return loss
+
+ def _cast_inputs_half(self, inputs):
+ if isinstance(inputs, (list, tuple)):
+ new_inputs = []
+ for v in inputs:
+ new_inputs.append(self._cast_inputs_half(v))
+ return inputs.__class__(new_inputs)
+ elif isinstance(inputs, dict):
+ new_inputs = {}
+ for k, v in inputs.items():
+ new_inputs[k] = self._cast_inputs_half(v)
+ return new_inputs
+ elif hasattr(inputs, 'half'):
+ return inputs.half()
+ else:
+ return inputs
+
+ def print_forward_breakdown(self, fwd_time):
+ gate_time = 0.0
+ moe_time = 0.0
+ falltoall = 0.0
+ salltoall = 0.0
+
+ for gate in self.gate_modules:
+ #logger.info(f"Individual TopK gate time: {gate.gate_time:.2f} ms")
+ gate_time += gate.gate_time
+
+ for l in self.moe_layers:
+ #logger.info(f"MoE layer; total: {l.time_moe:.2f} ms, first alltoall: {l.time_falltoall:.2f}, second alltoall: {l.time_salltoall:.2f}")
+ moe_time += l.time_moe
+ falltoall += l.time_falltoall
+ salltoall += l.time_salltoall
+
+ # TODO: Allreduce/average them across ranks for more accurate timing.
+
+ # if deepspeed.comm.get_rank() == 0:
+ log_dist(
+ f"time (ms) | fwd: {fwd_time:.2f} (fwd_moe: {moe_time:.2f}, 1st_a2a: {falltoall:.2f}, 2nd_a2a: {salltoall:.2f}, top_k: {gate_time:.2f})",
+ ranks=[0])
+
+ @instrument_w_nvtx
+ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
+ assert not (self.bfloat16_enabled() and self.pipeline_parallelism), \
+ 'allreduce_gradients() is not valid when bfloat16 + pipeline parallelism is enabled'
+
+ # Pass (PP) gas boundary flag to optimizer (required for zero)
+ self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary()
+ # ZeRO stage >= 2 communicates during non gradient accumulation boundaries as well
+ if self.zero_optimization_partition_gradients():
+ self.optimizer.overlapping_partition_gradients_reduce_epilogue()
+
+ # Communicate only at gradient accumulation boundaries
+ elif self.is_gradient_accumulation_boundary():
+ if self.zero_optimization_stage() == ZeroStageEnum.optimizer_states and hasattr(
+ self.optimizer, 'reduce_gradients'):
+ self.optimizer.reduce_gradients(pipeline_parallel=self.pipeline_parallelism)
+ else:
+ self.buffered_allreduce_fallback(elements_per_buffer=bucket_size)
+
+ @instrument_w_nvtx
+ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_graph=False, scale_wrt_gas=True):
+ r"""Execute backward pass on the loss
+ Arguments:
+ loss: Torch tensor on which to execute backward propagation
+ allreduce_gradients: deprecated, ignored, and will soon be removed
+ retain_graph: bool, default: False
+ forwarded to ``loss.backward(retain_graph=...)`` to control graph retention
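+
+ A typical training-loop sketch:
+
+ .. code-block:: python
+
+ for batch in data_loader:
+ loss = engine(batch)
+ engine.backward(loss)
+ engine.step()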
+ """
+
+ see_memory_usage("Engine before backward", force=self.memory_breakdown())
+
+ if self.scale_wrt_gas is not None:
+ scale_wrt_gas = self.scale_wrt_gas
+
+ if not allreduce_gradients:
+ logger.warning(f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed")
+
+ # scale loss w.r.t. gradient accumulation if needed
+ if self.gradient_accumulation_steps() > 1 and scale_wrt_gas:
+ loss = self._scale_loss_by_gas(loss.float())
+
+ # Log training loss
+ self.losses += loss.mean().item()
+ if self.monitor.enabled:
+ if self.is_gradient_accumulation_boundary():
+ if self.global_rank == 0:
+ self.summary_events = [(
+ f"Train/Samples/train_loss",
+ self.losses,
+ self.global_samples,
+ )]
+ self.monitor.write_events(self.summary_events)
+
+ self._start_timers(self.engine_timers.backward_timers)
+
+ assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \
+ "must provide optimizer during init in order to use backward"
+
+ self._start_timers(self.engine_timers.backward_inner_timers)
+
+ if self.zero_optimization():
+ self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary()
+ self.optimizer.backward(loss, retain_graph=retain_graph)
+ elif self.amp_enabled():
+ # AMP requires delaying unscale when inside gradient accumulation boundaries
+ # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations
+ delay_unscale = not self.is_gradient_accumulation_boundary()
+ with amp.scale_loss(loss, self.optimizer, delay_unscale=delay_unscale) as scaled_loss:
+ scaled_loss.backward(retain_graph=retain_graph)
+ elif self.fp16_enabled():
+ if self.eigenvalue_enabled():
+ self.optimizer.backward(loss, create_graph=True, retain_graph=True)
+ else:
+ self.optimizer.backward(loss, retain_graph=retain_graph)
+ elif self.bfloat16_enabled():
+ self.optimizer.backward(loss)
+ else:
+ if self.eigenvalue_enabled():
+ loss.backward(create_graph=True, retain_graph=True)
+ else:
+ loss.backward(retain_graph=retain_graph)
+
+ self._stop_timers(self.engine_timers.backward_inner_timers)
+
+ self._start_timers(self.engine_timers.backward_reduce_timers)
+
+ if allreduce_gradients and self.enable_backward_allreduce:
+ # Traditional code path that allreduces the module parameter grads
+ self.allreduce_gradients()
+
+ self._stop_timers(self.engine_timers.backward_reduce_timers)
+
+ self._stop_timers(self.engine_timers.backward_timers)
+
+ if release_loss:
+ # Currently a no-op; eagerly freeing the loss storage (loss.data = None) is left as a TODO.
+ pass
+
+ see_memory_usage("Engine after backward", force=self.memory_breakdown())
+
+ return loss
+
+ def is_gradient_accumulation_boundary(self):
+ """
+ Query whether the current micro-batch is at the boundary of
+ gradient accumulation, and thus will trigger gradient reductions and
+ an optimizer step.
+
+ Returns:
+ bool: if the current step is a gradient accumulation boundary.
+
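+ For example, with ``gradient_accumulation_steps() == 4`` and no manual override,
+ micro steps 3, 7, 11, ... (0-indexed) are boundaries:
+
+ .. code-block:: python
+
+ boundary = (engine.micro_steps + 1) % engine.gradient_accumulation_steps() == 0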
+ """
+ if self._is_gradient_accumulation_boundary is None:
+ return (self.micro_steps + 1) % \
+ self.gradient_accumulation_steps() == 0
+ else:
+ return self._is_gradient_accumulation_boundary
+
+ def set_gradient_accumulation_boundary(self, is_boundary):
+ """
+ Manually overrides the DeepSpeed engine's gradient accumulation boundary state. This is an optional
+ feature and should be used with care. The state should be set to the intended
+ value before each forward/backward. The final forward/backward should have the
+ boundary state set to True. This style allows client code to only call engine.step() once after all
+ the gradient accumulation passes are complete. See example below:
+ .. code-block:: python
+ engine.set_gradient_accumulation_boundary(False)
+ for _ in range(gradient_accumulation_steps - 1):
+ micro_batch = next(data_loader)
+ loss = engine(micro_batch)
+ engine.backward(loss)
+ engine.set_gradient_accumulation_boundary(True)
+ micro_batch = next(data_loader)
+ loss = engine(micro_batch)
+ engine.backward(loss)
+ engine.step()
+ Arguments:
+ is_boundary (bool): are we at a gradient accumulation boundary or not?
+ """
+ self._is_gradient_accumulation_boundary = is_boundary
+ self.optimizer.is_gradient_accumulation_boundary = is_boundary
+
+ def zero_grad(self):
+ """
+ Zero parameter grads.
+ """
+ for param_name, param in self.module.named_parameters():
+ param.grad = None
+
+ def clip_fp32_gradients(self):
+ clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping(), mpu=self.mpu)
+
+ def _take_model_step(self, lr_kwargs, block_eigenvalue={}):
+ if self.gradient_clipping() > 0.0:
+ if not (self.fp16_enabled() or self.bfloat16_enabled() or self.amp_enabled() or self.zero_optimization()):
+ self.clip_fp32_gradients()
+ elif self.amp_enabled():
+ # AMP's recommended way of doing clipping
+ # https://nvidia.github.io/apex/advanced.html#gradient-clipping
+ master_params = amp.master_params(self.optimizer)
+ clip_grad_norm_(parameters=master_params, max_norm=self.gradient_clipping(), mpu=self.mpu)
+ self.optimizer.step()
+
+ if hasattr(self.optimizer, '_global_grad_norm'):
+ self._global_grad_norm = self.optimizer._global_grad_norm
+
+ # Quantize the updated parameter if there is no overflow
+ if self.quantizer:
+ tensor_to_quantize = self.optimizer.bit16_groups if self.zero_optimization_stage(
+ ) == 2 else self.optimizer.fp16_groups
+ if self.compression_scheduler.weight_quantization_enabled:
+ self.quantizer.quantize(
+ tensor_to_quantize,
+ (self.optimizer.overflow if self.fp16_enabled() else False),
+ self.eigenvalue_enabled(),
+ block_eigenvalue,
+ )
+ # zero grad in basic optimizer could be unreliable and may not exhibit
+ # the behavior that we want
+ if self.bfloat16_enabled():
+ # TODO: Temporary until bf16_optimizer and zero_optimizer are integrated
+ if self.zero_optimization() and hasattr(self.optimizer, "zero_grad"):
+ self.optimizer.zero_grad()
+ else:
+ pass
+ elif self.zero_optimization() or self.fp16_enabled() or self.amp_enabled():
+ self.optimizer.zero_grad()
+ else:
+ self.zero_grad()
+
+ report_progress = self.global_rank == 0
+
+ # Check overflow here since in the DS fp16 optimizer, the overflow flag is updated by the step() call above.
+ overflow = False
+ if hasattr(self.optimizer, "overflow"):
+ overflow = self.optimizer.overflow
+ self._step_applied = not overflow
+
+ if overflow:
+ self.skipped_steps += 1
+ else:
+ self.compression_scheduler.step()
+ if self.lr_scheduler is not None:
+ try:
+ self.lr_scheduler.step(**(lr_kwargs or {}))
+ except TypeError:
+ # XXX Hack to work with Megatron 2.0 and DeepSpeed pipelines.
+ # We don't currently have a way to specify lr_kwargs from
+ # pipe_engine.train_batch()
+ self.lr_scheduler.step(self.train_batch_size())
+
+ if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0:
+ self._report_progress(self.global_steps + 1)
+
+ self.losses = 0.0
+ self.global_steps += 1
+ self.global_samples += self.train_batch_size()
+
+ def step(self, lr_kwargs=None):
+ r"""Execute the weight update step after forward and backward propagation
+ on effective_train_batch.
+ """
+ see_memory_usage("Engine before step", force=self.memory_breakdown())
+
+ # Check early because self.global_steps is incremented at some point here.
+ # TODO: Delay self.global_steps increment until very end of this function.
+ flops_profiler_active = self.flops_profiler_enabled(
+ ) and self.global_steps == self.flops_profiler_profile_step() and self.global_rank == 0
+
+ self._start_timers(self.engine_timers.step_timers)
+
+ assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \
+ "must provide optimizer during init in order to use step"
+
+ report_progress = False
+
+ self._step_applied = False # assume False, will flip to True
+
+ # Update the model when we reach gradient accumulation boundaries
+ if self.is_gradient_accumulation_boundary():
+ self.gas_boundary_ctr += 1
+
+ if (self.eigenvalue_enabled() and (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0)
+ and self.quantizer.any_precision_switch()):
+ log_dist(f"computing eigenvalue...", ranks=[0])
+ self.block_eigenvalue = self.eigenvalue.compute_eigenvalue(self.module, self.device,
+ self.optimizer.cur_scale)
+
+ if self.progressive_layer_drop:
+ self.progressive_layer_drop.update_state(self.global_steps)
+
+ if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution()
+ and self.quantizer.any_precision_switch()):
+ self._take_model_step(lr_kwargs, self.block_eigenvalue)
+ else:
+ self._take_model_step(lr_kwargs)
+
+ report_progress = self.global_rank == 0
+
+ self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress)
+
+ self._stop_timers(self.engine_timers.step_timers)
+
+ # Log learning rate
+ if self.monitor.enabled:
+ if self.is_gradient_accumulation_boundary():
+ if self.global_rank == 0:
+ self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)]
+
+ if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"):
+ self.summary_events.append((
+ f"Train/Samples/loss_scale",
+ self.optimizer.cur_scale,
+ self.global_samples,
+ ))
+
+ if (self.eigenvalue_enabled()
+ and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution()):
+ ev_values = list(self.block_eigenvalue.values())
+ for i in range(len(ev_values)):
+ self.summary_events.append((
+ f"Train/Eigenvalues/ModelBlockParam_{i}",
+ ev_values[i][0],
+ self.global_samples,
+ ))
+ self.monitor.write_events(self.summary_events)
+
+ # Check flops profiling
+ if flops_profiler_active:
+ if self.autotuning_enabled():
+ self.flops = self.flops_profiler.get_total_flops() * 3
+ self.fwd_duration = self.flops_profiler.get_total_duration()
+ else:
+ self.flops_profiler.print_model_profile(
+ profile_step=self.global_steps,
+ module_depth=self.flops_profiler_module_depth(),
+ top_modules=self.flops_profiler_top_modules(),
+ detailed=self.flops_profiler_detailed(),
+ output_file=self.flops_profiler_output_file(),
+ )
+ self.flops_profiler.end_profile()
+
+ if self.autotuning_enabled() and self.global_steps == (self.autotuning_end_profile_step() + 1):
+ self._autotuning_exit()
+
+ if self.wall_clock_breakdown():
+ # Log micro timing and reset
+ self.timers.log(names=self.engine_timers.micro_timers, memory_breakdown=self.memory_breakdown())
+
+ if self.wall_clock_breakdown() or self.flops_profiler_enabled():
+ # Log global timing and reset
+ if self.is_gradient_accumulation_boundary():
+ if self.monitor.enabled:
+ self._write_monitor()
+
+ if self.has_moe_layers:
+ fwd_time = self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False)
+ self.print_forward_breakdown(fwd_time=fwd_time)
+
+ self.timers.log(self.engine_timers.global_timers)
+
+ self.micro_steps += 1
+ see_memory_usage("Engine after step", force=self.memory_breakdown())
+
+ def _start_timers(self, timer_names):
+ for name in timer_names:
+ self.timers(name).start()
+
+ def _stop_timers(self, timer_names):
+ record = self.is_gradient_accumulation_boundary() and \
+ self.flops_profiler_enabled() and \
+ (self.global_steps >= self.flops_profiler_profile_step())
+ for name in timer_names:
+ self.timers(name).stop(record=record)
+
+ def _autotuning_exit(self):
+ if self.global_rank == 0:
+ msg = self.timers.get_mean([
+ FORWARD_GLOBAL_TIMER,
+ BACKWARD_GLOBAL_TIMER,
+ STEP_GLOBAL_TIMER,
+ ], reset=False)
+ titer = 0.0
+ titer += msg[FORWARD_GLOBAL_TIMER] if FORWARD_GLOBAL_TIMER in msg else 0
+ titer += msg[BACKWARD_GLOBAL_TIMER] if BACKWARD_GLOBAL_TIMER in msg else 0
+ titer += msg[STEP_GLOBAL_TIMER] if STEP_GLOBAL_TIMER in msg else 0
+ titer *= self.gradient_accumulation_steps()
+ msg["latency"] = titer
+ msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps() / titer
+ msg["throughput"] = self.train_batch_size() * 1_000_000 / \
+ msg["latency"]
+ print_json_dist(msg, [0], path=self.autotuning_metric_path())
+ log_dist(
+ f"Wrote metrics to {self.autotuning_metric_path()}, {os.path.abspath(self.autotuning_metric_path())}",
+ ranks=[0])
+ import atexit
+ atexit.register(print, "Autotuning: done with running current ds config.")
+ exit()
+
+ def _write_monitor(self):
+ if self.global_rank == 0:
+ self.summary_events = [
+ (
+ f"Train/Samples/elapsed_time_ms_forward",
+ self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False),
+ self.global_samples,
+ ),
+ (
+ f"Train/Samples/elapsed_time_ms_backward",
+ self.timers(BACKWARD_GLOBAL_TIMER).elapsed(reset=False),
+ self.global_samples,
+ ),
+ (
+ f"Train/Samples/elapsed_time_ms_backward_inner",
+ self.timers(BACKWARD_INNER_GLOBAL_TIMER).elapsed(reset=False),
+ self.global_samples,
+ ),
+ (
+ f"Train/Samples/elapsed_time_ms_backward_allreduce",
+ self.timers(BACKWARD_REDUCE_GLOBAL_TIMER).elapsed(reset=False),
+ self.global_samples,
+ ),
+ (
+ f"Train/Samples/elapsed_time_ms_step",
+ self.timers(STEP_GLOBAL_TIMER).elapsed(reset=False),
+ self.global_samples,
+ ),
+ ]
+ self.monitor.write_events(self.summary_events)
+
+ def _get_optimizer_param(self, param_name):
+ result = []
+ if not self.optimizer:
+ return result
+ for group in self.optimizer.param_groups:
+ if param_name in group:
+ result.append(group[param_name])
+ else:
+ result.append(0.0)
+ return result
+
+ def get_lr(self):
+ return self._get_optimizer_param("lr")
+
+ def get_type(self):
+ return self._get_optimizer_param("type")
+
+ def get_mom(self):
+ if self.optimizer_name() in ["SGD", "RMSprop"]:
+ return self._get_optimizer_param("momentum")
+ else:
+ return self._get_optimizer_param("betas")
+
+ def get_pld_theta(self):
+ if self.progressive_layer_drop:
+ return self.progressive_layer_drop.get_theta()
+ else:
+ return None
+
+ def _report_progress(self, step):
+ lr = self.get_lr()
+ mom = self.get_mom()
+ log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", ranks=[0])
+
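+ # Scaling note for allreduce_bucket() below: with postscale_gradients() and a
+ # gradient_predivide_factor f on a group of size W, gradients are multiplied by
+ # 1/f before the all-reduce and by f/W afterwards, a net 1/W average that keeps
+ # intermediate sums in a numerically safer range.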
+ def allreduce_bucket(self, bucket, dp_group):
+ tensor = self.flatten(bucket)
+
+ tensor_to_allreduce = tensor
+
+ if self.communication_data_type != tensor.dtype:
+ tensor_to_allreduce = tensor.to(self.communication_data_type)
+
+ if self.postscale_gradients():
+ if self.gradient_predivide_factor() != 1.0:
+ tensor_to_allreduce.mul_(1.0 / self.gradient_predivide_factor())
+
+ dist.all_reduce(tensor_to_allreduce, group=dp_group)
+ if self.gradient_average:
+ if self.gradient_predivide_factor() != dist.get_world_size(group=dp_group):
+ tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group))
+ else:
+ tensor_to_allreduce.mul_(1. / dist.get_world_size(group=dp_group))
+ dist.all_reduce(tensor_to_allreduce, group=dp_group)
+
+ if self.communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce:
+ tensor.copy_(tensor_to_allreduce)
+
+ return tensor
+
+ def allreduce_and_copy(self, small_bucket, dp_group):
+ allreduced = self.allreduce_bucket(small_bucket, dp_group)
+ for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)):
+ buf.copy_(synced)
+
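+ # allreduce_no_retain() flushes greedily: tensors accumulate into small_bucket
+ # until their combined element count exceeds numel_per_bucket (500M by default),
+ # that bucket is reduced and restarted, and any final partial bucket is reduced last.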
+ def allreduce_no_retain(self, bucket, dp_group, numel_per_bucket=500000000):
+ small_bucket = []
+ numel = 0
+ for tensor in bucket:
+ small_bucket.append(tensor)
+ numel = numel + tensor.numel()
+ if numel > numel_per_bucket:
+ self.allreduce_and_copy(small_bucket, dp_group)
+ small_bucket = []
+ numel = 0
+ if len(small_bucket) > 0:
+ self.allreduce_and_copy(small_bucket, dp_group)
+
+ def _get_gradients_for_reduction(self):
+ non_expert_grads = []
+ expert_grads = {}
+ if self.has_moe_layers:
+ for key in self.expert_data_parallel_group.keys():
+ expert_grads[key] = []
+
+ for param_name, param in self.module.named_parameters():
+ if not param.requires_grad:
+ continue
+
+ if param.grad is None:
+ # When empty grads are unevenly distributed across ranks, we must
+ # create empty grads so that every rank reduces the same number of
+ # elements. In the future it may make sense to support averaging
+ # w.r.t. a value other than the world size.
+ param.grad = torch.zeros(param.size(), dtype=param.dtype, device=param.device)
+
+ grad_data = param.grad.data
+ if param_name in self.sparse_tensor_module_names or grad_data.is_sparse:
+ # Wrap param.grad itself (not its .data) to avoid problems when updated grads are set
+ grad_data = SparseTensor(param.grad)
+
+ if is_moe_param(param):
+ expert_grads[param.group_name].append(grad_data)
+ else:
+ non_expert_grads.append(grad_data)
+
+ return non_expert_grads, expert_grads
+
+ def _reduce_non_expert_gradients(self, grads, elements_per_buffer):
+ split_buckets = split_half_float_double_sparse(grads)
+ for _, bucket_tuple in enumerate(split_buckets):
+ bucket_type, bucket = bucket_tuple
+
+ if self.pipeline_parallelism:
+ dp_group = self.mpu.get_data_parallel_group()
+ else:
+ dp_group = groups._get_sequence_data_parallel_group()
+
+ if bucket_type == SparseTensor.type():
+ self.sparse_allreduce_no_retain(bucket, dp_group=dp_group)
+ else:
+ self.allreduce_no_retain(bucket, dp_group=dp_group, numel_per_bucket=elements_per_buffer)
+
+ def _reduce_expert_gradients(self, expert_grads, elements_per_buffer):
+ for ep_name, expert_grads_group in expert_grads.items():
+ expert_split_buckets = split_half_float_double_sparse(expert_grads_group)
+ for i, bucket_tuple in enumerate(expert_split_buckets):
+ bucket_type, bucket = bucket_tuple
+ if bucket_type == SparseTensor.type():
+ self.sparse_allreduce_no_retain(bucket, groups._get_expert_data_parallel_group(ep_name))
+ else:
+ # Separate between diff groups
+ self.allreduce_no_retain(bucket,
+ dp_group=groups._get_expert_data_parallel_group(ep_name),
+ numel_per_bucket=elements_per_buffer)
+
+ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000):
+ if grads is None:
+ non_expert_grads, expert_grads = self._get_gradients_for_reduction()
+ else:
+ assert not self.has_moe_layers, "attempting to reduce grads in unsupported way w.r.t. MoE"
+ non_expert_grads = grads
+
+ self._reduce_non_expert_gradients(non_expert_grads, elements_per_buffer)
+
+ if self.has_moe_layers:
+ self._reduce_expert_gradients(expert_grads, elements_per_buffer)
+
+ def sparse_allreduce_no_retain(self, bucket, dp_group):
+ allreduced_sparses = self.sparse_allreduce_bucket(bucket, dp_group)
+ # Densify sparse tensor and copy back to original location
+ for tensor in allreduced_sparses:
+ if tensor.is_sparse:
+ tensor.orig_dense_tensor.data = tensor.to_coo_tensor()
+ else:
+ tensor.orig_dense_tensor.copy_(tensor.to_dense())
+
+ def sparse_allreduce_bucket(self, bucket, dp_group):
+ sparse_list = []
+ for sparse in bucket:
+ sparse_list.append(self.sparse_allreduce(sparse, dp_group))
+ return sparse_list
+
+ def sparse_allreduce(self, sparse, dp_group):
+ original_data_type = sparse.values.dtype
+ if self.communication_data_type != sparse.values.dtype:
+ if self.communication_data_type in (torch.float16, torch.bfloat16):
+ indices = sparse.indices.to(torch.int32)
+ else:
+ indices = sparse.indices
+ values = sparse.values.to(self.communication_data_type)
+ else:
+ indices = sparse.indices
+ values = sparse.values
+
+ if self.postscale_gradients():
+ if self.gradient_average:
+ values.mul_(self.gradient_predivide_factor() /
+ (dist.get_world_size(group=dp_group) / float(self.sequence_parallel_size)))
+ else:
+ values.mul_(1. / (dist.get_world_size(group=dp_group) / float(self.sequence_parallel_size)))
+
+ indices_device_list = self.sparse_all_gather(indices, dp_group)
+ values_device_list = self.sparse_all_gather(values, dp_group)
+
+ sparse.indices = torch.cat(indices_device_list).to(torch.long)
+ sparse.values = torch.cat(values_device_list).to(original_data_type)
+ return sparse
+
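+ # sparse_all_gather() pads every rank's tensor to the global maximum row count
+ # before the all_gather (e.g. ranks holding 3, 5, and 2 rows all pad to 5), then
+ # trims each gathered shard back to its true size with index_select.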
+ def sparse_all_gather(self, value, dp_group):
+ my_size = torch.LongTensor([value.size()[0]]).to(self.device)
+ all_sizes = self.all_gather_scalar(my_size, dp_group)
+ max_size = torch.cat(all_sizes).max()
+ fill_size = max_size - my_size
+
+ assert value.dim() in [1, 2]
+ if value.dim() == 1:
+ if fill_size > 0:
+ value = torch.cat([value, value.new_empty(fill_size)])
+ tensor_list = [value.new_empty(max_size) for _ in range(dist.get_world_size(group=dp_group))]
+ else:
+ if fill_size > 0:
+ value = torch.cat([value, value.new_empty(fill_size, value.size()[1])])
+ tensor_list = [
+ value.new_empty(max_size,
+ value.size()[1]) for _ in range(dist.get_world_size(group=dp_group))
+ ]
+
+ dist.all_gather(tensor_list, value, group=dp_group)
+ tensors = []
+ for dev_idx, t in enumerate(tensor_list):
+ size = all_sizes[dev_idx][0]
+ tensors.append(t.index_select(0, torch.arange(size, dtype=torch.long, device=self.device)))
+
+ return tensors
+
+ def all_gather_scalar(self, value, dp_group):
+ tensor_list = [value.new_zeros(value.size()) for _ in range(dist.get_world_size(group=dp_group))]
+ dist.all_gather(tensor_list, value, group=dp_group)
+ return tensor_list
+
+ def module_state_dict(self, destination=None, prefix="", keep_vars=False, exclude_frozen_parameters=False):
+ sd = self.module.state_dict(destination, prefix, keep_vars)
+
+ # Remove frozen parameter weights from state_dict if specified
+ if exclude_frozen_parameters:
+ for n, p in self.module.named_parameters():
+ if not p.requires_grad and n in sd:
+ del sd[n]
+
+ if self.random_ltd_enabled():
+ sd = remove_random_ltd_state_dict(sd)
+ return sd
+
+ @staticmethod
+ def load_moe_state_dict(checkpoint_path,
+ tag,
+ state_dict,
+ old_moe_load,
+ model=None,
+ mpu=None,
+ num_experts=1,
+ checkpoint_engine=TorchCheckpointEngine()):
+ if old_moe_load:
+ expp_rank = groups._get_expert_data_parallel_rank(groups._get_max_expert_size_name())
+
+ num_local_experts = max(num_experts) // groups._get_expert_parallel_world_size(
+ groups._get_max_expert_size_name())
+ for local_expert_id in range(num_local_experts):
+ global_expert_id = expp_rank * num_local_experts + local_expert_id
+ expert_state_dict = checkpoint_engine.load(
+ DeepSpeedEngine._get_expert_ckpt_name(
+ checkpoint_path,
+ -1, # -1 means ignore layer_id
+ global_expert_id,
+ tag,
+ mpu),
+ map_location=torch.device('cpu'))
+
+ # Updating global -> local expert ids
+ moe_str_prefix = '.deepspeed_moe.experts.deepspeed_experts.'
+ for key in list(expert_state_dict.keys()):
+ local_key = key.replace(f'{moe_str_prefix}{global_expert_id}',
+ f'{moe_str_prefix}{local_expert_id}')
+ expert_state_dict[local_key] = expert_state_dict.pop(key)
+ state_dict.update(expert_state_dict)
+
+ else:
+ moe_layer_id = 0
+ for n_module, module in model.named_modules():
+ if isinstance(module, MoE):
+ group_name = module.expert_group_name
+ num_local_experts = module.num_local_experts
+ expp_rank = groups._get_expert_parallel_rank(group_name)
+ # loop all local_experts
+ for local_expert_id in range(num_local_experts):
+ global_expert_id = expp_rank * num_local_experts + local_expert_id
+ expert_state_dict = checkpoint_engine.load(DeepSpeedEngine._get_expert_ckpt_name(
+ checkpoint_path, moe_layer_id, global_expert_id, tag, mpu),
+ map_location=torch.device('cpu'))
+ # Updating global -> local expert ids
+ moe_str_prefix = '.deepspeed_moe.experts.deepspeed_experts.'
+ for key in list(expert_state_dict.keys()):
+ local_key = key.replace(f'{moe_str_prefix}{global_expert_id}',
+ f'{moe_str_prefix}{local_expert_id}')
+ expert_state_dict[local_key] = expert_state_dict.pop(key)
+ state_dict.update(expert_state_dict)
+ moe_layer_id += 1
+
+ def load_module_state_dict(self, checkpoint, strict=True, custom_load_fn=None, fetch_z3_params=False):
+ if fetch_z3_params:
+ params_to_fetch = [
+ p for p in self.module.parameters()
+ if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE
+ ]
+ else:
+ params_to_fetch = []
+
+ with deepspeed.zero.GatheredParameters(params_to_fetch, modifier_rank=0):
+ module_state_dict = checkpoint['module']
+ if custom_load_fn:
+ custom_load_fn(src=module_state_dict, dst=self.module)
+ else:
+ self.module.load_state_dict(
+ module_state_dict, # TODO
+ strict=strict)
+
+ if checkpoint.get(FROZEN_PARAM_FRAGMENTS, None) is not None:
+ saved_frozen_params = checkpoint[FROZEN_PARAM_FRAGMENTS]
+ for param in self.module.parameters():
+ if param.requires_grad:
+ continue
+ if param not in self.param_names:
+ raise ValueError(f"failed to find frozen {param} in named params")
+ name = self.param_names[param]
+ if hasattr(param, 'ds_id'):
+ param.ds_tensor.data.copy_(saved_frozen_params[name].data)
+ else:
+ param.data.copy_(saved_frozen_params[name].data)
+
+ def _get_zero_ckpt_prefix(self, dp_rank, bf16_mode):
+ return f'{"bf16_" if bf16_mode else ""}zero_pp_rank_{dp_rank}'
+
+ def _get_rank_zero_ckpt_name(self, checkpoints_path, tag, mp_rank, dp_rank, bf16_mode):
+ file_prefix = self._get_zero_ckpt_prefix(dp_rank, bf16_mode=bf16_mode)
+ zero_ckpt_name = os.path.join(
+ checkpoints_path,
+ str(tag),
+ f"{file_prefix}_mp_rank_{mp_rank:02d}_optim_states.pt",
+ )
+ return zero_ckpt_name
+
+ def _get_zero_ckpt_name(self, checkpoints_path, tag):
+ mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank()
+ pp_rank = dist.get_rank(group=self.optimizer.zp_process_group)
+ bf16_mode = self.bfloat16_enabled()
+ return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank, bf16_mode)
+
+ def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None):
+ if mp_placeholder is not None:
+ mp_rank_str = mp_placeholder
+ else:
+ mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank()
+ mp_rank_str = f"{mp_rank:02d}"
+
+ if self.zero_optimization_partition_weights():
+ filename = "zero_pp_rank_{}".format(dist.get_rank(group=self.optimizer.zp_process_group))
+ ckpt_name = os.path.join(
+ checkpoints_path,
+ str(tag),
+ f"{filename}_mp_rank_{mp_rank_str}_model_states.pt",
+ )
+ else:
+ ckpt_name = os.path.join(
+ checkpoints_path,
+ str(tag),
+ "mp_rank_" + mp_rank_str + "_model_states.pt",
+ )
+ return ckpt_name
+
+ def _get_optimizer_ckpt_name(self, checkpoints_path, tag, expp_rank):
+ mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank()
+ ckpt_name = os.path.join(checkpoints_path, str(tag),
+ f'expp_rank_{expp_rank}_mp_rank_{mp_rank:02d}_optim_states.pt')
+ return ckpt_name
+
+ @staticmethod
+ def _get_expert_ckpt_name(checkpoints_path, layer_id, expert_id, tag, mpu=None):
+ mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank()
+ if layer_id <= -1:
+ # Used to support old checkpoint loading
+ ckpt_name = os.path.join(checkpoints_path, '' if tag is None else str(tag),
+ f'expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt')
+ else:
+ # Used to support new checkpoint loading
+ ckpt_name = os.path.join(checkpoints_path, '' if tag is None else str(tag),
+ f'layer_{layer_id}_expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt')
+ return ckpt_name
+
+ def _get_all_ckpt_names(self, checkpoints_path, tag):
+ # It is required that (checkpoints_path, tag) are consistent among all ranks.
+ ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*")
+ import glob
+
+ ckpt_files = glob.glob(ckpt_file_pattern)
+ ckpt_files.sort()
+ return ckpt_files
+
+ def load_checkpoint(self,
+ load_dir,
+ tag=None,
+ load_module_strict=True,
+ load_optimizer_states=True,
+ load_lr_scheduler_states=True,
+ load_module_only=False,
+ custom_load_fn=None):
+ """
+ Load training checkpoint
+
+ Arguments:
+ load_dir: Required. Directory to load the checkpoint from
+ tag: Checkpoint tag used as a unique identifier for checkpoint, if not provided will attempt to load tag in 'latest' file
+ load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and checkpoint match.
+ load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance
+ load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint.
+ load_module_only: Optional. Boolean to load only the model weights from the checkpoint. Ex. warmstarting.
+ custom_load_fn: Optional. Custom model load function.
+
+ Returns:
+ A tuple of ``load_path`` and ``client_state``.
+ *``load_path``: Path of the loaded checkpoint. ``None`` if loading the checkpoint failed.
+ *``client_state``: State dictionary used for loading required training states in the client code.
+
+ Important: under ZeRO3, one cannot load a checkpoint with ``engine.load_checkpoint()`` right
+ after ``engine.save_checkpoint()``, because ``engine.module`` is partitioned and
+ ``load_checkpoint()`` expects a pristine model. If you must do so, please reinitialize the
+ engine before calling ``load_checkpoint()``.
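+
+ A minimal usage sketch (assumes an engine created via ``deepspeed.initialize``):
+
+ .. code-block:: python
+
+ load_path, client_state = engine.load_checkpoint(args.save_dir)
+ if load_path is None:
+ print("no checkpoint found, training from scratch")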
+
+ """
+
+ if tag is None:
+ latest_tag = "latest_universal" if self.load_universal_checkpoint() else "latest"
+ latest_path = os.path.join(load_dir, latest_tag)
+ if os.path.isfile(latest_path):
+ with open(latest_path, "r") as fd:
+ tag = fd.read().strip()
+ else:
+ if self.load_universal_checkpoint():
+ raise ValueError(f'Invalid for universal checkpoint: {latest_path} does not exist')
+ else:
+ logger.warning(
+ f"Unable to find latest file at {latest_path}, if trying to load latest "
+ "checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint."
+ )
+ return None, None
+
+ if self._optimizer_has_ckpt_event_prologue():
+ # Prepare for checkpoint load by ensuring all parameters are partitioned
+ self.optimizer.checkpoint_event_prologue()
+
+ load_path, client_states = self._load_checkpoint(load_dir,
+ tag,
+ load_module_strict=load_module_strict,
+ load_optimizer_states=load_optimizer_states,
+ load_lr_scheduler_states=load_lr_scheduler_states,
+ load_module_only=load_module_only,
+ custom_load_fn=custom_load_fn)
+
+ load_zero_checkpoint = load_optimizer_states and load_path is not None and (self.zero_optimization()
+ or self.bfloat16_enabled())
+ if load_zero_checkpoint:
+ success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states)
+ if not success:
+ self.optimizer._restore_from_bit16_weights()
+
+ if self._optimizer_has_ckpt_event_epilogue():
+ self.optimizer.checkpoint_event_epilogue()
+
+ if self.load_universal_checkpoint():
+ self.optimizer.update_lp_params()
+ if load_zero_checkpoint:
+ self.update_optimizer_step(step=client_states['iteration'] + 1)
+
+ return load_path, client_states
+
+ def _load_checkpoint(self,
+ load_dir,
+ tag,
+ load_module_strict=True,
+ load_optimizer_states=True,
+ load_lr_scheduler_states=True,
+ load_module_only=False,
+ custom_load_fn=None):
+
+ from deepspeed.runtime.state_dict_factory import SDLoaderFactory
+
+ ckpt_list = self._get_all_ckpt_names(load_dir, tag)
+ sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, checkpoint_engine=self.checkpoint_engine)
+
+ is_pipe_parallel = isinstance(self.module, PipelineModule)
+
+ mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank()
+ load_path, checkpoint, _ = sd_loader.load(self.mp_world_size, mp_rank, is_pipe_parallel=is_pipe_parallel)
+
+ if checkpoint is None:
+ return None, None
+
+ fetch_z3_params = False
+ if self.zero_optimization_partition_weights() and not load_optimizer_states:
+ checkpoint['module'] = get_fp32_state_dict_from_zero_checkpoint(load_dir)
+ fetch_z3_params = True
+
+ if is_pipe_parallel:
+ # Pipeline parallelism uses this to load its own checkpoint files.
+ self._curr_ckpt_path = os.path.join(load_dir, tag)
+
+ if self.has_moe_layers:
+ old_moe_load = False
+ if not isinstance(checkpoint['num_experts'], list):
+ old_moe_load = True
+ DeepSpeedEngine.load_moe_state_dict(load_dir,
+ tag,
+ state_dict=checkpoint['module'],
+ old_moe_load=old_moe_load,
+ model=self.module,
+ mpu=self.mpu,
+ num_experts=self.num_experts,
+ checkpoint_engine=self.checkpoint_engine)
+ if not self.load_universal_checkpoint():
+ self.load_module_state_dict(checkpoint=checkpoint,
+ strict=load_module_strict,
+ custom_load_fn=custom_load_fn,
+ fetch_z3_params=fetch_z3_params)
+
+ self.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size']
+ if 'zp_world_size' not in checkpoint:
+ checkpoint['zp_world_size'] = self.zp_world_size
+ self.loaded_checkpoint_zp_world_size = checkpoint['zp_world_size']
+
+ optim_checkpoint = None
+ if load_module_only:
+ deepspeed_states = ['module']
+ if self.optimizer is not None and self.fp16_enabled():
+ self.optimizer.refresh_fp32_params()
+ else:
+ has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled()
+ if load_optimizer_states and self.optimizer is not None and not has_zero_optimizer_state:
+ if self.has_moe_layers:
+ largest_group_name = groups._get_max_expert_size_name()
+ expp_rank = groups._get_expert_parallel_rank(largest_group_name)
+ optim_load_path = self._get_optimizer_ckpt_name(load_dir, tag, expp_rank)
+ optim_checkpoint = self.checkpoint_engine.load(optim_load_path, map_location=torch.device('cpu'))
+ else:
+ optim_checkpoint = checkpoint
+
+ if self.fp16_enabled() or self.bfloat16_enabled():
+ self.optimizer.load_state_dict(optim_checkpoint['optimizer'],
+ load_optimizer_states=load_optimizer_states)
+ else:
+ optim_checkpoint = checkpoint
+
+ self.optimizer.load_state_dict(optim_checkpoint['optimizer'])
+
+ if load_lr_scheduler_states and self.lr_scheduler is not None:
+ self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+
+ if self.random_ltd_enabled() and self.random_ltd_scheduler is not None and 'random_ltd' in checkpoint:
+ self.random_ltd_scheduler.load_state_dict(checkpoint['random_ltd'])
+
+ if self.training_dataloader is not None and self.curriculum_learning_enabled(
+ ) and 'data_sampler' in checkpoint:
+ self.training_dataloader.data_sampler.load_state_dict(checkpoint['data_sampler'])
+
+ def get_sparse_tensor_module_names(original_set, loaded_set, original_parameters, loaded_parameters):
+ result = set()
+
+ for name in original_set:
+ if name in loaded_parameters and name not in loaded_set:
+ continue # parameter existed in previous model and was not sparse
+ result.add(name)
+
+ for name in loaded_set:
+ if name in original_parameters:
+ result.add(name) # parameter exists in both configs and it was sparse
+
+ return result
+
+ if 'sparse_tensor_module_names' in checkpoint:
+ sparse_tensor_module_names = checkpoint['sparse_tensor_module_names']
+ elif 'csr_tensor_module_names' in checkpoint:
+ sparse_tensor_module_names = checkpoint['csr_tensor_module_names']
+ else:
+ sparse_tensor_module_names = None
+ if sparse_tensor_module_names is not None:
+ if load_module_strict:
+ self.sparse_tensor_module_names = sparse_tensor_module_names
+ else:
+ self.sparse_tensor_module_names = get_sparse_tensor_module_names(
+ self.sparse_tensor_module_names, sparse_tensor_module_names,
+ dict(self.module.named_parameters()), checkpoint["module"])
+
+ self.global_steps = checkpoint['global_steps']
+ self.global_samples = checkpoint.get('global_samples', self.global_steps * self.train_batch_size())
+ self.skipped_steps = checkpoint['skipped_steps']
+ self.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size']
+ deepspeed_states = [
+ 'module', 'sparse_tensor_module_names', 'skipped_steps', 'global_steps', 'zp_world_size',
+ 'mp_world_size', 'data_sampler', 'random_ltd', 'dp_world_size',
+ ]
+ client_state = {}
+
+ if load_lr_scheduler_states:
+ deepspeed_states.append('lr_scheduler')
+ if load_optimizer_states:
+ deepspeed_states.append('optimizer')
+
+ client_state = {key: value for key, value in checkpoint.items() if key not in deepspeed_states}
+
+ if optim_checkpoint is not None:
+ client_state['optimizer'] = optim_checkpoint['optimizer']
+
+ return load_path, client_state
+
+ def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True):
+
+ load_serial = None
+ # When serial checkpoint loading is enabled, loading starts from local rank 0;
+ # every other local rank pauses until its rank-1 peer is ready and notifies it.
+ if self._config.zero_config.pipeline_loading_checkpoint:
+ assert self.zero_optimization_stage(
+ ) == ZeroStageEnum.weights, "Only ZeRO stage 3 supports pipelined checkpoint loading"
+ load_serial = torch.zeros(1).to(self.device)
+ if dist.get_local_rank() != 0:
+ dist.recv(tensor=load_serial, src=dist.get_rank() - 1)
+ if self.load_universal_checkpoint():
+ zero_sd_list = None
+ checkpoint_folder = f'{os.path.join(load_dir, tag)}'
+ else:
+ if load_optimizer_states and self.zp_world_size != self.loaded_checkpoint_zp_world_size:
+ raise ZeRORuntimeException("The checkpoint being loaded used a DP " \
+ f"world size of {self.loaded_checkpoint_zp_world_size} but the " \
+ f"current world size is {self.zp_world_size}. Automatic adjustment " \
+ "of ZeRO's optimizer state partitioning with a new world size is not " \
+ "currently supported.")
+ checkpoint_folder = None
+ zero_sd_list = self._get_all_zero_checkpoints(load_dir, tag)
+ if zero_sd_list is None:
+ return False
+
+ self.optimizer.load_state_dict(state_dict_list=zero_sd_list,
+ load_optimizer_states=load_optimizer_states,
+ load_from_fp32_weights=self.zero_load_from_fp32_weights(),
+ checkpoint_folder=checkpoint_folder,
+ load_serial=load_serial)
+
+ if self.load_universal_checkpoint():
+ logger.info(f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}')
+ else:
+ logger.info(f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}")
+ return True
+
+ def update_optimizer_step(self, step):
+
+ def set_step(d):
+ if isinstance(d['step'], torch.Tensor):
+ d['step'] = torch.tensor(step, dtype=d['step'].dtype, device=d['step'].device)
+ else:
+ d['step'] = step
+
+ optimizer = self.optimizer
+ base_optimizer = optimizer.optimizer
+ state = base_optimizer.state
+ for group in optimizer.param_groups:
+ if 'step' in group:
+ set_step(group)
+ for p in group['params']:
+ if p in state and len(state[p]) > 0 and 'step' in state[p]:
+ set_step(state[p])
+
+ def _get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size, bf16_mode):
+ zero_ckpt_names = []
+ for dp_rank in range(dp_world_size):
+ ckpt_name = self._get_rank_zero_ckpt_name(checkpoints_path=load_dir,
+ tag=tag,
+ mp_rank=mp_rank,
+ dp_rank=dp_rank,
+ bf16_mode=bf16_mode)
+ zero_ckpt_names.append(ckpt_name)
+
+ return zero_ckpt_names
+
+ def _get_all_zero_checkpoint_names(self, load_dir, tag, bf16_mode):
+ mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank()
+ zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names(load_dir=load_dir,
+ tag=tag,
+ mp_rank=mp_rank,
+ dp_world_size=self.loaded_checkpoint_dp_world_size,
+ bf16_mode=bf16_mode)
+ for i, ckpt_name in enumerate(zero_ckpt_names):
+ if not os.path.exists(ckpt_name):
+ # transparently handle the old file pattern for optim_states
+ if "optim_states.pt" in ckpt_name:
+ ckpt_name_try = ckpt_name.replace("_optim_states.pt", "optim_states.pt")
+ if os.path.exists(ckpt_name_try):
+ zero_ckpt_names[i] = ckpt_name_try
+ continue
+
+ return zero_ckpt_names
+
+ def _get_all_zero_checkpoint_state_dicts(self, zero_ckpt_names):
+ zero_sd_list = []
+ for i, ckpt_name in enumerate(zero_ckpt_names):
+ _state = None
+ if ckpt_name is None:
+ _state = {OPTIMIZER_STATE_DICT: None}
+ # Fully load state for current rank
+ elif self.zero_elastic_checkpoint() or dist.get_rank(group=self.optimizer.zp_process_group) == i:
+ _state = self.checkpoint_engine.load(
+ ckpt_name,
+ map_location='cpu',
+ )
+ else:
+ _state = {OPTIMIZER_STATE_DICT: None}
+ zero_sd_list.append(_state)
+
+ zero_optimizer_sd = [sd[OPTIMIZER_STATE_DICT] for sd in zero_sd_list]
+ logger.info(f"successfully read {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}")
+ return zero_optimizer_sd
+
+ def _get_all_zero_checkpoints(self, load_dir, tag):
+ for bf16_mode in [self.bfloat16_enabled(), not self.bfloat16_enabled()]:
+ zero_ckpt_names = self._get_all_zero_checkpoint_names(load_dir, tag, bf16_mode)
+ if zero_ckpt_names is not None:
+ # Warn if loading checkpoint of different bit16 type
+ if bf16_mode is not self.bfloat16_enabled():
+ checkpoint_bit16 = BFLOAT16 if bf16_mode else FP16
+ engine_bit16 = BFLOAT16 if self.bfloat16_enabled() else FP16
+ logger.warn(f'Loading {checkpoint_bit16} zero checkpoints into {engine_bit16} training engine')
+ return self._get_all_zero_checkpoint_state_dicts(zero_ckpt_names)
+
+ return None
+
+ def _checkpoint_tag_validation(self, tag):
+ if self.checkpoint_tag_validation_enabled():
+ s_hash = hashlib.sha1(tag.encode())
+ bhash = torch.ByteTensor([s_hash.digest()]).flatten().to(self.device)
+ max_bhash = bhash.clone()
+ min_bhash = bhash.clone()
+ dist.all_reduce(max_bhash, op=dist.ReduceOp.MAX)
+ dist.all_reduce(min_bhash, op=dist.ReduceOp.MIN)
+ valid = all(min_bhash == bhash) and all(max_bhash == bhash)
+ msg = (f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across "
+ "all ranks. Including rank unique information in checkpoint tag could cause issues when "
+ "restoring with different world sizes.")
+ if self.checkpoint_tag_validation_fail():
+ assert valid, msg
+ elif not valid:
+ logger.warning(msg)
+
+ def save_checkpoint(self, save_dir, tag=None, client_state={}, save_latest=True, exclude_frozen_parameters=False):
+ """Save training checkpoint
+
+ Arguments:
+ save_dir: Required. Directory for saving the checkpoint
+ tag: Optional. Checkpoint tag used as a unique identifier for the checkpoint, global step is
+ used if not provided. Tag name must be the same across all ranks.
+ client_state: Optional. State dictionary used for saving required training states in the client code.
+ save_latest: Optional. Save a file 'latest' pointing to the latest saved checkpoint.
+ exclude_frozen_parameters: Optional. Exclude frozen parameters from checkpointed state.
+ Important: all processes must call this method and not just the process with rank 0. It is
+ because each process needs to save its master weights and scheduler+optimizer states. This
+ method will hang waiting to synchronize with other processes if it's called just for the
+ process with rank 0.
+
+ """
+ if self._optimizer_has_ckpt_event_prologue():
+ # Custom preparation for checkpoint save, if applicable
+ self.optimizer.checkpoint_event_prologue()
+
+ rank = self.local_rank if self.use_node_local_storage() else self.global_rank
+
+ # This is to make sure the checkpoint names are created without collision;
+ # there seem to be issues creating them in parallel.
+
+ # Ensure save_dir directory exists
+ if rank == 0:
+ self.checkpoint_engine.makedirs(save_dir, exist_ok=True)
+ dist.barrier()
+
+ if tag is None:
+ tag = f"global_step{self.global_steps}"
+
+ # Ensure tag is a string
+ tag = str(tag)
+ self.checkpoint_engine.create(tag)
+
+ # Ensure checkpoint tag is consistent across ranks
+ self._checkpoint_tag_validation(tag)
+
+ if self.has_moe_layers:
+ self.save_non_zero_checkpoint = False
+ self._create_checkpoint_file(save_dir, tag, False)
+ self._save_moe_checkpoint(save_dir,
+ tag,
+ client_state=client_state,
+ exclude_frozen_parameters=exclude_frozen_parameters)
+
+ # We distribute the task of saving layer checkpoint files among
+ # data parallel instances, so all procs should call _save_checkpoint.
+ # All procs then call module_state_dict(), but only procs of data
+ # parallel rank 0 save the general model params.
+ if not self.has_moe_layers:
+ self._create_checkpoint_file(save_dir, tag, False)
+ self._save_checkpoint(save_dir,
+ tag,
+ client_state=client_state,
+ exclude_frozen_parameters=exclude_frozen_parameters)
+
+ if self.save_zero_checkpoint:
+ self._create_zero_checkpoint_files(save_dir, tag)
+ self._save_zero_checkpoint(save_dir, tag)
+
+ if self._optimizer_has_ckpt_event_epilogue():
+ self.optimizer.checkpoint_event_epilogue()
+
+ # Save latest checkpoint tag
+ self.checkpoint_engine.commit(tag)
+ if save_latest and rank == 0:
+ with open(os.path.join(save_dir, 'latest'), 'w') as fd:
+ fd.write(tag)
+
+ dist.barrier()
+
+ return True
+
+ def _get_non_moe_state_dict(self, full_state_dict):
+ """
+ Get the state dict of the non-moe layers
+ """
+ for key in list(full_state_dict.keys()):
+ if 'expert' in key and 'moe.gate.wg.weight' not in key:
+ full_state_dict.pop(key)
+
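+ # e.g. a key like 'layer.0.deepspeed_moe.experts.deepspeed_experts.0.w1' is
+ # dropped, while a gate key ending in 'moe.gate.wg.weight' is kept
+ # (illustrative names)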
+ return full_state_dict
+
+ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_parameters=False):
+ save_path = self._get_ckpt_name(save_dir, tag)
+ # A hack to save the checkpointing directory. Pipeline parallelism overrides
+ # module_state_dict() and uses this path to save the model. module_state_dict()
+ # then instead just returns None.
+
+ # Using layer_#_expert_# to save the model's expert state_dict
+ moe_layer_id = 0
+ for n_module, module in self.module.named_modules():
+ if isinstance(module, MoE): # and deepspeed.comm.get_rank() == 0:
+ group_name = module.expert_group_name
+ num_local_experts = module.num_local_experts
+ expp_rank = groups._get_expert_parallel_rank(group_name)
+ exp_dp_rank = groups._get_expert_data_parallel_rank(group_name)
+ # print(expp_rank, exp_dp_rank)
+ if exp_dp_rank != 0:
+ moe_layer_id += 1
+ continue
+
+ # get all moe parameters
+ moe_state_dict = {}
+ for n, p in module.state_dict().items():
+ if 'expert' in n and 'moe.gate.wg.weight' not in n:
+ moe_state_dict[n_module + '.' + n] = p
+ moe_str_prefix = '.deepspeed_moe.experts.deepspeed_experts.'
+ # print(moe_state_dict.keys()) # until now, everything is fine. So the bug happens at next few lines
+ # Reorder the moe name rank, so that each checkpoint only has one expert
+ experts_state_dict = defaultdict(dict)
+ for key in list(moe_state_dict.keys()):
+ m = re.match(f".*{moe_str_prefix}([0-9]+).*", key)
+
+ if not m:
+ # skip keys without a parsable expert id; int(None) below would raise a TypeError
+ logger.warn(f'No expert found in key {key}, skipping.')
+ continue
+ local_expert_id = m.group(1)
+
+ global_expert_id = expp_rank * \
+ num_local_experts + int(local_expert_id)
+ expert_key = key.replace(f'{moe_str_prefix}{local_expert_id}',
+ f'{moe_str_prefix}{global_expert_id}')
+ # truncating extra tensor (shared) storage
+ truncated = moe_state_dict.pop(key).clone().detach()
+ experts_state_dict[str(global_expert_id)][expert_key] = truncated
+
+ # now save the MoE parameters, one checkpoint per expert
+ for global_expert_id, expert_state_dict in experts_state_dict.items():
+ # save the moe parameters
+ moe_save_path = self._get_expert_ckpt_name(save_dir, moe_layer_id, global_expert_id, tag, self.mpu)
+ if self.random_ltd_enabled():
+ expert_state_dict = remove_random_ltd_state_dict(expert_state_dict)
+ self.checkpoint_engine.save(expert_state_dict, moe_save_path)
+ moe_layer_id += 1
+
+ self._curr_ckpt_path = os.path.join(save_dir, tag)
+
+ largest_group_name = groups._get_max_expert_size_name()
+ expp_rank = groups._get_expert_parallel_rank(largest_group_name)
+ exp_dp_rank = groups._get_expert_data_parallel_rank(largest_group_name)
+
+ # In the case of E + D parallelism, only the first expert
+ # data parallel rank should save the expert weights,
+ # since each expert data parallel group holds a replica of the model's experts
+ if exp_dp_rank != 0:
+ return
+
+ # Save optimizer states. They are different across each exp parallel rank.
+ optimizer_state = {
+ 'optimizer': self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None
+ }
+ # TODO: why use BufferedWriter not the path
+ file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank)
+ self.checkpoint_engine.save(optimizer_state, file_path)
+
+ # get non-moe parameters
+ model_state_dict = self._get_non_moe_state_dict(
+ self.module_state_dict(exclude_frozen_parameters=exclude_frozen_parameters))
+
+ if expp_rank == 0:
+ # TODO: update num experts info,.. in checkpoint
+ state = {
+ 'module':
+ model_state_dict,
+ 'lr_scheduler':
+ self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None,
+ 'data_sampler':
+ self.training_dataloader.data_sampler.state_dict() if
+ (self.training_dataloader is not None and self.curriculum_learning_enabled()) else None,
+ 'random_ltd':
+ self.random_ltd_scheduler.state_dict() if self.random_ltd_enabled() else None,
+ 'sparse_tensor_module_names':
+ self.sparse_tensor_module_names,
+ 'skipped_steps':
+ self.skipped_steps,
+ 'global_steps':
+ self.global_steps,
+ 'global_samples':
+ self.global_samples,
+ 'zp_world_size':
+ self.zp_world_size,
+ 'dp_world_size':
+ self.dp_world_size,
+ 'mp_world_size':
+ self.mp_world_size,
+ 'num_experts':
+ self.num_experts
+ }
+ state.update(client_state)
+ logger.info(f'Saving model checkpoint: {save_path}')
+ self.checkpoint_engine.save(state, save_path)
+
+ def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint):
+ name_function = (self._get_zero_ckpt_name if zero_checkpoint else self._get_ckpt_name)
+ try:
+ checkpoint_name = name_function(save_dir, tag)
+ path = os.path.dirname(checkpoint_name)
+ self.checkpoint_engine.makedirs(path, exist_ok=True)
+ except Exception:
+ logger.error(f"Failed saving model checkpoint to {save_dir} with tag {tag}")
+ return False
+
+ return True
+
+ def _create_zero_checkpoint_files(self, save_dir, tag):
+ success = True
+ # zero checkpoint files are created sequentially
+ for rank in range(dist.get_world_size(self.optimizer.zp_process_group)):
+ if rank == self.global_rank:
+ success = self._create_checkpoint_file(save_dir, tag, True)
+
+ dist.barrier(group=self.optimizer.zp_process_group)
+
+ return success
+
+ def _save_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_parameters=False):
+
+ save_path = self._get_ckpt_name(save_dir, tag)
+
+ zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled()
+
+ save_frozen_param = self.zero_optimization_partition_gradients() and not exclude_frozen_parameters
+
+ # A hack to save the checkpointing directory. Pipeline parallelism overrides
+ # module_state_dict() and uses this path to save the model. module_state_dict()
+ # then instead just returns None. The module_state_dict() implementation in
+ # PipelineEngine expects the save path to be set in self._curr_ckpt_path.
+ self._curr_ckpt_path = os.path.join(save_dir, tag)
+ module = self.module_state_dict(exclude_frozen_parameters=exclude_frozen_parameters)
+ self._curr_ckpt_path = None
+
+ state = dict(module=module,
+ buffer_names=self._get_buffer_names(),
+ optimizer=self.optimizer.state_dict() if self.optimizer and not zero_optimizer_state else None,
+ param_shapes=self._get_zero_param_shapes() if self.optimizer and zero_optimizer_state else None,
+ frozen_param_shapes=self._get_zero_frozen_param_attributes(self._get_param_shape_func)
+ if save_frozen_param else None,
+ shared_params=self._get_shared_params() if self.optimizer and zero_optimizer_state else None,
+ frozen_param_fragments=self._get_zero_frozen_param_attributes(self._get_param_fragment_func)
+ if save_frozen_param else None,
+ lr_scheduler=self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None,
+ data_sampler=self.training_dataloader.data_sampler.state_dict() if
+ (self.training_dataloader is not None and self.curriculum_learning_enabled()) else None,
+ random_ltd=self.random_ltd_scheduler.state_dict() if self.random_ltd_enabled() else None,
+ sparse_tensor_module_names=self.sparse_tensor_module_names,
+ skipped_steps=self.skipped_steps,
+ global_steps=self.global_steps,
+ global_samples=self.global_samples,
+ dp_world_size=self.seq_dp_world_size,
+ mp_world_size=self.mp_world_size,
+ ds_config=self.config,
+ ds_version=version)
+ state.update(client_state)
+
+ if self.save_non_zero_checkpoint:
+ log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0, 1])
+ self.checkpoint_engine.save(state, save_path)
+
+ def _get_buffer_names(self):
+ buffer_names = []
+
+ # we save buffer names so that we can later extract the real buffers from the saved
+ # state_dict["module"] in the non-zero checkpoint - the buffers are already there but
+ # intermixed with param placeholders
+
+ # have to traverse the tree to be able to skip non-persistent buffers
+ def get_layer_named_buffers(module, prefix=""):
+ for name, buf in module.named_buffers(recurse=False):
+ if buf is not None and name not in module._non_persistent_buffers_set:
+ buffer_names.append(prefix + name)
+
+ for name, child in module.named_children():
+ if child is not None:
+ get_layer_named_buffers(child, prefix + name + ".")
+
+ get_layer_named_buffers(self.module, prefix="")
+
+ return buffer_names
+
+ def _get_param_shape_func(self, param):
+ return param.ds_shape if hasattr(param, 'ds_id') else param.shape
+
+ def _get_param_fragment_func(self, param):
+ return param.ds_tensor.detach().cpu() if hasattr(param, 'ds_id') else param.detach().cpu()
+
+ def _get_zero_frozen_param_attributes(self, attr_func):
+ frozen_param_fragments = OrderedDict()
+
+ for param in self.module.parameters():
+ if param.requires_grad:
+ continue
+ if param not in self.param_names:
+ raise ValueError(f"failed to find frozen {param} in named params")
+ name = self.param_names[param]
+ frozen_param_fragments[name] = attr_func(param)
+
+ return frozen_param_fragments
+
+ def _get_zero_param_shapes(self):
+ """Returns a dict of name to shape mapping, only for the flattened fp32 weights saved by the
+ optimizer. the names are exactly as in state_dict. The order is absolutely important, since
+ the saved data is just flattened data with no identifiers and requires reconstruction in the
+ same order it was saved.
+ We can't rely on self.module.named_parameters() to get the saved tensors, as some params
+ will be missing and others unsaved and then it'd be impossible to reconstruct state_dict
+ from the flattened weights.
+ optimizer.bit16_groups seems to be the easiest to use as it's in all zeroX versions.
+ """
+ param_group_shapes = []
+ cnt = 0
+ numel = 0
+
+ # zero2 started using a round_robin_bit16_groups which is a shuffled version of bit16_groups -
+ # if we don't use it, we get parameters ordered incorrectly
+ if hasattr(self.optimizer, "round_robin_bit16_groups"):
+ bit16_groups = self.optimizer.round_robin_bit16_groups
+ elif self.bfloat16_enabled() and hasattr(self.optimizer, "bf16_groups"):
+ bit16_groups = self.optimizer.bf16_groups
+ else:
+ bit16_groups = self.optimizer.bit16_groups if self.zero_optimization_stage(
+ ) == 2 else self.optimizer.fp16_groups
+
+ for bit16_group in bit16_groups:
+ param_shapes = OrderedDict()
+ for param in bit16_group:
+ cnt += 1
+ numel += param.ds_numel if hasattr(param, "ds_numel") else param.numel()
+ shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape
+ if param not in self.param_names:
+ raise ValueError(f"failed to find optimizer param in named params")
+ name = self.param_names[param]
+ param_shapes[name] = shape
+
+ # uncomment to debug zero_to_fp32.py problems
+ # if self.global_rank == 0: print(f"saving param {name} {shape} (numel={shape.numel()})")
+ param_group_shapes.append(param_shapes)
+ # if self.global_rank == 0: print(f"Total saved {numel} numels in {cnt} params")
+
+ return param_group_shapes
+
+ def _get_shared_params(self):
+ """
+ Returns a dict of shared params, which can later be used to reconstruct the original state dict,
+ e.g. in `zero_to_fp32`. Each dict entry is a pair of param names, where the key is the name
+ of the variable that isn't stored and the value is the actual param holding data.
+ """
+ shared_index = {}
+ shared_params_by_full_name = {}
+
+ is_zero3_model = (self.zero_optimization_partition_weights()
+ and any(hasattr(param, "ds_id") for param in self.module.parameters()))
+
+ def get_layer_state_dict(module, prefix=""):
+ # handle params
+ for name, param in module.named_parameters(recurse=False):
+ if param is None or (is_zero3_model and not hasattr(param, "ds_id")):
+ continue
+ key = prefix + name
+
+ # When weights are managed by stage 3, we can't rely on param.data_ptr() as it will be reused
+ # as weights get gathered and reduced, but param.ds_id is unique across all zero weights
+ # (and shared params will have the same param.ds_id)
+ param_id = param.ds_id if is_zero3_model else param.data_ptr()
+
+ if param_id in shared_index:
+ # shared weights
+ #print(f"`{key}` is shared with `{shared_index[param_id]}`")
+ shared_params_by_full_name[key] = shared_index[param_id]
+ else:
+ shared_index[param_id] = key
+
+ for name, child in module.named_children():
+ if child is not None:
+ get_layer_state_dict(child, prefix + name + ".")
+
+ if dist.get_rank() == 0:
+ get_layer_state_dict(self.module, prefix="")
+
+ return shared_params_by_full_name
+
+ def _copy_recovery_script(self, save_path):
+ base_dir = os.path.dirname(os.path.dirname(__file__))
+ script = "zero_to_fp32.py"
+ src = os.path.join(base_dir, "utils", script)
+ dst = os.path.join(save_path, script)
+ #logger.info(f"creating recovery script {dst}")
+ copyfile(src, dst)
+ self._change_recovery_script_permissions(dst)
+
+ def _change_recovery_script_permissions(self, dst):
+ # make executable (safeguard for file shares - Azure as example)
+ try:
+ os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC)
+ except (FileNotFoundError, PermissionError) as e:
+ #this message is used in unit test TestZeRONonDistributed
+ logger.info(
+ f'Warning: Could not change permissions for {dst} due to error: {e}. Continuing without changing permissions.'
+ )
+
+ def _save_zero_checkpoint(self, save_path, tag):
+ zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag)
+ zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), ds_config=self.config, ds_version=version)
+ self.checkpoint_engine.save(zero_sd, zero_checkpoint_name)
+
+ if self.global_rank == 0:
+ self._copy_recovery_script(save_path)
+ ckpt_type = 'zero' if self.zero_optimization() else 'bf16_zero'
+ logger.info(f'{ckpt_type} checkpoint saved {zero_checkpoint_name}')
+
+ def _zero3_consolidated_16bit_state_dict(self):
+ """
+ Get a full non-partitioned state_dict with fp16 weights on cpu.
+ Important: this function must be called on all ranks and not just rank 0.
+ This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but:
+ 1. consolidates the weights from different partitions on gpu0
+ 2. works on one layer at a time to require as little gpu0 memory as possible, by
+ moving the already consolidated weights to cpu
+ 3. takes care to keep the shared params shared when gradually copying the params to cpu
+ Returns:
+ a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks
+ """
+ if not self.zero_optimization_partition_weights():
+ raise ValueError("this function requires ZeRO-3 mode")
+
+ state_dict = OrderedDict() if dist.get_rank() == 0 else None
+ shared_params = {}
+
+ def get_layer_state_dict(module, prefix=""):
+ # gather one layer at a time to be memory-efficient
+ # must use modifier_rank=0 to release GPU memory after each layer gathered
+ #see_memory_usage("before GatheredParameters", force=True)
+ with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0):
+ if dist.get_rank() == 0:
+ # handle params
+ for name, param in module.named_parameters(recurse=False):
+ if param is None:
+ continue
+ key = prefix + name
+ # can't rely on param.data_ptr() as it will be reused as weights get
+ # gathered and reduced, but param.ds_id is unique across all zero weights
+ # (and shared params will have the same param.ds_id)
+ if param.ds_id in shared_params:
+ # shared weights
+ #print(f"`{key}` is shared with `{shared_params[param.ds_id]}`")
+ state_dict[key] = state_dict[shared_params[param.ds_id]]
+ else:
+ state_dict[key] = param.detach().cpu()
+ shared_params[param.ds_id] = key
+ #print(f"param {param.ds_id} {param.shape} {key} ")
+
+ # now buffers - not sure if need to take care of potentially shared weights here
+ for name, buf in module.named_buffers(recurse=False):
+ if (buf is not None and name not in module._non_persistent_buffers_set):
+ state_dict[prefix + name] = buf.detach().cpu()
+ #see_memory_usage("after GatheredParameters", force=True)
+
+ for name, child in module.named_children():
+ if child is not None:
+ get_layer_state_dict(child, prefix + name + ".")
+
+ # Prepare for checkpoint save by ensuring all parameters are partitioned
+ if self._optimizer_has_ckpt_event_prologue():
+ self.optimizer.checkpoint_event_prologue()
+
+ see_memory_usage("before get_layer_state_dict", force=False)
+ get_layer_state_dict(self.module, prefix="")
+ see_memory_usage("after get_layer_state_dict", force=False)
+
+ if self._optimizer_has_ckpt_event_epilogue():
+ self.optimizer.checkpoint_event_epilogue()
+
+ return state_dict
+
+ def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
+ """has been renamed to save_16bit_model, keeping this around for backwards
+ compatibility"""
+ return self.save_16bit_model(save_dir, save_filename)
+
+ def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"):
+ """
+ Save 16bit model weights
+
+ This method saves the 16bit model weights at the desired destination.
+
+ Arguments:
+ save_dir: Required. Directory for saving the model
+ save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin``
+
+ Returns:
+ ``True`` when a model has been saved, ``False`` otherwise. It will not be saved if
+ stage3_gather_16bit_weights_on_model_save is ``False``.
+
+ Important: all processes must call this method and not just the process with rank 0. It is
+ because the processes need to work in sync to gather the weights. This method will hang
+ waiting to synchronize with other processes if it's called just for the process with rank 0.
+
+ """
+
+ path = os.path.join(save_dir, save_filename)
+
+ if self.zero_optimization_partition_weights():
+ if self.zero_gather_16bit_weights_on_model_save():
+ # consolidation is expensive in time and memory and therefore isn't the default
+ state_dict = self._zero3_consolidated_16bit_state_dict()
+ else:
+ # the model will be bogus if not consolidated so don't confuse the user by saving it
+ logger.info(
+ f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False")
+ return False
+ else:
+ state_dict = self.module.state_dict()
+
+ tag = f"global_step{self.global_steps}"
+ tag = str(tag)
+ self.checkpoint_engine.create(tag)
+
+ if dist.get_rank() == 0:
+ self.checkpoint_engine.makedirs(save_dir, exist_ok=True)
+ logger.info(f"Saving model weights to {path}, tag: {tag}")
+ self.checkpoint_engine.save(state_dict, path)
+
+ self.checkpoint_engine.commit(tag)
+
+ return True
+
+ def empty_partition_cache(self):
+ """
+ Release GPU memory consumed by offloaded model parameters.
+ """
+ if hasattr(self.optimizer, 'empty_partition_cache'):
+ self.optimizer.empty_partition_cache()
+ gc.collect()
+ get_accelerator().empty_cache()
diff --git a/opensora/adaptor/modules.py b/opensora/adaptor/modules.py
new file mode 100644
index 000000000..e53599d55
--- /dev/null
+++ b/opensora/adaptor/modules.py
@@ -0,0 +1,23 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def fp32_layer_norm_forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ origin_dtype = inputs.dtype
+ return F.layer_norm(inputs.float(), self.normalized_shape, self.weight.float() if self.weight is not None else None,
+ self.bias.float() if self.bias is not None else None, self.eps).to(origin_dtype)
+
+
+def fp32_silu_forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ return torch.nn.functional.silu(inputs.float(), inplace=self.inplace).to(inputs.dtype)
+
+
+def fp32_gelu_forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ return torch.nn.functional.gelu(inputs.float(), approximate=self.approximate).to(inputs.dtype)
+
+
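+# Monkey-patch the forwards of nn.GELU / nn.SiLU / nn.LayerNorm process-wide so
+# these numerically sensitive ops compute in fp32 and cast back to the input
+# dtype, for stability when training in fp16/bf16.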
+def replace_with_fp32_forwards():
+ nn.GELU.forward = fp32_gelu_forward
+ nn.SiLU.forward = fp32_silu_forward
+ nn.LayerNorm.forward = fp32_layer_norm_forward
diff --git a/opensora/adaptor/stage_1_and_2.py b/opensora/adaptor/stage_1_and_2.py
new file mode 100644
index 000000000..35f9a6d55
--- /dev/null
+++ b/opensora/adaptor/stage_1_and_2.py
@@ -0,0 +1,2522 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+import os
+import pdb
+from deepspeed import comm as dist
+from packaging import version as pkg_version
+from collections import OrderedDict
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+from deepspeed.runtime import ZeROOptimizer
+from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
+from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage,
+ inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups)
+
+from deepspeed.runtime.zero.config import ZeroStageEnum
+from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
+from deepspeed.ops.adam import DeepSpeedCPUAdam
+from deepspeed.utils import logger
+from deepspeed.moe.utils import is_moe_param
+from deepspeed.git_version_info import version
+
+from deepspeed.runtime.constants import PIPE_REPLICATED
+from deepspeed.accelerator import get_accelerator
+
+from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT, LOSS_SCALER,
+ SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE,
+ BASE_OPTIMIZER_STATE_STEP, CLIP_GRAD, ZERO_STAGE, PARAM_SLICE_MAPPINGS)
+from deepspeed.utils import link_hp_params
+from deepspeed.checkpoint import enable_universal_checkpoint
+
+from deepspeed.utils import groups
+
+from opensora.adaptor.zp_manager import zp_manager
+
+# Toggle this to true to enable correctness test
+# with gradient partitioning and without
+pg_correctness_test = False
+
+OPTIMIZER_ALLGATHER_TIMER = 'optimizer_allgather'
+OPTIMIZER_GRADIENTS_TIMER = 'optimizer_gradients'
+OPTIMIZER_STEP_TIMER = 'optimizer_step'
+OPTIMIZER_TIMERS = [OPTIMIZER_ALLGATHER_TIMER, OPTIMIZER_GRADIENTS_TIMER, OPTIMIZER_STEP_TIMER]
+
+
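+# Shadow the builtin input() so any leftover interactive debug pauses in this
+# module become no-ops.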
+def input(msg):
+ return
+
+
+def split_half_float_double(tensors):
+ device_type = get_accelerator().device_name()
+ dtypes = [
+ "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type),
+ "torch.{}.DoubleTensor".format(device_type), "torch.{}.BFloat16Tensor".format(device_type)
+ ]
+ buckets = []
+ for i, dtype in enumerate(dtypes):
+ bucket = [t for t in tensors if t.type() == dtype]
+ if bucket:
+ buckets.append(bucket)
+ return buckets
+
+
+def isclose(a, b, rtol=1e-09, atol=0.0):
+ return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol)
+
+
+def lcm(x, y):
+ from math import gcd # fractions.gcd was removed in Python 3.9; gcd lives in math since 3.5
+ return x * y // gcd(x, y)
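+# e.g. lcm(4, 6) == 24 // 2 == 12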
+
+
+def get_alignment_padding(tensor_list, alignment):
+ num_elements = sum([tensor.numel() for tensor in tensor_list])
+ remainder = num_elements % alignment
+ return (alignment - remainder) if remainder else remainder
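+# e.g. 10 total elements with alignment 4 leave a remainder of 2, so 2 padding
+# elements are required; an exact multiple needs no padding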
+
+
+def move_to_cpu(tensor_list):
+ for tensor in tensor_list:
+ tensor.data = tensor.data.cpu()
+
+
+def print_rank_msg(msg):
+ print(f"rank {dist.get_rank()} - {msg}")
+
+
+def _get_padded_tensor(src_tensor, size):
+ if src_tensor.numel() >= size:
+ return src_tensor
+ padded_tensor = torch.zeros(size, dtype=src_tensor.dtype, device=src_tensor.device)
+ slice_tensor = torch.narrow(padded_tensor, 0, 0, src_tensor.numel())
+ slice_tensor.data.copy_(src_tensor.data)
+ return padded_tensor
+
+
+def contigous_flatten(tensors):
+ return _flatten_dense_tensors([tensor.contiguous() for tensor in tensors])
+
+
+def all_gather_into_tensor_dp_groups(groups_flat, partitioned_param_groups, zp_process_group):
+ for group_id, (group_flat, partitioned_params) in enumerate(zip(groups_flat, partitioned_param_groups)):
+ partition_id = dist.get_rank(group=zp_process_group[group_id])
+ dp_world_size = dist.get_world_size(group=zp_process_group[group_id])
+ if dp_world_size == 1:
+ # no groups share optimizer states
+ # pipeline parallel with bf16 will default call this even if dp size = 1.
+ continue
+ input_tensor = partitioned_params[partition_id].contiguous()
+ # print(f"call all_gather_into_tensor_dp_groups, input size is {input_tensor.size()}, "
+ # f"output size is {group_flat.size()}")
+ #
+ # print(f"groups_flat.size = {groups_flat.numel()}")
+ # print(f"partitioned_param_groups = {sum([v.numel() for v in partitioned_param_groups])}")
+ dist.all_gather_into_tensor(group_flat, input_tensor, zp_process_group[group_id])
+
+
+class DeepSpeedZeroOptimizer(ZeROOptimizer):
+ """
+ DeepSpeedZeroOptimizer designed to reduce the memory footprint
+ required for training large deep learning models.
+
+ For more details please see ZeRO: Memory Optimizations Toward Training Trillion Parameter Models
+ https://arxiv.org/abs/1910.02054
+
+ For usage examples, refer to TODO: DeepSpeed Tutorial
+
+ """
+
+ def __init__(self,
+ init_optimizer,
+ param_names,
+ timers,
+ static_loss_scale=1.0,
+ dynamic_loss_scale=False,
+ dynamic_loss_args=None,
+ verbose=True,
+ contiguous_gradients=True,
+ reduce_bucket_size=500000000,
+ use_multi_rank_bucket_allreduce=True,
+ allgather_bucket_size=5000000000,
+ dp_process_group=None,
+ expert_parallel_group=None,
+ expert_data_parallel_group=None,
+ reduce_scatter=True,
+ overlap_comm=False,
+ offload_optimizer_config=None,
+ mpu=None,
+ clip_grad=0.0,
+ gradient_accumulation_dtype=torch.float32,
+ communication_data_type=torch.float16,
+ postscale_gradients=True,
+ gradient_predivide_factor=1.0,
+ gradient_accumulation_steps=1,
+ ignore_unused_parameters=True,
+ partition_grads=True,
+ round_robin_gradients=False,
+ has_moe_layers=False,
+ fp16_master_weights_and_gradients=False,
+ elastic_checkpoint=False):
+
+ if offload_optimizer_config is not None and offload_optimizer_config.device != OffloadDeviceEnum.none:
+ self.cpu_offload = True
+ self.cpu_offload_pin_memory = offload_optimizer_config.pin_memory
+ else:
+ self.cpu_offload = False
+ self.cpu_offload_pin_memory = False
+
+ if dist.get_rank() == 0:
+ logger.info(f"Reduce bucket size {reduce_bucket_size}")
+ logger.info(f"Allgather bucket size {allgather_bucket_size}")
+ logger.info(f"CPU Offload: {self.cpu_offload}")
+ logger.info(f'Round robin gradient partitioning: {round_robin_gradients}')
+ # The fused optimizer does all the work. We need this layer for two reasons:
+ # 1. maintain the same user API as apex.fp16_utils
+ # 2. keep common stuff here in case we need to add new fused optimizers later
+
+ self.elastic_checkpoint = elastic_checkpoint
+ self.param_names = param_names
+ self.mpu = mpu
+ # differences from apex.fp16_utils:
+ # - assume all model params in fp16
+ # - assume all params require grad
+ # - flat by groups, not keeping state. TODO: remove state explicitly?
+ # - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
+ if not get_accelerator().is_available():
+ raise SystemError("Accelerator is not detected, cannot perform low precision training (e.g., fp16, bf16).")
+ self.optimizer = init_optimizer
+
+ # Use torch (un)flatten ops
+ self.flatten = contigous_flatten
+ self.unflatten = _unflatten_dense_tensors
+
+ # ZeRO stage 1 (False) or 2 (True)
+ self.partition_gradients = partition_grads
+ self.zero_stage_string = "ZeRO-2" if partition_grads else "ZeRO-1"
+
+ self.timers = timers
+
+ self.reduce_scatter = reduce_scatter
+
+ self.overlap_comm = overlap_comm
+
+ self.deepspeed_adam_offload = self.cpu_offload
+
+ self.device = get_accelerator().current_device_name() if not self.cpu_offload else 'cpu'
+
+ zp_manager.init_group()
+ self.zp_process_group = zp_manager.zp_group
+ zp_rank = dist.get_rank(group=self.zp_process_group)
+ zp_size = dist.get_world_size(group=self.zp_process_group)
+ print(f"zp rank is {zp_rank}, zp_size={zp_size}")
+
+ self.dp_process_group = dp_process_group
+
+ self.sequence_parallel_size = groups._get_sequence_parallel_world_size()
+ # expert parallel group
+ self.ep_process_group = expert_parallel_group
+
+ # data parallel group for experts
+ self.expert_dp_process_group = expert_data_parallel_group
+
+ # data parallel size for non-experts
+ dp_size = dist.get_world_size(group=self.dp_process_group)
+
+ # For MoE models this may be different for different param groups.
+ # It will be modified during MoE setup later in the init.
+ self.real_zp_process_group = [self.zp_process_group for i in range(len(self.optimizer.param_groups))]
+ self.real_dp_process_group = [self.dp_process_group for i in range(len(self.optimizer.param_groups))]
+ self.partition_count = [zp_manager.zp_size for i in range(len(self.optimizer.param_groups))]
+
+ self.is_gradient_accumulation_boundary = True
+
+ # CPU-Offload requires contiguous gradients
+ self.contiguous_gradients = contiguous_gradients or self.cpu_offload
+
+ self.has_moe_layers = has_moe_layers
+ if self.has_moe_layers:
+ self._configure_moe_settings()
+ self._global_grad_norm = 0.
+
+ if mpu is None:
+ self.model_parallel_group = None
+ self.model_parallel_world_size = 1
+ self.model_parallel_rank = 0
+ else:
+ self.model_parallel_group = mpu.get_model_parallel_group()
+ self.model_parallel_world_size = mpu.get_model_parallel_world_size()
+ self.model_parallel_rank = bwc_tensor_model_parallel_rank(mpu)
+
+ self.overflow = False
+ self.clip_grad = clip_grad
+ self.communication_data_type = communication_data_type
+ self.gradient_predivide_factor = gradient_predivide_factor
+ self.postscale_gradients = postscale_gradients
+ self.gradient_accumulation_steps = gradient_accumulation_steps
+ self.micro_step_id = 0
+ self.ignore_unused_parameters = ignore_unused_parameters
+ self.round_robin_gradients = round_robin_gradients
+
+ self.extra_large_param_to_reduce = None
+ self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients
+
+ if self.fp16_master_weights_and_gradients:
+ assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], \
+ f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32." \
+ f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}." \
+ f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam."
+
+ if self.reduce_scatter:
+ valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32)
+ assert self.communication_data_type in valid_reduce_scatter_dtypes, f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
+ assert self.gradient_predivide_factor == 1.0, f"gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
+ assert self.postscale_gradients, f"pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
+
+ # param flattened by groups
+ self.bit16_groups = []
+ self.bit16_groups_flat = []
+
+ # param partitioned by data parallel degree
+ # this will contain a list of equal sized tensors
+ # each of which will be updated by a different process
+ self.parallel_partitioned_bit16_groups = []
+
+ # a single 32-bit partition of the parallel partitioned parameters
+ # that this process will update
+ self.single_partition_of_fp32_groups = []
+
+ # param partition info
+
+ # These are the parameters in each group that will not be updated by this process directly
+ self.params_not_in_partition = []
+
+ # These are the parameters that will be updated by this process directly
+ self.params_in_partition = []
+
+ # Offset from the first parameter in the self.params_in_partition
+ # the parameter boundaries may not align with partition boundaries
+ # so we need to keep track of the offset
+ self.first_offset = []
+
+ # number of elements per partition in each group
+ self.partition_size = []
+
+ # align nccl all-gather send buffers; the stock factor is 2
+ # (4-byte alignment / sizeof(fp16)), raised to 16 in this fork
+ self.nccl_start_alignment_factor = 16
+
+ assert (
+ allgather_bucket_size % self.nccl_start_alignment_factor == 0
+ ), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} "
+
+ self.all_reduce_print = False
+ self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
+ self.gradient_accumulation_dtype = gradient_accumulation_dtype
+
+ if self.dtype != self.gradient_accumulation_dtype:
+ self.use_separate_grad_accum = True
+ else:
+ self.use_separate_grad_accum = False
+ if self.use_separate_grad_accum and not self.partition_gradients:
+ self.use_grad_accum_attribute = True
+ else:
+ self.use_grad_accum_attribute = False
+
+ self.round_robin_bit16_groups = []
+ self.round_robin_bit16_indices = []
+
+ # Use a different parallel group for all_to_all_reduce related work
+ # padding on each partition for alignment purposes
+ self.groups_padding = []
+ # loop to deal with groups
+ for i, param_group in enumerate(self.optimizer.param_groups):
+ partition_id = dist.get_rank(group=self.real_zp_process_group[i])
+
+ # push this group to the list before modifying it
+ # TODO: Explore simplification that avoids the extra book-keeping by pushing the reordered group
+ trainable_parameters = []
+ for param in param_group['params']:
+ if param.requires_grad:
+ param.grad_accum = None
+ trainable_parameters.append(param)
+ self.bit16_groups.append(trainable_parameters)
+
+ # not sure why apex was cloning the weights before flattening
+ # removing cloning here
+
+ see_memory_usage(f"Before moving param group {i} to CPU")
+ # move all the parameters to cpu to free up GPU space for creating flat buffer
+ move_to_cpu(self.bit16_groups[i])
+ empty_cache()
+ see_memory_usage(f"After moving param group {i} to CPU", force=False)
+
+ # Reorder group parameters for load balancing of gradient partitioning during backward among ranks.
+ # This ensures that gradients are reduced in a fashion such that ownership round robins among the ranks.
+ # For example, rather than 3 gradients (g_n+2, g_n+1, g_n) that are reduced consecutively belonging
+ # to the same rank, instead they will belong to 3 ranks (r_m+2, r_m+1, r_m).
+ if self.round_robin_gradients:
+ round_robin_tensors, round_robin_indices = self._round_robin_reorder(
+ self.bit16_groups[i], dist.get_world_size(group=self.real_zp_process_group[i]))
+ else:
+ round_robin_tensors = self.bit16_groups[i]
+ round_robin_indices = list(range(len(self.bit16_groups[i])))
+
+ self.round_robin_bit16_groups.append(round_robin_tensors)
+ self.round_robin_bit16_indices.append(round_robin_indices)
+
+ # create flat buffer in CPU and move to GPU
+ self.bit16_groups_flat.append(
+ self.flatten_dense_tensors_aligned(
+ self.round_robin_bit16_groups[i],
+ self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_zp_process_group[i])).to(
+ get_accelerator().current_device_name()))
+ see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False)
+
+ # Record padding required for alignment
+ if partition_id == dist.get_world_size(group=self.real_zp_process_group[i]) - 1:
+ padding = self.bit16_groups_flat[i].numel() - sum(
+ [t.numel() for t in self.round_robin_bit16_groups[i]])
+ else:
+ padding = 0
+ self.groups_padding.append(padding)
+
+ if dist.get_rank(group=self.real_zp_process_group[i]) == 0:
+ see_memory_usage(f"After Flattening and after emptying param group {i} cache", force=False)
+
+ # set model bit16 weight to slices of flattened buffer
+ self._update_model_bit16_weights(i)
+
+ # divide the flat weights into near-equal partitions, as many as the data parallel degree;
+ # each process will compute on a different part of it
+ data_parallel_partitions = self.get_data_parallel_partitions(self.bit16_groups_flat[i], i)
+ self.parallel_partitioned_bit16_groups.append(data_parallel_partitions)
+
+ # print(f"self.bit16_groups_flat[i].size = {self.bit16_groups_flat[i].numel()}")
+ # print(f"data_parallel_partitions = {sum([v.numel() for v in data_parallel_partitions])}")
+
+ # verify that data partition start locations are aligned to the all-gather boundary
+ for partitioned_data in data_parallel_partitions:
+ assert (partitioned_data.data_ptr() % (2 * self.nccl_start_alignment_factor) == 0)
+
+ # A partition of the fp32 master weights that will be updated by this process.
+ # Note that the params in single_partition_of_fp32_groups is cloned and detached
+ # from the origin params of the model.
+ if not fp16_master_weights_and_gradients:
+ self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to(
+ self.device).clone().float().detach())
+ else:
+ self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to(
+ self.device).clone().half().detach())
+
+ # Set local optimizer to have flat params of its own partition.
+ # After this, the local optimizer will only contain its own partition of params.
+ # In that case, the local optimizer only saves the states (momentum, variance, etc.) of its own partition's params (ZeRO stage 1).
+ self.single_partition_of_fp32_groups[
+ i].requires_grad = True # keep this in case internal optimizer uses it
+ param_group['params'] = [self.single_partition_of_fp32_groups[i]]
+
+ partition_size = len(self.bit16_groups_flat[i]) / dist.get_world_size(group=self.real_zp_process_group[i])
+ params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(
+ self.round_robin_bit16_groups[i], partition_size, partition_id)
+
+ self.partition_size.append(partition_size)
+ self.params_in_partition.append(params_in_partition)
+ self.params_not_in_partition.append(params_not_in_partition)
+ self.first_offset.append(first_offset)
+
+ self.reduce_bucket_size = int(reduce_bucket_size)
+ self.use_multi_rank_bucket_allreduce = use_multi_rank_bucket_allreduce
+ self.allgather_bucket_size = int(allgather_bucket_size)
+
+ self.reduction_stream = None if get_accelerator().is_synchronized_device() else get_accelerator().Stream()
+ # self.copy_grad_stream = get_accelerator().Stream()
+ self.callback_queued = False
+
+ self.param_dict = {}
+
+ # map between param_id and bool to specify if a param is in this partition
+ self.is_param_in_current_partition = {}
+
+ self.grads_in_ipg_bucket = []
+ self.params_in_ipg_bucket = []
+ self.elements_in_ipg_bucket = 0
+ self.params_already_reduced = []
+ self._release_ipg_buffers()
+ self.previous_reduced_grads = None
+ self.ipg_bucket_has_moe_params = False
+
+ # simplified param id
+ self.param_id = {}
+
+ # interesting code: unique ids being assigned to individual parameters
+ largest_param_numel = 0
+ count = 0
+ for i, params_group in enumerate(self.bit16_groups):
+ for param in params_group:
+ unique_id = id(param)
+ self.param_id[unique_id] = count
+ self.param_dict[count] = param
+ self.params_already_reduced.append(False)
+ if param.numel() > largest_param_numel:
+ largest_param_numel = param.numel()
+ count = count + 1
+
+ for param_group in self.params_in_partition:
+ for param in param_group:
+ self.is_param_in_current_partition[self.get_param_id(param)] = True
+
+ for param_group in self.params_not_in_partition:
+ for param in param_group:
+ self.is_param_in_current_partition[self.get_param_id(param)] = False
+
+ if self.cpu_offload:
+ self.accumulated_grads_in_cpu = {}
+ self.norm_for_param_grads = {}
+ self.local_overflow = False
+ self.grad_position = {}
+ self.temp_grad_buffer_for_cpu_offload = torch.zeros(largest_param_numel,
+ device=self.device,
+ dtype=self.dtype)
+ if self.cpu_offload_pin_memory:
+ self.temp_grad_buffer_for_cpu_offload = get_accelerator().pin_memory(
+ self.temp_grad_buffer_for_cpu_offload)
+ self.temp_grad_buffer_for_gpu_offload = torch.zeros(largest_param_numel,
+ device=get_accelerator().current_device_name(),
+ dtype=self.dtype)
+ for i, params_group in enumerate(self.bit16_groups):
+ self.get_grad_position(i, self.params_in_partition[i], self.first_offset[i], self.partition_size[i])
+
+ # mapping from parameter to partition that it belongs to
+ self.param_to_partition_ids = {}
+
+ # stores if a partition has been reduced in this step
+ self.is_partition_reduced = {}
+
+ # number of grads in partition that still need to be computed
+ self.remaining_grads_in_partition = {}
+
+ # total number of grads in partition
+ self.total_grads_in_partition = {}
+
+ # stores if a grad in a partition has been computed or not
+ self.is_grad_computed = {}
+
+ # stores the offset at which a parameter gradient needs to be inserted in a partition
+ self.grad_partition_insertion_offset = {}
+
+ # the offset in the gradient at which it must be inserted at the beginning of the partition
+ self.grad_start_offset = {}
+
+ # will store the averaged gradients required by this partition
+ self.averaged_gradients = {}
+
+ # For cpu_offload, will store the averaged gradients required by this partition
+ self.offload_gradient_dict = {}
+
+ # store index of first parameter in each partition
+ self.first_param_index_in_partition = {}
+
+ # initializes all data structures for implementing gradient partitioning
+ self.initialize_gradient_partitioning_data_structures()
+
+ # resets the data structure value for the next backward propagation
+ self.reset_partition_gradient_structures()
+
+ # creates backward hooks for gradient partitioning
+ if self.partition_gradients or self.overlap_comm:
+ self.create_reduce_and_remove_grad_hooks()
+
+ self.custom_loss_scaler = False
+ self.external_loss_scale = None
+
+ # we may have a way of fusing dynamic scale; not supported for now
+ self.loss_scaler = CreateLossScaler(dtype=self.dtype,
+ static_loss_scale=static_loss_scale,
+ dynamic_scaling=dynamic_loss_scale,
+ dynamic_loss_args=dynamic_loss_args)
+ self.dynamic_loss_scale = self.loss_scaler.dynamic
+
+ if self.dtype != torch.float16:
+ # Only fp16 should use dynamic loss scaling
+ assert self.loss_scaler.cur_scale == 1.0
+ assert not self.dynamic_loss_scale
+
+ see_memory_usage("Before initializing optimizer states", force=True)
+ self.initialize_optimizer_states()
+ see_memory_usage("After initializing optimizer states", force=True)
+
+ if dist.get_rank() == 0:
+ logger.info(f"optimizer state initialized")
+
+ if dist.get_rank(group=self.zp_process_group) == 0:
+ see_memory_usage(f"After initializing ZeRO optimizer", force=True)
+
+ self._link_all_hp_params()
+ self._enable_universal_checkpoint()
+ self._param_slice_mappings = self._create_param_mapping()
+
+ def _enable_universal_checkpoint(self):
+ for lp_param_group in self.bit16_groups:
+ enable_universal_checkpoint(param_list=lp_param_group)
+
+ def _create_param_mapping(self):
+ param_mapping = []
+ for i, _ in enumerate(self.optimizer.param_groups):
+ param_mapping_per_group = OrderedDict()
+ for lp in self.bit16_groups[i]:
+ if lp._hp_mapping is not None:
+ lp_name = self.param_names[lp]
+ param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address()
+ param_mapping.append(param_mapping_per_group)
+
+ return param_mapping
+
+ def _link_all_hp_params(self):
+ dp_world_size = dist.get_world_size(group=self.zp_process_group)
+ if self.cpu_offload:
+ self._get_offload_gradient_dict()
+
+ for i, _ in enumerate(self.optimizer.param_groups):
+ # Link bit16 and fp32 params in partition
+ partition_id = dist.get_rank(group=self.real_zp_process_group[i])
+ partition_size = self.bit16_groups_flat[i].numel() // dp_world_size
+ flat_hp_partition = self.single_partition_of_fp32_groups[i]
+ link_hp_params(lp_param_list=self.bit16_groups[i],
+ flat_hp_partition=flat_hp_partition,
+ gradient_dict=self.averaged_gradients,
+ offload_gradient_dict=self.offload_gradient_dict,
+ use_offload=self.cpu_offload,
+ param_group_index=i,
+ partition_start=partition_id * partition_size,
+ partition_size=partition_size,
+ partition_optimizer_state=self.optimizer.state[flat_hp_partition],
+ dp_group=self.real_zp_process_group[i])
+
+ def is_moe_group(self, group):
+ return 'moe' in group and group['moe']
+
+ def _configure_moe_settings(self):
+ # if we're using ZeRO stage 2, ensure contiguous gradients are used
+ if self.partition_gradients:
+ assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE"
+ # NOTE: To run ZeRO stage 1 with MoE, we need to set self.contiguous_gradients to True or ignore the assertion
+ if not self.partition_gradients and not self.contiguous_gradients:
+ logger.warn(
+ "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental.")
+ assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE"
+
+ assert any(
+ [self.is_moe_group(group) for group in self.optimizer.param_groups]
+ ), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer"
+ self.is_moe_param_group = []
+ for i, group in enumerate(self.optimizer.param_groups):
+ if self.is_moe_group(group):
+ assert all([is_moe_param(param)
+ for param in group['params']]), "All params in MoE group must be MoE params"
+ self.real_zp_process_group[i] = self.expert_dp_process_group[group['name']]
+ self.partition_count[i] = dist.get_world_size(group=self.expert_dp_process_group[group['name']])
+ self.is_moe_param_group.append(True)
+ else:
+ self.is_moe_param_group.append(False)
+
+ assert self.expert_dp_process_group is not None, "Expert data parallel group should be configured with MoE"
+ assert self.ep_process_group is not None, "Expert parallel group should be configured with MoE"
+
+ def _update_model_bit16_weights(self, group_index):
+ updated_params = self.unflatten(self.bit16_groups_flat[group_index],
+ self.round_robin_bit16_groups[group_index])
+ for p, q in zip(self.round_robin_bit16_groups[group_index], updated_params):
+ p.data = q.data
+
+ # set model fp16 weight to slices of reordered flattened buffer
+ for param_index, param in enumerate(self.bit16_groups[group_index]):
+ new_index = self.round_robin_bit16_indices[group_index][param_index]
+ param.data = self.round_robin_bit16_groups[group_index][new_index].data
+
+ def _round_robin_reorder(self, tensor_list, num_partitions):
+
+ # to disable round robin for debugging, uncomment the next line
+ # return tensor_list, list(range(len(tensor_list)))
+
+ partition_tensors = {}
+
+ for i, tensor in enumerate(tensor_list):
+ j = i % num_partitions
+ if j not in partition_tensors:
+ partition_tensors[j] = []
+ partition_tensors[j].append((i, tensor))
+
+ reordered_tensors = []
+ reordered_indices = {}
+
+ for partition_index in partition_tensors.keys():
+ for i, (original_index, tensor) in enumerate(partition_tensors[partition_index]):
+ reordered_indices[original_index] = len(reordered_tensors)
+ reordered_tensors.append(tensor)
+
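+ # Example: 5 tensors over 2 partitions bucket as p0=(t0, t2, t4), p1=(t1, t3),
+ # giving reordered_tensors=[t0, t2, t4, t1, t3] and
+ # reordered_indices={0: 0, 2: 1, 4: 2, 1: 3, 3: 4}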
+ return reordered_tensors, reordered_indices
+
+ def _release_ipg_buffers(self):
+ if self.contiguous_gradients:
+ self.ipg_buffer = None
+ self.grads_in_partition = None
+ self.grads_in_partition_offset = 0
+
+ def initialize_optimizer_states(self):
+
+ for i, group in enumerate(self.bit16_groups):
+ single_grad_partition = torch.zeros(int(self.partition_size[i]),
+ dtype=self.single_partition_of_fp32_groups[i].dtype,
+ device=self.device)
+ self.single_partition_of_fp32_groups[i].grad = get_accelerator().pin_memory(
+ single_grad_partition) if self.cpu_offload_pin_memory else single_grad_partition
+
+ # Initialize the optimizer states with the flattened fp32 partition.
+ # State initialization for the Adagrad optimizer occurs at construction as opposed to other optimizers
+ # which do lazy initialization of the state at the first call to step.
+ if isinstance(self.optimizer, torch.optim.Adagrad):
+ self.optimizer = torch.optim.Adagrad(self.single_partition_of_fp32_groups, **self.optimizer.defaults)
+ else:
+ self.optimizer.step()
+
+ if not self.cpu_offload:
+ for group in self.single_partition_of_fp32_groups:
+ group.grad = None # class init
+
+ return
+
+ #########################################################################
+ #################### ZeRO Stage 1 - reduce gradients ####################
+ #########################################################################
+ def reduce_gradients(self, pipeline_parallel=False):
+ world_size = dist.get_world_size(self.zp_process_group)
+ my_rank = dist.get_rank(self.zp_process_group)
+
+ # with PP we must create ipg buffer, since backward is handled outside zero
+ if pipeline_parallel and self.contiguous_gradients:
+ self.ipg_buffer = []
+ buf_0 = torch.empty(int(self.reduce_bucket_size),
+ dtype=self.dtype,
+ device=get_accelerator().current_device_name())
+ self.ipg_buffer.append(buf_0)
+ self.ipg_index = 0
+
+ if not self.overlap_comm:
+ for i, group in enumerate(self.bit16_groups):
+ for param in group:
+ grad_reduc = self.get_gradient_for_reduction(param)
+ if grad_reduc is not None:
+ self.reduce_ready_partitions_and_remove_grads(param, i)
+ # reduce any pending grads in either hook/non-hook case
+ self.overlapping_partition_gradients_reduce_epilogue()
+
+ #########################################################################
+ #########################ZeRO Partition Gradients########################
+ #########################################################################
+
+ def get_first_param_index(self, group_id, param_group, partition_id):
+ for index, param in enumerate(param_group):
+ param_id = self.get_param_id(param)
+ if partition_id in self.param_to_partition_ids[group_id][param_id]:
+ return index
+ return None
+
+ def initialize_gradient_partitioning_data_structures(self):
+
+ for i, param_group in enumerate(self.round_robin_bit16_groups):
+ total_partitions = dist.get_world_size(group=self.real_zp_process_group[i])
+
+ self.param_to_partition_ids[i] = {}
+ self.is_partition_reduced[i] = {}
+ self.total_grads_in_partition[i] = {}
+ self.remaining_grads_in_partition[i] = {}
+ self.is_grad_computed[i] = {}
+ self.grad_partition_insertion_offset[i] = {}
+ self.grad_start_offset[i] = {}
+ self.first_param_index_in_partition[i] = {}
+
+ for partition_id in range(total_partitions):
+ self.is_grad_computed[i][partition_id] = {}
+ self.grad_partition_insertion_offset[i][partition_id] = {}
+ self.grad_start_offset[i][partition_id] = {}
+ self.total_grads_in_partition[i][partition_id] = 0
+ self.initialize_gradient_partition(i, param_group, partition_id)
+ self.is_partition_reduced[i][partition_id] = False
+ self.first_param_index_in_partition[i][partition_id] = self.get_first_param_index(
+ i, param_group, partition_id)
+
+ def independent_gradient_partition_epilogue(self):
+ self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0)
+ self.reduce_ipg_grads()
+ self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0)
+
+ # if dist.get_rank() == 0:
+ # logger.info("Params already reduced %s", self.params_already_reduced)
+ for i in range(len(self.params_already_reduced)):
+ self.params_already_reduced[i] = False
+
+ if self.overlap_comm:
+ get_accelerator().synchronize()
+ # It is safe to clear previously reduced grads of other partitions
+ self._clear_previous_reduced_grads()
+
+ if self.cpu_offload is False:
+ for i, _ in enumerate(self.bit16_groups):
+
+                if i not in self.averaged_gradients or self.averaged_gradients[i] is None:
+ self.averaged_gradients[i] = self.get_flat_partition(
+ self.params_in_partition[i],
+ self.first_offset[i],
+ self.partition_size[i],
+ dtype=self.gradient_accumulation_dtype,
+ device=get_accelerator().current_device_name(),
+ return_tensor_list=True)
+ else:
+ avg_new = self.get_flat_partition(self.params_in_partition[i],
+ self.first_offset[i],
+ self.partition_size[i],
+ dtype=self.gradient_accumulation_dtype,
+ device=get_accelerator().current_device_name(),
+ return_tensor_list=True)
+
+ for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i], avg_new):
+ accumulated_grad.add_(new_avg_grad)
+
+ self._release_ipg_buffers()
+
+ # No need to keep the gradients anymore.
+ # All gradients required by the step
+ # are in self.averaged_gradients
+ self.zero_grad(set_to_none=True)
+ see_memory_usage(f"End ipg_epilogue")
+
+    # resets all partitions to not reduced,
+    # sets remaining grads to the total number of grads in each partition,
+    # and sets is_grad_computed to False for all grads in each partition
+ def reset_partition_gradient_structures(self):
+ for i, _ in enumerate(self.bit16_groups):
+ total_partitions = dist.get_world_size(group=self.real_zp_process_group[i])
+ for partition_id in range(total_partitions):
+ self.is_partition_reduced[i][partition_id] = False
+ self.remaining_grads_in_partition[i][partition_id] = self.total_grads_in_partition[i][partition_id]
+
+ for param_id in self.is_grad_computed[i][partition_id]:
+ self.is_grad_computed[i][partition_id][param_id] = False
+
+ def initialize_gradient_partition(self, i, param_group, partition_id):
+
+ def set_key_value_list(dictionary, key, value):
+ if key in dictionary:
+ dictionary[key].append(value)
+ else:
+ dictionary[key] = [value]
+
+ def increment_value(dictionary, key):
+ if key in dictionary:
+ dictionary[key] += 1
+ else:
+ dictionary[key] = 1
+
+ partition_size = self.partition_size[i]
+
+ start_index = partition_size * partition_id
+ end_index = partition_size * (partition_id + 1)
+
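+        # Illustrative example: with partition_size=100 and partition_id=1 this
+        # partition owns flat indices [100, 200); a parameter occupying [90, 150)
+        # starts before the partition boundary, so the second branch below records
+        # it with first_offset = 100 - 90 = 10.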
+ current_index = 0
+ first_offset = 0
+
+ for param in param_group:
+
+ param_size = param.numel()
+ param_id = self.get_param_id(param)
+
+ if start_index <= current_index < end_index:
+ set_key_value_list(self.param_to_partition_ids[i], param_id, partition_id)
+ increment_value(self.total_grads_in_partition[i], partition_id)
+
+ self.is_grad_computed[i][partition_id][param_id] = False
+
+ self.grad_partition_insertion_offset[i][partition_id][param_id] = current_index - start_index
+ self.grad_start_offset[i][partition_id][param_id] = 0
+
+ elif current_index < start_index < (current_index + param_size):
+                assert (first_offset == 0
+                        ), "This can happen at most once, since this must be the first tensor in the partition"
+ first_offset = start_index - current_index
+
+ set_key_value_list(self.param_to_partition_ids[i], param_id, partition_id)
+ increment_value(self.total_grads_in_partition[i], partition_id)
+
+ self.is_grad_computed[i][partition_id][param_id] = False
+
+ self.grad_partition_insertion_offset[i][partition_id][param_id] = 0
+ self.grad_start_offset[i][partition_id][param_id] = first_offset
+
+ current_index = current_index + param_size
+
+ def overlapping_partition_gradients_reduce_epilogue(self):
+ self.independent_gradient_partition_epilogue()
+
+ def fill_grad_accum_attribute(self):
+ for group in self.bit16_groups:
+ for param in group:
+ if param.grad is not None:
+ if param.grad_accum is None:
+ param.grad_accum = param.grad.to(self.gradient_accumulation_dtype)
+ else:
+ param.grad_accum.add_(
+ param.grad.to(self.gradient_accumulation_dtype).view(param.grad_accum.shape))
+ param.grad = None
+
+ def get_gradient_for_reduction(self, param):
+ if self.use_grad_accum_attribute:
+ return param.grad_accum.to(self.dtype) if param.grad_accum is not None else None
+ else:
+ return param.grad
+
+ def get_param_gradient_attribute(self, param):
+ return param.grad_accum if self.use_grad_accum_attribute else param.grad
+
+ # Clear the tensor the reduction gradient attribute is pointing to
+ def clear_grad_attribute(self, param):
+ if self.use_grad_accum_attribute:
+ param.grad_accum = None
+ else:
+ param.grad = None
+
+ def create_reduce_and_remove_grad_hooks(self):
+ self.grad_accs = []
+ for i, param_group in enumerate(self.bit16_groups):
+ for param in param_group:
+ if param.requires_grad:
+ def wrapper(param, i):
+ param_tmp = param.expand_as(param)
+ grad_acc = param_tmp.grad_fn.next_functions[0][0]
+
+ def reduce_partition_and_remove_grads(*notneeded):
+ self.reduce_ready_partitions_and_remove_grads(param, i)
+
+ grad_acc.register_hook(reduce_partition_and_remove_grads)
+ self.grad_accs.append(grad_acc)
+
+ wrapper(param, i)
+
+ def get_param_id(self, param):
+ unique_id = id(param)
+ return self.param_id[unique_id]
+
+ def report_ipg_memory_usage(self, tag, param_elems):
+ elem_count = self.elements_in_ipg_bucket + param_elems
+ percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size
+ see_memory_usage(
+ f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}"
+ )
+
+ # create a flat tensor aligned at the alignment boundary
+ def flatten_dense_tensors_aligned(self, tensor_list, alignment):
+ return self.flatten(align_dense_tensors(tensor_list, alignment))
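+    # (align_dense_tensors is assumed to pad the tensor list so the flattened
+    # size is a multiple of `alignment`, e.g. 10 total elements with
+    # alignment=4 flatten to a 12-element tensor.)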
+
+ ############### Independent Partition Gradient ########################
+ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i):
+
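+        # Sketch of the bucketing flow: if adding this param would overflow the
+        # bucket, flush the pending grads first; with overlap_comm and contiguous
+        # gradients the two ipg buffers are ping-ponged so the next bucket can
+        # fill while the previous one is still reducing.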
+ grad_reduc = self.get_gradient_for_reduction(param)
+ if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size:
+ self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel())
+ self.reduce_ipg_grads()
+ if self.contiguous_gradients and self.overlap_comm:
+ # Swap ipg_index between 0 and 1
+ self.ipg_index = 1 - self.ipg_index
+ self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", param.numel())
+
+ param_id = self.get_param_id(param)
+ assert self.params_already_reduced[param_id] == False, \
+ f"The parameter {param_id} has already been reduced. \
+ Gradient computed twice for this partition. \
+ Multiple gradient reduction is currently not supported"
+
+ if self.contiguous_gradients:
+ if param.numel() > self.reduce_bucket_size:
+ self.extra_large_param_to_reduce = param
+ else:
+ # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening
+ new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow(0, self.elements_in_ipg_bucket, param.numel())
+ new_grad_tensor.copy_(grad_reduc.view(-1))
+ grad_reduc.data = new_grad_tensor.data.view_as(grad_reduc)
+
+ self.elements_in_ipg_bucket += param.numel()
+
+ assert grad_reduc is not None, f"rank {dist.get_rank()} - Invalid to reduce Param {param_id} with None gradient"
+
+ self.grads_in_ipg_bucket.append(grad_reduc)
+ self.params_in_ipg_bucket.append((i, param, param_id))
+
+ # make sure the average tensor function knows how to average the gradients
+ if is_moe_param(param):
+ self.ipg_bucket_has_moe_params = True
+
+ self.report_ipg_memory_usage("End ipg_remove_grads", 0)
+
+ def print_rank_0(self, message):
+ if dist.get_rank() == 0:
+ logger.info(message)
+
+ def gradient_reduction_w_predivide(self, tensor):
+
+ dp_world_size = dist.get_world_size(group=self.dp_process_group)
+
+ tensor_to_allreduce = tensor
+
+ if self.communication_data_type != tensor.dtype:
+ tensor_to_allreduce = tensor.to(self.communication_data_type)
+
+ if self.postscale_gradients:
+ if self.gradient_predivide_factor != 1.0:
+ tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor)
+
+ dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group)
+
+ if self.gradient_predivide_factor != dp_world_size:
+ tensor_to_allreduce.mul_(self.gradient_predivide_factor /
+ (dp_world_size / float(self.sequence_parallel_size)))
+ else:
+ tensor_to_allreduce.div_(dp_world_size / float(self.sequence_parallel_size))
+ dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group)
+
+ if self.communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce:
+ tensor.copy_(tensor_to_allreduce)
+
+ return tensor
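+    # Numeric check (assuming sequence_parallel_size=1): with dp_world_size=8
+    # and gradient_predivide_factor=2, grads are scaled by 1/2 before the
+    # all_reduce and by 2/8 after it, a net 1/8 -- the same mean as dividing
+    # by the world size once.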
+
+ def allreduce_and_copy_with_multiple_ranks(self,
+ small_bucket,
+ log=None,
+ divide=True,
+ process_group=None,
+ bucket_ranks=None):
+ process_group = self.zp_process_group if process_group is None else process_group
+ allreduced = self.allreduce_bucket(small_bucket, log=log, divide=divide, process_group=process_group)
+ for buf, synced, bucket_rank in zip(small_bucket, self.unflatten(allreduced, small_bucket), bucket_ranks):
+ if dist.get_rank(group=process_group) == bucket_rank:
+ buf.copy_(synced)
+
+ def allreduce_and_scatter(self, bucket, numel_per_bucket=500000000, log=None, divide=True, process_group=None):
+ small_bucket = []
+ small_bucket_ranks = []
+ numel = 0
+ allreduce_sizes = []
+
+ for i, bucket_elem in enumerate(bucket):
+ rank, tensor = bucket_elem
+ small_bucket.append(tensor)
+ small_bucket_ranks.append(rank)
+ numel = numel + tensor.numel()
+ if numel > numel_per_bucket:
+ self.allreduce_and_copy_with_multiple_ranks(small_bucket,
+ log=None,
+ divide=divide,
+ process_group=process_group,
+ bucket_ranks=small_bucket_ranks)
+ small_bucket = []
+ small_bucket_ranks = []
+ numel = 0
+
+ if len(small_bucket) > 0:
+ self.allreduce_and_copy_with_multiple_ranks(small_bucket,
+ log=None,
+ divide=divide,
+ process_group=process_group,
+ bucket_ranks=small_bucket_ranks)
+
+ def average_tensor(self, tensor):
+ if self.overlap_comm:
+ stream = self.reduction_stream
+ if not get_accelerator().is_synchronized_device():
+ stream.wait_stream(get_accelerator().current_stream())
+ else:
+ stream = get_accelerator().current_stream()
+
+ with get_accelerator().stream(stream):
+ if not self.reduce_scatter:
+ self.gradient_reduction_w_predivide(tensor)
+ return
+
+ # Accumulate destination ranks and bucket offsets for each gradient slice.
+ # Note: potential future optimization, record access pattern of parameters
+ # in backward pass and partition gradients w.r.t. access pattern so that our
+ # bucket is guaranteed to be contiguous w.r.t. ranks
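+                # Each gradient in the bucket is cut into (dest_rank,
+                # bucket_offset, numel) slices, one per partition it spans;
+                # adjacent slices headed to the same rank are merged below so
+                # a single reduce covers them.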
+ rank_and_offsets = []
+ real_dp_process_group = []
+ curr_size = 0
+ prev_id, prev_process_group = -1, None
+
+ process_group = self.zp_process_group
+ # count = 0
+ for i, param, param_id in self.params_in_ipg_bucket:
+
+ process_group = self.zp_process_group
+ grad_reduc = self.get_gradient_for_reduction(param)
+ # Averages gradients at parameter level if ipg has a moe param
+ # Otherwise averaging is done at the entire buffer level at the end of the loop
+                # MoE params have different process groups
+ if self.ipg_bucket_has_moe_params:
+ process_group = self.expert_dp_process_group[param.group_name] if is_moe_param(
+ param) else self.zp_process_group
+ grad_reduc.data.div_(dist.get_world_size(group=process_group) / float(self.sequence_parallel_size))
+
+ partition_ids = self.param_to_partition_ids[i][param_id]
+ assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids
+ ]), f"world size {dist.get_world_size(group=process_group)} and p_ids: {partition_ids}"
+ partition_size = self.partition_size[i]
+ # Get all partition ids + their offsets
+ partition_ids_w_offsets = []
+ for partition_id in partition_ids:
+ offset = self.grad_start_offset[i][partition_id][param_id]
+ partition_ids_w_offsets.append((partition_id, offset))
+ partition_ids_w_offsets.sort(key=lambda t: t[1])
+
+ # Calculate rank and offsets for grad slices
+ for idx in range(len(partition_ids_w_offsets)):
+ partition_id, offset = partition_ids_w_offsets[idx]
+
+ # if dist.get_rank() == 0 and count < 100:
+ # print(f"Rank {dist.get_rank()} rank offset id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}")
+ # count += 1
+
+ # Calculate numel for grad slice depending on partition location
+ if idx == len(partition_ids_w_offsets) - 1:
+ # Last partition_id uses its own offset
+ numel = param.numel() - offset
+ else:
+ # Set numel to next partition's offset
+ numel = partition_ids_w_offsets[idx + 1][1] - offset
+
+ # Merge bucket ranges if they belong to the same rank
+ if partition_id == prev_id and process_group == prev_process_group:
+ prev_pid, prev_size, prev_numel = rank_and_offsets[-1]
+ rank_and_offsets[-1] = (prev_pid, prev_size, prev_numel + numel)
+ else:
+ rank_and_offsets.append((partition_id, curr_size, numel))
+ real_dp_process_group.append(process_group)
+ curr_size += numel
+ prev_id, prev_process_group = partition_id, process_group
+
+ if not self.ipg_bucket_has_moe_params:
+ tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size))
+
+ buckets = {}
+ for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets):
+ grad_slice = tensor.narrow(0, int(bucket_offset), int(numel))
+ bucket_key = real_dp_process_group[i] if self.use_multi_rank_bucket_allreduce else (
+ dst, real_dp_process_group[i])
+ if bucket_key not in buckets:
+ buckets[bucket_key] = []
+ if self.use_multi_rank_bucket_allreduce:
+ buckets[bucket_key].append((dst, grad_slice))
+ else:
+ buckets[bucket_key].append(grad_slice)
+
+ for bucket_key in buckets:
+ if self.use_multi_rank_bucket_allreduce:
+ self.allreduce_and_scatter(buckets[bucket_key],
+ numel_per_bucket=self.reduce_bucket_size,
+ divide=self.ipg_bucket_has_moe_params,
+ process_group=bucket_key)
+ else:
+ dst, process_group = bucket_key
+ self.allreduce_no_retain(buckets[bucket_key],
+ numel_per_bucket=self.reduce_bucket_size,
+ rank=dst,
+ divide=self.ipg_bucket_has_moe_params,
+ process_group=process_group)
+
+ ##############################################################################
+ ############################# CPU Offload Methods#############################
+ ##############################################################################
+ def get_grad_position(self, group_id, tensor_list, first_offset, partition_size):
+ current_offset = 0
+
+ for i, tensor in enumerate(tensor_list):
+ param_id = self.get_param_id(tensor)
+ param_start_offset = 0
+
+ num_elements = tensor.numel()
+
+ # we need to offset to get to the right element
+ if i == 0 and first_offset > 0:
+ tensor_offset = first_offset
+ num_elements = num_elements - tensor_offset
+ param_start_offset = first_offset
+
+            # we don't need all elements of the tensor
+ if num_elements > (partition_size - current_offset):
+ num_elements = partition_size - current_offset
+
+ self.grad_position[param_id] = [
+ int(group_id), int(param_start_offset),
+ int(current_offset), int(num_elements)
+ ]
+ current_offset += num_elements
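+        # Illustrative example: if the partition starts 40 elements into a
+        # 100-element tensor A, then A maps to grad_position [group, 40, 0, 60]
+        # and the next tensor starts at current_offset=60.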
+
+ def update_overflow_tracker_for_param_grad(self, param):
+ grad_accum = self.get_param_gradient_attribute(param)
+ if grad_accum is not None and self._has_inf_or_nan(grad_accum.data):
+ self.local_overflow = True
+
+ def _get_offload_gradient_dict(self):
+ for param_group_index, _ in enumerate(self.optimizer.param_groups):
+ self.offload_gradient_dict[param_group_index] = []
+ for lp_param in self.params_in_partition[param_group_index]:
+ param_id = self.get_param_id(lp_param)
+ [_, _, dest_offset, num_elements] = self.grad_position[param_id]
+ dest_tensor = self.single_partition_of_fp32_groups[param_group_index].grad.view(-1).narrow(
+ 0, dest_offset, num_elements)
+ self.offload_gradient_dict[param_group_index].append(dest_tensor)
+
+ def async_accumulate_grad_in_cpu_via_gpu(self, param):
+ param_id = self.get_param_id(param)
+
+ [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
+
+        # copy to a preexisting buffer to avoid memory allocation penalty
+ dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow(0, 0, param.numel())
+
+ # buffer for storing gradients for this parameter in CPU
+ def buffer_to_accumulate_to_in_cpu():
+ if not self.fp16_master_weights_and_gradients:
+ buffer = torch.zeros(param.numel(), dtype=param.dtype, device=self.device)
+ return get_accelerator().pin_memory(buffer) if self.cpu_offload_pin_memory else buffer
+ else:
+ return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements)
+
+        # accumulate gradients into param.grad_accum or the parts of it that belong to this partition
+ def accumulate_gradients():
+ grad_accum = self.get_param_gradient_attribute(param)
+ if not self.fp16_master_weights_and_gradients:
+ dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), non_blocking=True)
+ grad_accum.data.view(-1).add_(dest_buffer)
+ else:
+ dest_buffer.narrow(0, source_offset,
+ num_elements).copy_(self.accumulated_grads_in_cpu[param_id].view(-1),
+ non_blocking=True)
+ grad_accum.data.view(-1).narrow(0, source_offset,
+ num_elements).add_(dest_buffer.narrow(0, source_offset, num_elements))
+
+ # move accumulated gradients back to CPU
+ def copy_gradients_to_cpu():
+ grad_accum = self.get_param_gradient_attribute(param)
+ if not self.fp16_master_weights_and_gradients:
+ self.accumulated_grads_in_cpu[param_id].data.copy_(grad_accum.data.view(-1), non_blocking=True)
+ else:
+ self.accumulated_grads_in_cpu[param_id].data.copy_(grad_accum.data.view(-1).narrow(
+ 0, source_offset, num_elements),
+ non_blocking=True)
+
+ if param_id not in self.accumulated_grads_in_cpu:
+ self.accumulated_grads_in_cpu[param_id] = buffer_to_accumulate_to_in_cpu()
+
+ if self.micro_step_id > 0:
+ accumulate_gradients()
+
+ # at the boundary we will send 32bit directly
+ if not self.is_gradient_accumulation_boundary:
+ copy_gradients_to_cpu()
+
+ def set_norm_for_param_grad(self, param):
+ param_id = self.get_param_id(param)
+ grad_accum = self.get_param_gradient_attribute(param)
+ accumulated_grad = self.accumulated_grads_in_cpu[
+ param_id] if self.gradient_accumulation_steps > 1 else grad_accum
+
+ [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
+
+ start = source_offset
+ accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements)
+
+ self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2)
+
+ def set_norm_for_param_grad_in_gpu(self, param):
+ param_id = self.get_param_id(param)
+ grad_accum = self.get_param_gradient_attribute(param)
+ if grad_accum is None:
+ accumulated_grad = param.grad
+ else:
+ accumulated_grad = grad_accum
+
+ [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
+
+ start = source_offset
+ accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements)
+
+ self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2)
+
+ def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param):
+ param_id = self.get_param_id(param)
+
+ [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
+
+ dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements)
+
+        grad_accum = self.get_param_gradient_attribute(param)
+        # fall back to param.grad when the accumulation attribute is unset,
+        # mirroring set_norm_for_param_grad_in_gpu above
+        if grad_accum is None:
+            src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements)
+        else:
+            src_tensor = grad_accum.view(-1).narrow(0, source_offset, num_elements)
+ if not self.fp16_master_weights_and_gradients:
+ src_tensor = src_tensor.float()
+
+ dest_tensor.copy_(src_tensor, non_blocking=True)
+ param.grad = None # offload only
+
+ def complete_grad_norm_calculation_for_cpu_offload(self, params):
+ total_norm = 0.0
+ norm_type = 2.0
+ for p in params:
+ # Pipeline parallelism may replicate parameters. Avoid multi-counting.
+ if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated:
+ continue
+
+ if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
+ param_id = self.get_param_id(p)
+                # as some models have trainable parameters that are skipped in training,
+                # their backward hooks in self.create_reduce_and_remove_grad_hooks() will not run,
+                # so they have no entry in norm_for_param_grads
+ if param_id in self.norm_for_param_grads:
+ param_norm = self.norm_for_param_grads[param_id]
+ total_norm += param_norm.item() ** 2
+ else:
+                    # Unused parameters in a module are not always expected, so
+                    # raise an explicit error here and provide an option to
+                    # suppress it
+ assert self.ignore_unused_parameters, """
+ This assert indicates that your module has parameters that
+ were not used in producing loss.
+ You can avoid this assert by
+                        (1) enabling the ignore_unused_parameters option in the zero_optimization config;
+ (2) making sure all trainable parameters and `forward` function
+ outputs participate in calculating loss.
+ """
+
+ # Sum across all model parallel GPUs.
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group)
+
+ self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM)
+
+ total_norm = total_norm_cuda[0].item() ** (1. / norm_type)
+
+ if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
+ total_norm = -1
+
+ return total_norm
+
+ ############################################################################################
+ def copy_grads_in_partition(self, param):
+ if self.cpu_offload:
+
+ if self.gradient_accumulation_steps > 1:
+ self.async_accumulate_grad_in_cpu_via_gpu(param)
+
+ if self.is_gradient_accumulation_boundary:
+ self.set_norm_for_param_grad_in_gpu(param)
+
+ self.update_overflow_tracker_for_param_grad(param)
+
+ self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param)
+
+ return
+ # print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}")
+ if self.grads_in_partition is None:
+ self.grads_in_partition_offset = 0
+ total_size = 0
+ for group in self.params_in_partition:
+ for param_in_partition in group:
+ total_size += param_in_partition.numel()
+
+ see_memory_usage(f"before copying {total_size} gradients into partition")
+ self.grads_in_partition = torch.empty(int(total_size),
+ dtype=self.dtype,
+ device=get_accelerator().current_device_name())
+ see_memory_usage(f"after copying {total_size} gradients into partition")
+
+ grad_reduc = self.get_gradient_for_reduction(param)
+ # The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
+ new_grad_tensor = self.grads_in_partition.view(-1).narrow(0, self.grads_in_partition_offset, param.numel())
+ new_grad_tensor.copy_(grad_reduc.view(-1))
+ grad_reduc.data = new_grad_tensor.data.view_as(grad_reduc)
+ # print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}")
+ self.grads_in_partition_offset += param.numel()
+
+ def reduce_ipg_grads(self):
+ if self.contiguous_gradients:
+ if self.extra_large_param_to_reduce is not None:
+ assert len(self.params_in_ipg_bucket) == 1, "more than 1 param in ipg bucket, this shouldn't happen"
+ _, _, param_id = self.params_in_ipg_bucket[0]
+ assert self.get_param_id(self.extra_large_param_to_reduce
+ ) == param_id, "param in ipg bucket does not match extra-large param"
+ extra_large_grad_reduc = self.get_gradient_for_reduction(self.extra_large_param_to_reduce)
+ self.average_tensor(extra_large_grad_reduc.view(-1))
+ self.extra_large_param_to_reduce = None
+ else:
+ self.average_tensor(self.ipg_buffer[self.ipg_index])
+ else:
+ self.buffered_reduce_fallback(None,
+ self.grads_in_ipg_bucket,
+ elements_per_buffer=self.elements_in_ipg_bucket)
+
+ if self.overlap_comm:
+ stream = self.reduction_stream
+ elif self.cpu_offload:
+ # TODO: copy_grad_stream is disabled because of race with reduce. This hurts perf and should be fixed.
+ # get_accelerator().synchronize()
+ # stream = self.copy_grad_stream
+ stream = get_accelerator().current_stream()
+ else:
+ stream = get_accelerator().current_stream()
+
+ with get_accelerator().stream(stream):
+ for _, param, param_id in self.params_in_ipg_bucket:
+
+ assert self.params_already_reduced[param_id] == False, \
+ f"The parameter {param_id} has already been reduced. \
+ Gradient computed twice for this partition. \
+ Multiple gradient reduction is currently not supported"
+
+ self.params_already_reduced[param_id] = True
+ if self.partition_gradients:
+ if not self.is_param_in_current_partition[param_id]:
+ if self.overlap_comm and self.contiguous_gradients is False:
+ # Clear grads of other partitions during the next reduction
+ # to avoid clearing them before the reduction is complete.
+ if self.previous_reduced_grads is None:
+ self.previous_reduced_grads = []
+ self.previous_reduced_grads.append(param)
+ else:
+ self.clear_grad_attribute(param)
+ elif self.contiguous_gradients:
+ self.copy_grads_in_partition(param)
+ else: # zero stage 1 - partition only optimizer state
+ if self.contiguous_gradients and self.is_param_in_current_partition[param_id]:
+ self.copy_grads_in_partition(param)
+
+ self.grads_in_ipg_bucket = []
+ self.params_in_ipg_bucket = []
+ self.ipg_bucket_has_moe_params = False
+ self.elements_in_ipg_bucket = 0
+ #####################################################################
+
+ def reduce_ready_partitions_and_remove_grads(self, param, i):
+ if self.partition_gradients or self.is_gradient_accumulation_boundary:
+ self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
+
+ def zero_reduced_gradients(self, partition_id, i):
+
+ def are_all_related_partitions_reduced(params_id):
+ for partition_id in self.param_to_partition_ids[i][params_id]:
+ if not self.is_partition_reduced[i][partition_id]:
+ return False
+ return True
+
+ for params_id in self.is_grad_computed[i][partition_id]:
+ if are_all_related_partitions_reduced(params_id):
+ self.param_dict[params_id].grad = None # dead code
+
+ def flatten_and_print(self, message, tensors, start=0, n=5):
+ flatten_tensor = self.flatten(tensors)
+
+ def print_func():
+ logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n))
+
+ self.sequential_execution(print_func, message)
+
+ def get_grads_to_reduce(self, i, partition_id):
+
+ def get_reducible_portion(key):
+ grad = self.param_dict[key].grad
+ total_elements = grad.numel()
+ start = self.grad_start_offset[i][partition_id][key]
+ num_elements = min(total_elements - start,
+ self.partition_size[i] - self.grad_partition_insertion_offset[i][partition_id][key])
+ if not pg_correctness_test:
+ if num_elements == total_elements:
+ return grad
+ else:
+ return grad.contiguous().view(-1).narrow(0, int(start), int(num_elements))
+ else:
+ if num_elements == total_elements:
+ return grad.clone()
+ else:
+ return grad.clone().contiguous().view(-1).narrow(0, int(start), int(num_elements))
+
+ grads_to_reduce = []
+ for key in self.is_grad_computed[i][partition_id]:
+ grad = get_reducible_portion(key)
+ grads_to_reduce.append(grad)
+ return grads_to_reduce
+
+ def sequential_execution(self, function, message, group=None):
+ if group is None:
+ group = self.zp_process_group
+ if dist.get_rank(group=group) == 0:
+ logger.info(message)
+        for rank_id in range(dist.get_world_size(group=group)):
+            if rank_id == dist.get_rank(group=group):
+ function()
+ dist.barrier(group=group)
+
+ def set_none_gradients_to_zero(self, i, partition_id):
+ for param_id in self.is_grad_computed[i][partition_id]:
+ param = self.param_dict[param_id]
+ if param.grad is None:
+                param.grad = torch.zeros_like(param)
+
+ ######################Reduction Related Methods##############################
+ def allreduce_bucket(self, bucket, rank=None, log=None, divide=True, process_group=None):
+ rank = None
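+        # rank is overridden to None above, so the dist.reduce branch below is
+        # never taken and each bucket is always fully all-reduced.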
+ tensor = self.flatten(bucket)
+
+ process_group = self.zp_process_group if process_group is None else process_group
+
+ tensor_to_allreduce = tensor
+
+ if pg_correctness_test or self.sequence_parallel_size > 1:
+ communication_data_type = torch.float32
+ else:
+ communication_data_type = self.communication_data_type
+
+ if communication_data_type != tensor.dtype:
+ tensor_to_allreduce = tensor.to(communication_data_type)
+
+ if divide:
+ tensor_to_allreduce.div_(
+ dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size))
+
+ tensor_to_allreduce = tensor_to_allreduce.contiguous()
+ if rank is None:
+ # "All Reducing"
+ dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group)
+ else:
+ global_rank = dist.get_global_rank(self.dp_process_group, rank)
+ dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group)
+
+ if communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce:
+ if rank is None or rank == dist.get_rank(group=process_group):
+ tensor.copy_(tensor_to_allreduce)
+
+ return tensor
+
+ def _clear_previous_reduced_grads(self):
+ if self.previous_reduced_grads is not None:
+ for param in self.previous_reduced_grads:
+ self.clear_grad_attribute(param)
+ self.previous_reduced_grads = None
+
+ # if rank is specified do a reduction instead of an allreduce
+ def allreduce_and_copy(self, small_bucket, rank=None, log=None, divide=True, process_group=None):
+ process_group = self.zp_process_group if process_group is None else process_group
+ if self.overlap_comm:
+ get_accelerator().synchronize()
+ # It is safe to clear the previously reduced grads of other partitions
+ self._clear_previous_reduced_grads()
+ stream = self.reduction_stream
+ else:
+ stream = get_accelerator().current_stream()
+
+ with get_accelerator().stream(stream):
+ allreduced = self.allreduce_bucket(
+ small_bucket,
+ rank=rank,
+ log=log,
+ divide=divide,
+ process_group=process_group,
+ )
+ if rank is None or rank == dist.get_rank(group=self.zp_process_group):
+ for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)):
+ buf.copy_(synced)
+
+ def allreduce_no_retain(
+ self,
+ bucket,
+ numel_per_bucket=500000000,
+ rank=None,
+ log=None,
+ divide=True,
+ process_group=None,
+ ):
+ small_bucket = []
+ numel = 0
+ for tensor in bucket:
+ small_bucket.append(tensor)
+ numel = numel + tensor.numel()
+ if numel > numel_per_bucket:
+ self.allreduce_and_copy(small_bucket, rank=rank, log=None, divide=divide, process_group=process_group)
+ small_bucket = []
+ numel = 0
+
+ if len(small_bucket) > 0:
+ self.allreduce_and_copy(small_bucket, rank=rank, log=log, divide=divide, process_group=process_group)
+
+ # allows using reduction of gradients instead of using all_reduce
+
+ def buffered_reduce_fallback(self, rank, grads, elements_per_buffer=500000000, log=None):
+ split_buckets = split_half_float_double(grads)
+
+ for i, bucket in enumerate(split_buckets):
+ self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer, rank=rank, log=log)
+
+ #############################################################################
+ #############################################################################
+ #############################################################################
+
+ # views the tensor as multiple partitions and returns
+ # those partitions
+ def get_data_parallel_partitions(self, tensor, group_id):
+ partitions = []
+
+ dp = dist.get_world_size(group=self.real_zp_process_group[group_id])
+ # dp_id = dist.get_rank(group=self.real_dp_process_group[group_id])
+
+ total_num_elements = tensor.numel()
+
+ base_size = total_num_elements // dp
+ remaining = total_num_elements % dp
+
+ start = 0
+        for dp_id in range(dp):
+            partition_size = base_size
+            if dp_id < remaining:
+ partition_size = partition_size + 1
+ partitions.append(tensor.narrow(0, start, partition_size))
+ start = start + partition_size
+ return partitions
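+    # Example: a 10-element tensor split across dp=4 ranks gives base_size=2,
+    # remaining=2, hence partition sizes [3, 3, 2, 2] covering offsets
+    # [0:3, 3:6, 6:8, 8:10].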
+
+ def get_partition_info(self, tensor_list, partition_size, partition_id):
+ params_in_partition = []
+ params_not_in_partition = []
+
+ start_index = partition_size * partition_id
+ end_index = partition_size * (partition_id + 1)
+
+ current_index = 0
+ first_offset = 0
+
+ for tensor in tensor_list:
+
+ tensor_size = tensor.numel()
+
+ if start_index <= current_index < end_index:
+ params_in_partition.append(tensor)
+
+ elif current_index < start_index < (current_index + tensor_size):
+ params_in_partition.append(tensor)
+
+                assert (first_offset == 0
+                        ), "This can happen at most once, since this must be the first tensor in the partition"
+ first_offset = start_index - current_index
+
+ else:
+ params_not_in_partition.append(tensor)
+
+ current_index = current_index + tensor_size
+
+ return params_in_partition, params_not_in_partition, first_offset
+
+ def zero_grad(self, set_to_none=True):
+ """
+ Zero FP16 parameter grads.
+ """
+ # FP32 grad should never exist.
+ # For speed, set model fp16 grad to None by default
+ # zero all pointers to grad tensors
+ for group in self.bit16_groups:
+ for p in group:
+ if set_to_none:
+ p.grad = None # epilogue and in step
+ p.grad_accum = None
+ else:
+ if p.grad is not None:
+ p.grad.detach_()
+ p.grad.zero_()
+
+ def _model_parallel_all_reduce(self, tensor, op):
+ """ Perform all reduce within model parallel group, if any.
+ """
+ if self.model_parallel_group is None or self.model_parallel_world_size == 1:
+ pass
+ else:
+ dist.all_reduce(tensor=tensor, op=op, group=self.model_parallel_group)
+
+ def get_grad_norm_direct(self, gradients, params, norm_type=2):
+ """Clips gradient norm of an iterable of parameters.
+
+ This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
+ added functionality to handle model parallel parameters. Note that
+ the gradients are modified in place.
+
+ Arguments:
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+ single Tensor that will have gradients normalized
+ max_norm (float or int): max norm of the gradients
+ norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+ infinity norm.
+
+ Returns:
+ Total norm of the parameters (viewed as a single vector).
+ """
+ norm_type = float(norm_type)
+ if norm_type == inf:
+ total_norm = max(g.data.abs().max() for g in gradients)
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group)
+
+ # Take max across all GPUs.
+ self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX)
+ total_norm = total_norm_cuda[0].item()
+ else:
+ total_norm = 0.0
+ # if dist.get_rank() == 0:
+ # logger.info(f"Total Norm beginning {total_norm}")
+ for g, p in zip(gradients, params):
+ # Pipeline parallelism may replicate parameters. Avoid multi-counting.
+ if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated:
+ continue
+ if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
+ param_norm = g.data.double().norm(2)
+ total_norm += param_norm.item() ** 2
+ # Sum across all model parallel GPUs.
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group)
+
+ self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM)
+
+ total_norm = total_norm_cuda[0].item() ** (1. / norm_type)
+
+ if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
+ total_norm = -1
+
+ return total_norm
+
+ # creates a flat fused tensor from the tensor list starting at the first_offset
+ # in the first tensor of the list. If there are not enough elements in the tensor
+ # list then the flat tensor will be padded with zeros
+ def get_flat_partition(self, tensor_list, first_offset, partition_size, dtype, device, return_tensor_list=False):
+ flat_tensor_list = []
+ current_size = 0
+
+ for i, tensor in enumerate(tensor_list):
+ grad_accum = self.get_param_gradient_attribute(tensor)
+ if grad_accum is None:
+ grad_accum = torch.zeros_like(tensor, dtype=dtype)
+
+ tensor = grad_accum
+ num_elements = tensor.numel()
+ tensor_offset = 0
+
+ # we need to offset to get to the right element
+ if i == 0 and first_offset > 0:
+ tensor_offset = first_offset
+ num_elements = num_elements - tensor_offset
+
+            # we don't need all elements of the tensor
+ if num_elements > (partition_size - current_size):
+ num_elements = partition_size - current_size
+
+ # we need a narrow view of the tensor based on the tensor offset and number of elements that
+ # we need from this tensor
+ if tensor_offset > 0 or num_elements < tensor.numel():
+ flat_tensor_list.append(tensor.contiguous().view(-1).narrow(0, int(tensor_offset), int(num_elements)))
+ else:
+ flat_tensor_list.append(tensor)
+
+ current_size = current_size + num_elements
+
+        # this means it's the last partition and does not align with the dp boundary. We need to pad before flattening
+ if current_size < partition_size:
+ flat_tensor_list.append(torch.zeros(int(partition_size - current_size), dtype=dtype, device=device))
+
+ if return_tensor_list:
+ return flat_tensor_list
+
+ return self.flatten(flat_tensor_list)
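+    # Example: partition_size=100 with gradients of 30 and 50 elements and
+    # first_offset=0 reaches current_size=80, so a 20-element zero tensor is
+    # appended before flattening.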
+
+ def free_grad_in_param_list(self, param_list):
+ for p in param_list:
+ p.grad = None # in step
+ p.grad_accum = None
+
+ def reset_cpu_buffers(self):
+ self.norm_for_param_grads = {}
+ self.local_overflow = False
+
+ def set_lr(self, lr):
+ """Set the learning rate."""
+ for param_group in self.optimizer.param_groups:
+ param_group["lr"] = lr
+
+ def get_lr(self):
+ """Return the current learning rate."""
+ return self.optimizer.param_groups[0]["lr"]
+
+ def override_loss_scale(self, loss_scale):
+ if loss_scale != self.external_loss_scale:
+ logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}')
+ self.custom_loss_scaler = True
+ self.external_loss_scale = loss_scale
+
+ def scaled_global_norm(self, norm_type=2):
+ assert norm_type == 2, "only L2 norm supported"
+ norm_groups = []
+ for i, group in enumerate(self.bit16_groups):
+ partition_id = dist.get_rank(group=self.real_zp_process_group[i])
+ if self.cpu_offload:
+ norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i]))
+ single_grad_partition = self.single_partition_of_fp32_groups[i].grad
+ else:
+ norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i]))
+
+ if self.has_moe_layers:
+ self._average_expert_grad_norms(norm_groups)
+
+ # note that the get_global_norm function only supports l2 norm
+ return get_global_norm(norm_list=norm_groups)
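+    # (get_global_norm is assumed to combine the per-group L2 norms as
+    # sqrt(sum(norm_i ** 2)), which is why only norm_type == 2 is accepted.)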
+
+ def get_bit16_param_group(self, group_no):
+ bit16_partitions = self.parallel_partitioned_bit16_groups[group_no]
+ partition_id = dist.get_rank(group=self.real_zp_process_group[group_no])
+        return [bit16_partitions[partition_id]]
+
+ def _optimizer_step(self, group_no):
+ original_param_groups = self.optimizer.param_groups
+ self.optimizer.param_groups = [original_param_groups[group_no]]
+ # Disabling this as the C++ side copy & synchronize is not working correctly
+ # from deepspeed.ops.adam import DeepSpeedCPUAdam
+ # if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half:
+ # self.optimizer.step(fp16_param_groups=[self.get_bit16_param_group(group_no)])
+ # else:
+ # self.optimizer.step()
+ self.optimizer.step()
+ self.optimizer.param_groups = original_param_groups
+
+ def step(self, closure=None):
+ """
+ Not supporting closure.
+ """
+ self.micro_step_id = -1
+
+ see_memory_usage(f"In step before checking overflow")
+
+ # First compute norm for all group so we know if there is overflow
+ if self.dtype == torch.float16:
+ self.check_overflow()
+
+ prev_scale = self.loss_scale
+ self._update_scale(self.overflow)
+ if self.overflow:
+ see_memory_usage('After overflow before clearing gradients')
+ self.zero_grad(set_to_none=True)
+ if self.cpu_offload:
+ self.reset_cpu_buffers()
+ else:
+ self.averaged_gradients = {}
+
+ see_memory_usage('After overflow after clearing gradients')
+
+ for timer in OPTIMIZER_TIMERS:
+ self.timers(timer).start()
+ self.timers(timer).stop()
+ return
+
+ # Step 1:- Calculate gradient norm using bit-16 grads
+ see_memory_usage('Before norm calculation')
+ scaled_global_grad_norm = self.scaled_global_norm()
+ self._global_grad_norm = scaled_global_grad_norm / prev_scale
+ see_memory_usage('After norm before optimizer')
+
+ # Step 2:- run optimizer and upscaling simultaneously
+ for i, group in enumerate(self.bit16_groups):
+ self.timers(OPTIMIZER_GRADIENTS_TIMER).start()
+ partition_id = dist.get_rank(group=self.real_zp_process_group[i])
+ if self.cpu_offload:
+ single_grad_partition = self.single_partition_of_fp32_groups[i].grad
+ self.unscale_and_clip_grads([single_grad_partition], scaled_global_grad_norm)
+
+ self.timers(OPTIMIZER_GRADIENTS_TIMER).stop()
+ self.timers(OPTIMIZER_STEP_TIMER).start()
+ self._optimizer_step(i)
+
+ # Disabled, this is not currently working
+ # from deepspeed.ops.adam import DeepSpeedCPUAdam
+ # if not (type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half):
+ # bit16_partitions = self.parallel_partitioned_bit16_groups[i]
+ # fp32_partition = self.single_partition_of_fp32_groups[i]
+ # bit16_partitions[partition_id].data.copy_(fp32_partition.data)
+ bit16_partitions = self.parallel_partitioned_bit16_groups[i]
+ fp32_partition = self.single_partition_of_fp32_groups[i]
+ bit16_partitions[partition_id].data.copy_(fp32_partition.data)
+
+ self.timers(OPTIMIZER_STEP_TIMER).stop()
+ else:
+                # free gradients for all the parameters that are not updated by this process (ZeRO stage 2)
+ self.free_grad_in_param_list(self.params_not_in_partition[i])
+
+ # create a flat gradients for parameters updated by this process
+ # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors
+ if partition_id == dist.get_world_size(group=self.real_zp_process_group[i]) - 1:
+ single_grad_partition = self.flatten_dense_tensors_aligned(
+ self.averaged_gradients[i],
+ int(self.partition_size[i])).to(self.single_partition_of_fp32_groups[i].dtype)
+ else:
+ single_grad_partition = self.flatten(self.averaged_gradients[i]).to(
+ self.single_partition_of_fp32_groups[i].dtype)
+ assert single_grad_partition.numel() == self.partition_size[i], \
+ "averaged gradients have different number of elements that partition size {} {} {} {}".format(
+ single_grad_partition.numel(), self.partition_size[i], i, partition_id)
+
+ self.single_partition_of_fp32_groups[i].grad = single_grad_partition
+                # release all the gradients since we have already created the necessary copy in dp_grad_partition (ZeRO stage 2)
+ self.free_grad_in_param_list(self.params_in_partition[i])
+
+ self.averaged_gradients[i] = None
+
+ self.unscale_and_clip_grads([single_grad_partition], scaled_global_grad_norm)
+
+ self.timers(OPTIMIZER_GRADIENTS_TIMER).stop()
+
+ # Step 3:- run the optimizer if no offloading
+ self.timers(OPTIMIZER_STEP_TIMER).start()
+ self._optimizer_step(i)
+ # Step 4:- get rid of the fp32 gradients. Not needed anymore
+ self.single_partition_of_fp32_groups[i].grad = None
+ del single_grad_partition
+ bit16_partitions = self.parallel_partitioned_bit16_groups[i]
+ fp32_partition = self.single_partition_of_fp32_groups[i]
+ bit16_partitions[partition_id].data.copy_(fp32_partition.data)
+ self.timers(OPTIMIZER_STEP_TIMER).stop()
+
+ see_memory_usage('After optimizer before all-gather')
+ if self.cpu_offload:
+ self.reset_cpu_buffers()
+
+ self.timers(OPTIMIZER_ALLGATHER_TIMER).start()
+
+ # if dist.get_rank(group=self.dp_process_group) == 0:
+        # pdb.set_trace()  # or use another debugging tool
+
+ # Gather the updated weights from everyone.
+ # Then all partitions of the model parameters are updated and ready for next round forward.
+ all_gather_into_tensor_dp_groups(groups_flat=self.bit16_groups_flat,
+ partitioned_param_groups=self.parallel_partitioned_bit16_groups,
+ zp_process_group=self.real_zp_process_group)
+ self.timers(OPTIMIZER_ALLGATHER_TIMER).stop()
+
+ # TODO: we probably don't need this? just to be safe
+ for i in range(len(self.bit16_groups)):
+ self._update_model_bit16_weights(i)
+
+ self.timers.log(OPTIMIZER_TIMERS)
+ see_memory_usage('After zero_optimizer step')
+
+ return
+
+ @torch.no_grad()
+ def update_lp_params(self):
+ for i, (bit16_partitions, fp32_partition) in enumerate(
+ zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)):
+ partition_id = dist.get_rank(group=self.real_zp_process_group[i])
+ bit16_partitions[partition_id].data.copy_(fp32_partition.data)
+ # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True)
+ # if i == 0:
+ # print_rank_0(f'{fp32_partition[:10]=}', force=True)
+ all_gather_into_tensor_dp_groups(groups_flat=self.bit16_groups_flat,
+ partitioned_param_groups=self.parallel_partitioned_bit16_groups,
+ zp_process_group=self.real_zp_process_group)
+
+ def _average_expert_grad_norms(self, norm_groups):
+ for i, norm in enumerate(norm_groups):
+ if self.is_moe_param_group[i]:
+ scaled_norm = norm * 1.0 / float(dist.get_world_size(group=self.dp_process_group))
+ scaled_norm_tensor = torch.tensor(scaled_norm,
+ device=get_accelerator().device_name(),
+ dtype=torch.float)
+ dist.all_reduce(scaled_norm_tensor, group=self.dp_process_group)
+ norm_groups[i] = scaled_norm_tensor.item()
+
+ def unscale_and_clip_grads(self, grad_groups_flat, total_norm):
+ # compute combined scale factor for this group
+ combined_scale = self.loss_scale
+ if self.clip_grad > 0.:
+ # norm is in fact norm*scale
+ clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad
+ if clip > 1:
+ combined_scale = clip * self.loss_scale
+
+ for grad in grad_groups_flat:
+ if isinstance(grad, list):
+ sub_partitions = grad
+ for g in sub_partitions:
+ g.data.mul_(1. / combined_scale)
+ else:
+ grad.data.mul_(1. / combined_scale)
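+    # Numeric check: with loss_scale=1024, clip_grad=1.0 and a scaled
+    # total_norm of 2048 (unscaled norm 2.0), clip ~= 2.0 > 1, so
+    # combined_scale ~= 2048 and multiplying by 1/combined_scale both
+    # unscales the gradients and clips them to unit norm in one pass.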
+
+ def _check_overflow(self, partition_gradients=True):
+ self.overflow = self.has_overflow(partition_gradients)
+
+ # `params` is a list / generator of torch.Variable
+ def has_overflow_serial(self, params, is_grad_list=False):
+ for p in params:
+ if p.grad is not None and self._has_inf_or_nan(p.grad.data):
+ return True
+
+ return False
+
+ def has_overflow_partitioned_grads_serial(self):
+ for i in range(len(self.bit16_groups)):
+ for j, grad in enumerate(self.averaged_gradients[i]):
+ if grad is not None and self._has_inf_or_nan(grad.data, j):
+ return True
+ return False
+
+ def has_overflow(self, partition_gradients=True):
+ if partition_gradients:
+ overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial()
+ overflow_gpu = get_accelerator().ByteTensor([overflow])
+            '''This will capture overflow across all data parallel and expert parallel processes,
+            since expert parallel processes are a subset of the data parallel processes'''
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group)
+
+ else:
+ params = []
+ for group in self.bit16_groups:
+ for param in group:
+ params.append(param)
+
+ overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients)
+ overflow_gpu = get_accelerator().ByteTensor([overflow])
+
+ # Since each model parallel GPU carries only part of the model,
+ # make sure overflow flag is synced across all the model parallel GPUs
+ self._model_parallel_all_reduce(tensor=overflow_gpu, op=dist.ReduceOp.MAX)
+
+ overflow = overflow_gpu[0].item()
+ return bool(overflow)
+
+ # `x` is a torch.Tensor
+ @staticmethod
+ def _has_inf_or_nan(x, j=None):
+ try:
+            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
+            # PyTorch's .sum() creates a one-element tensor of the same type as x
+            # (which is true for some recent versions of PyTorch).
+ cpu_sum = float(x.float().sum())
+ # More efficient version that can be used if .sum() returns a Python scalar
+ # cpu_sum = float(x.sum())
+ except RuntimeError as instance:
+ # We want to check if inst is actually an overflow exception.
+ # RuntimeError could come from a different error.
+ # If so, we still want the exception to propagate.
+ if "value cannot be converted" not in instance.args[0]:
+ raise
+ return True
+ else:
+ if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+ return True
+ return False
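+    # Note: `cpu_sum != cpu_sum` is the standard NaN test, since NaN is the
+    # only float value that compares unequal to itself.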
+
+ def backward(self, loss, retain_graph=False):
+ """
+ :attr:`backward` performs the following steps:
+
+ 1. fp32_loss = loss.float()
+ 2. scaled_loss = fp32_loss*loss_scale
+ 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
+ """
+ self.micro_step_id += 1
+
+ if self.contiguous_gradients:
+ self.ipg_buffer = []
+ buf_0 = torch.empty(int(self.reduce_bucket_size),
+ dtype=self.dtype,
+ device=get_accelerator().current_device_name())
+ self.ipg_buffer.append(buf_0)
+
+ # Use double buffers to avoid data access conflict when overlap_comm is enabled.
+ if self.overlap_comm:
+ buf_1 = torch.empty(int(self.reduce_bucket_size),
+ dtype=self.dtype,
+ device=get_accelerator().current_device_name())
+ self.ipg_buffer.append(buf_1)
+ self.ipg_index = 0
+
+ if self.custom_loss_scaler:
+ scaled_loss = self.external_loss_scale * loss
+ scaled_loss.backward()
+ else:
+ self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
+
+ # Only for Stage 1, Mode 2
+ if self.use_grad_accum_attribute:
+ self.fill_grad_accum_attribute()
+
+ def check_overflow(self, partition_gradients=True):
+ self._check_overflow(partition_gradients)
+
+ def _update_scale(self, has_overflow=False):
+ self.loss_scaler.update_scale(has_overflow)
+
+ # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
+ def _get_state(self):
+ return self.optimizer.state
+
+ def _set_state(self, value):
+ self.optimizer.state = value
+
+ state = property(_get_state, _set_state)
+
+ # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
+ # (for example, to adjust the learning rate)
+ def _get_param_groups(self):
+ return self.optimizer.param_groups
+
+ def _set_param_groups(self, value):
+ self.optimizer.param_groups = value
+
+ param_groups = property(_get_param_groups, _set_param_groups)
+
+ # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
+ def _get_loss_scale(self):
+ if self.custom_loss_scaler:
+ return self.external_loss_scale
+ else:
+ return self.loss_scaler.cur_scale
+
+ def _set_loss_scale(self, value):
+ self.loss_scaler.cur_scale = value
+
+ loss_scale = property(_get_loss_scale, _set_loss_scale)
+ cur_scale = property(_get_loss_scale, _set_loss_scale)
+
+ # Return group tensor after removing paddings that are added for alignment to DP world size.
+ # This method works on the assumption that each group contains a single flattened tensor.
+ def _get_groups_without_padding(self, groups_with_padding):
+ groups_without_padding = []
+ for i, group in enumerate(groups_with_padding):
+ lean_length = group.numel() - self.groups_padding[i]
+ groups_without_padding.append(group[:lean_length])
+
+ return groups_without_padding
+
+ # Return optimizer state after removing paddings that are added for alignment.
+ def _get_state_without_padding(self, state_with_padding, padding):
+ lean_state = {}
+ for key, value in state_with_padding.items():
+ if torch.is_tensor(value):
+ lean_length = value.numel() - padding
+ lean_state[key] = value[:lean_length]
+ else:
+ lean_state[key] = value
+
+ return lean_state
+
+ # Return base optimizer states.
+ # This method assumes that each param group contains a single flattened tensor.
+ def _get_base_optimizer_state(self):
+ optimizer_groups_state = []
+ for i, group in enumerate(self.optimizer.param_groups):
+ p = group['params'][0]
+ lean_optimizer_state = self._get_state_without_padding(self.optimizer.state[p], self.groups_padding[i])
+ optimizer_groups_state.append(lean_optimizer_state)
+
+ return optimizer_groups_state
+
+ def state_dict(self):
+ """
+ Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
+ This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
+ of the contained Pytorch optimizer.
+ Example::
+ checkpoint = {}
+ checkpoint['model'] = model.state_dict()
+ checkpoint['optimizer'] = optimizer.state_dict()
+ torch.save(checkpoint, "saved.pth")
+ """
+ state_dict = {}
+ state_dict[LOSS_SCALER] = self.loss_scaler
+ state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
+ state_dict['overflow'] = self.overflow
+ state_dict[CLIP_GRAD] = self.clip_grad
+
+ if self.elastic_checkpoint:
+ state_dict[BASE_OPTIMIZER_STATE] = self._get_base_optimizer_state()
+
+ if "step" in self.optimizer.param_groups[0]:
+ # Assuming "step" is the only item that changes through training iterations
+ assert all(group["step"] == self.optimizer.param_groups[0]["step"]
+ for group in self.optimizer.param_groups), "All param groups must have the same step value"
+ state_dict[BASE_OPTIMIZER_STATE_STEP] = self.optimizer.param_groups[0]["step"]
+ else:
+ state_dict[BASE_OPTIMIZER_STATE] = self.optimizer.state_dict()
+
+ # Remove paddings for DP alignment to enable loading for other alignment values
+ fp32_groups_without_padding = self._get_groups_without_padding(self.single_partition_of_fp32_groups)
+ state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = fp32_groups_without_padding
+
+ state_dict[
+ ZERO_STAGE] = ZeroStageEnum.gradients if self.partition_gradients else ZeroStageEnum.optimizer_states
+ state_dict[GROUP_PADDINGS] = self.groups_padding
+ state_dict[PARTITION_COUNT] = self.partition_count
+
+ state_dict[DS_VERSION] = version
+ state_dict[PARAM_SLICE_MAPPINGS] = self._param_slice_mappings
+
+ return state_dict
+
+ # Restore base optimizer fp32 weights from elastic checkpoint by:
+ # 1) Merging fp32 weights from checkpoints of all partitions
+ # 2) Extracting fp32 weights for current partition from merged weights
+ # 3) Using extracted weights to update base optimizer weights directly.
+ def _restore_from_elastic_fp32_weights(self, all_state_dict):
+ merged_single_partition_of_fp32_groups = []
+
+ for i in range(len(self.single_partition_of_fp32_groups)):
+ partition_id = dist.get_rank(group=self.real_zp_process_group[i])
+ merged_partitions = [sd[SINGLE_PARTITION_OF_FP32_GROUPS][i] for sd in all_state_dict]
+ if self.is_moe_group(self.optimizer.param_groups[i]):
+ ranks = self.get_ep_ranks(group_name=self.optimizer.param_groups[i]['name'])
+                merged_partitions = [merged_partitions[r] for r in ranks]
+ flat_merged_partitions = self.flatten_dense_tensors_aligned(
+ merged_partitions,
+ self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_zp_process_group[i]))
+ dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, i)
+ merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id])
+
+ for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups):
+ current.data.copy_(saved.data)
+
+ # Restore base optimizer fp32 weights from ZeRO fp16 or bfloat16 weights
+ def _restore_from_bit16_weights(self):
+ for group_id, (bit16_partitions, fp32_partition) in enumerate(
+ zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)):
+ partition_id = dist.get_rank(group=self.real_zp_process_group[group_id])
+ fp32_partition.data.copy_(bit16_partitions[partition_id].data)
+
+ # Refresh the fp32 master params from the fp16 or bfloat16 copies.
+ def refresh_fp32_params(self):
+ self._restore_from_bit16_weights()
+
+ # Extract optimizer state for current partition from merged states of all partitions
+ def _partition_base_optimizer_state(self, state_key, all_partition_states, group_id):
+ partition_id = dist.get_rank(group=self.real_zp_process_group[group_id])
+ alignment = dist.get_world_size(group=self.real_zp_process_group[group_id])
+ if torch.is_tensor(all_partition_states[0]):
+ flat_merged_partitions = self.flatten_dense_tensors_aligned(all_partition_states, alignment)
+ dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, group_id)
+ return dp_partitions[partition_id]
+ else:
+ # Assume non-tensor states are not partitioned and equal across ranks, so return first one
+ return all_partition_states[0]
+
+ def _restore_base_optimizer_state(self, base_optimizer_group_states):
+ if type(base_optimizer_group_states) == dict:
+ base_optimizer_group_states = base_optimizer_group_states['state']
+ for i, group in enumerate(self.optimizer.param_groups):
+ p = group['params'][0]
+ for key, saved in base_optimizer_group_states[i].items():
+ if torch.is_tensor(self.optimizer.state[p][key]):
+ dst_tensor = self.optimizer.state[p][key]
+ src_tensor = _get_padded_tensor(saved, dst_tensor.numel())
+ self.optimizer.state[p][key].data.copy_(src_tensor.data)
+ else:
+ self.optimizer.state[p][key] = saved
+
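+ # For a MoE param group, only every expert_parallel_size_-th saved partition,
+ # starting from this rank's expert-parallel rank, holds state for this rank's
+ # experts; return those global data-parallel rank indices.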
+ def get_ep_ranks(self, rank=0, group_name=None):
+ from deepspeed.utils import groups
+ expert_parallel_size_ = groups._get_expert_parallel_world_size(group_name)
+ world_size = groups._get_data_parallel_world_size()
+ rank = groups._get_expert_parallel_rank(group_name)
+ ranks = range(rank, world_size, expert_parallel_size_)
+ return list(ranks)
+
+ # Restore base optimizer state from elastic checkpoint by
+ # 1) Merging optimizer state from checkpoints of all partitions
+ # 2) Extracting optimizer state for current partition from the merged state
+ # 3) Using the extracted value to directly update the base optimizer.
+ def _restore_elastic_base_optimizer_state(self, all_state_dict):
+ base_optimizer_group_states = []
+ for i in range(len(self.optimizer.param_groups)):
+ partition_states = {}
+ all_partition_group_states = [sd[BASE_OPTIMIZER_STATE][i] for sd in all_state_dict]
+
+ if self.is_moe_group(self.optimizer.param_groups[i]):
+ ranks = self.get_ep_ranks(group_name=self.optimizer.param_groups[i]['name'])
+ all_partition_group_states = [all_partition_group_states[i] for i in ranks]
+
+ for key in all_partition_group_states[0].keys():
+ all_partition_states = [all_states[key] for all_states in all_partition_group_states]
+ partition_states[key] = self._partition_base_optimizer_state(key, all_partition_states, i)
+ base_optimizer_group_states.append(partition_states)
+
+ self._restore_base_optimizer_state(base_optimizer_group_states)
+
+ # Restore step
+ if BASE_OPTIMIZER_STATE_STEP in all_state_dict[0]:
+ assert all(sd[BASE_OPTIMIZER_STATE_STEP] == all_state_dict[0][BASE_OPTIMIZER_STATE_STEP]
+ for sd in all_state_dict), "State dicts of all partitions must have the same step value"
+ loaded_param_groups_step = all_state_dict[0][BASE_OPTIMIZER_STATE_STEP]
+ for param_group in self.optimizer.param_groups:
+ param_group['step'] = loaded_param_groups_step
+
+ def load_state_dict(self,
+ state_dict_list,
+ load_optimizer_states=True,
+ load_from_fp32_weights=False,
+ checkpoint_folder=None,
+ load_serial=None):
+ if checkpoint_folder:
+ self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights)
+ else:
+ self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights)
+
+ def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights):
+ self._load_hp_checkpoint_state(checkpoint_folder)
+
+ @property
+ def param_groups(self):
+ """Forward the wrapped optimizer's parameters."""
+ return self.optimizer.param_groups
+
+ def _load_hp_checkpoint_state(self, checkpoint_dir):
+ checkpoint_dir = os.path.join(checkpoint_dir, "zero")
+ optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt")
+ assert os.path.isfile(
+ optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.'
+ optim_sd = torch.load(optim_state_path)
+ self._load_global_state(optim_sd)
+
+ tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu)
+ tp_world_size = self.mpu.get_slice_parallel_world_size() if hasattr(self.mpu, "get_slice_parallel_world_size") \
+ else self.mpu.get_tensor_model_parallel_world_size()
+
+ for i, _ in enumerate(self.optimizer.param_groups):
+ for lp in self.bit16_groups[i]:
+ if lp._hp_mapping is not None:
+ # print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}")
+ lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank,
+ tp_world_size)
+
+ def _load_global_state(self, sd):
+ self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler)
+ self.dynamic_loss_scale = sd.get('dynamic_loss_scale', self.dynamic_loss_scale)
+ self.overflow = sd.get('overflow', self.overflow)
+ self.clip_grad = sd.get(CLIP_GRAD, self.clip_grad)
+
+ ckpt_version = sd.get(DS_VERSION, False)
+ assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
+ ckpt_version = pkg_version.parse(ckpt_version)
+
+ # zero stage 1 mode
+ if not self.partition_gradients:
+ required_version = pkg_version.parse("0.3.17")
+ error_str = f"ZeRO stage 1 changed in {required_version} and is not backwards compatible " \
+ "with older stage 1 checkpoints. If you'd like to load an old ZeRO-1 checkpoint " \
+ "please use an older version of DeepSpeed (<= 0.5.8) and set 'legacy_stage1': true in your zero config json."
+ assert required_version <= ckpt_version, f"Old version: {ckpt_version} {error_str}"
+
+ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False):
+ r"""Loading ZeRO checkpoint
+
+ Arguments:
+ state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition.
+ Note that the number of saved partitions may differ from number of loading partitions to support
+ changing GPU count, specifically DP world size, between saving and loading checkpoints.
+ load_optimizer_states: Boolean indicating whether or not to load base optimizer states
+ load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32
+ copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss).
+ """
+ """
+ Loads a state_dict created by an earlier call to state_dict().
+ If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
+ whose parameters in turn came from ``model``, it is expected that the user
+ will call ``model.load_state_dict()`` before
+ ``fp16_optimizer_instance.load_state_dict()`` is called.
+ Example::
+ model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half()
+ optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+ optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
+ ...
+ checkpoint = torch.load("saved.pth")
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ """
+
+ # I think it should actually be ok to reload the optimizer before the model.
+ dp_rank = dist.get_rank(group=self.zp_process_group)
+ current_rank_sd = state_dict_list[dp_rank]
+ self._load_global_state(current_rank_sd)
+
+ ckpt_is_rigid = isinstance(current_rank_sd[BASE_OPTIMIZER_STATE], dict)
+
+ # padding is always at the last rank/partition
+ # if DP=1024 and param-group elems=16 -> padding will be 1024-16 across all but one rank
+ # scenario-1 (shrink): saving w. 4 gpus -> loading w. 2 gpus
+ # scenario-2 (expand): saving w. 2 gpus -> loading w. 4 gpus
+ # if load_optimizer_states:
+ # if new_dp_size:
+ # self.strip_padding()
+ # self.add_padding_w_new_dp_size()
+ # self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE])
+
+ if load_optimizer_states:
+ if ckpt_is_rigid:
+ # loading rigid ckpt into either rigid or elastic exec
+ self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE])
+ else:
+ if self.elastic_checkpoint:
+ # loading elastic into elastic exec
+ self._restore_elastic_base_optimizer_state(state_dict_list)
+ else:
+ # loading an elastic checkpoint into rigid exec
+ self._restore_base_optimizer_state(current_rank_sd[BASE_OPTIMIZER_STATE])
+
+ # At this point, the optimizer's references to the model's fp32 parameters are up to date.
+ # The optimizer's hyperparameters and internal buffers are also up to date.
+ # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
+ # out of date. There are two options.
+ # 1: Refresh the master params from the model's fp16 params.
+ # This requires less storage but incurs precision loss.
+ # 2: Save and restore the fp32 master copies separately.
+ # We choose option 1 if changing DP degree and option 2 otherwise.
+ #
+ # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
+ # of their associated parameters, because it's possible those buffers might not exist yet in
+ # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
+ # constructed in the same way as the one whose state_dict we are loading, the same master params
+ # are guaranteed to exist, so we can just copy_() from the saved master params.
+
+ if load_from_fp32_weights:
+ # option 2 from above
+ if self.elastic_checkpoint and not ckpt_is_rigid:
+ self._restore_from_elastic_fp32_weights(state_dict_list)
+ else:
+ # For non-elastic checkpoint, simply copying from saved weights of current rank is sufficient.
+ for current, saved in zip(self.single_partition_of_fp32_groups,
+ current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]):
+ src_tensor = _get_padded_tensor(saved, current.numel())
+ current.data.copy_(src_tensor.data)
+ else:
+ # option 1 from above
+ self._restore_from_bit16_weights()
+
+ if load_optimizer_states:
+ self._link_all_hp_params()
+
+
+def _handle_overflow(cpu_sum, x, i):
+ import math
+ rank = dist.get_rank()
+ if rank == 0:
+ t_i = -1
+ for v_i, v in enumerate(x.data.contiguous().view(-1)):
+ if not math.isfinite(float(v)):
+ t_i = v_i
+ break
+ logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}")
+
+
+def estimate_zero2_model_states_mem_needs(total_params,
+ num_gpus_per_node=1,
+ num_nodes=1,
+ cpu_offload=True,
+ additional_buffer_factor=1.5):
+ total_gpus = num_nodes * num_gpus_per_node
+
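+ # Rough bytes-per-parameter accounting (cf. the ZeRO paper): fp16 weights and
+ # gradients cost ~4 bytes/param on GPU, while fp32 master weights plus Adam
+ # moments (~16 bytes/param) are sharded across all GPUs (ZeRO-2) or moved to
+ # CPU when offloading; additional_buffer_factor adds headroom for temporaries.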
+ if cpu_offload:
+ gpu_mem = 2 * total_params
+ cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor
+ else:
+ gpu_mem = 4 * total_params + int(16 * total_params / total_gpus)
+ cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor
+
+ return int(cpu_mem), int(gpu_mem)
+
+
+def model_to_params(model):
+ # shared params calculated only once
+ total_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+ return total_params
+
+
+def estimate_zero2_model_states_mem_needs_all_live(model,
+ num_gpus_per_node=1,
+ num_nodes=1,
+ additional_buffer_factor=1.5):
+ """
+ Print out estimates on memory usage requirements for ZeRO 2 params, optim states and gradients
+ for a given ``model`` and hardware setup.
+
+ If you have an actual model object, use this function and everything will be derived
+ automatically.
+
+ If it's a hypothetical model, use ``estimate_zero2_model_states_mem_needs_all_cold`` where you have to pass
+ the ``total_params`` explicitly.
+
+ Args:
+ - ``model``: ``nn.Module`` object
+ - ``num_gpus_per_node``: how many gpus per node (defaults to 1)
+ - ``num_nodes``: how many nodes (defaults to 1)
+ - ``additional_buffer_factor``: estimation factor (defaults to 1.5)
+
+ """
+
+ total_params = model_to_params(model)
+
+ estimate_zero2_model_states_mem_needs_all_cold(total_params=total_params,
+ num_gpus_per_node=num_gpus_per_node,
+ num_nodes=num_nodes,
+ additional_buffer_factor=additional_buffer_factor)
+
+
+def estimate_zero2_model_states_mem_needs_all_cold(total_params,
+ num_gpus_per_node=1,
+ num_nodes=1,
+ additional_buffer_factor=1.5):
+ """
+ Print out estimates on memory usage requirements for ZeRO 2 params, optim states and gradients
+ for a given ``model`` and hardware setup.
+
+ If it's a hypothetical model, use this function where you have to pass
+ the ``total_params`` explicitly.
+
+ If you have an actual model object, use ``estimate_zero2_model_states_mem_needs_all_live`` and everything
+ will be derived automatically.
+
+ Args:
+ - ``total_params``: total model params
+ - ``num_gpus_per_node``: how many gpus per node (defaults to 1)
+ - ``num_nodes``: how many nodes (defaults to 1)
+ - ``additional_buffer_factor``: estimation factor (defaults to 1.5)
+
+ """
+
+ def format_options(cpu_offload):
+ enabled = []
+ device = f'{OffloadDeviceEnum.cpu:4}' if cpu_offload else "none"
+ enabled.append(f"offload_optimizer={device}")
+ return ", ".join(enabled)
+
+ nodes_str = "nodes" if num_nodes > 1 else "node"
+ gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU"
+ print("Estimated memory needed for params, optim states and gradients for a:\n"
+ f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n"
+ f"SW: Model with {int(total_params / 1e6)}M total params.")
+ print(" per CPU | per GPU | Options")
+ for cpu_offload in [True, False]:
+ cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params=total_params,
+ num_gpus_per_node=num_gpus_per_node,
+ num_nodes=num_nodes,
+ cpu_offload=cpu_offload,
+ additional_buffer_factor=additional_buffer_factor)
+
+ options_str = format_options(cpu_offload=cpu_offload)
+ print(f" {cpu_mem / 2 ** 30:7.2f}GB | {gpu_mem / 2 ** 30:6.2f}GB | {options_str}")
diff --git a/opensora/adaptor/utils.py b/opensora/adaptor/utils.py
new file mode 100644
index 000000000..c32ff6efc
--- /dev/null
+++ b/opensora/adaptor/utils.py
@@ -0,0 +1,1042 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Copyright NVIDIA/Megatron
+
+Helper functions and classes from multiple sources.
+"""
+
+from collections.abc import Iterable
+from deepspeed.moe.utils import is_moe_param
+import os
+import psutil
+import gc
+from math import sqrt
+from packaging import version as pkg_version
+
+import torch
+from deepspeed import comm as dist
+
+try:
+ from torch._six import inf
+except ModuleNotFoundError:
+ from torch import inf
+
+from deepspeed.utils import groups, logger
+from deepspeed.runtime.constants import PIPE_REPLICATED
+from numpy import prod
+from deepspeed.accelerator import get_accelerator
+
+from deepspeed.module_inject.policy import transpose
+from torch.nn import functional as F
+
+torch_memory_reserved = get_accelerator().memory_reserved
+torch_max_memory_reserved = get_accelerator().max_memory_reserved
+
+
+class DummyOptim():
+ """
+ Dummy optimizer presents model parameters as a param group, this is
+ primarily used to allow ZeRO-3 without an optimizer
+ """
+
+ def __init__(self, params):
+ self.param_groups = []
+ self.param_groups.append({'params': params})
+
+
+graph_cache = {}
+
+
+def graph_process(replay_first_step, func, *args, **kwargs):
+ # `func` should only contain operations on the GPU
+ # Please ensure that the memory address of the data required by 'func' remains constant
+ if func.__name__ not in graph_cache:
+ cuda_stream = get_accelerator().Stream()
+ cuda_stream.wait_stream(get_accelerator().current_stream())
+ with get_accelerator().stream(cuda_stream):
+ func(*args, **kwargs)
+ get_accelerator().current_stream().wait_stream(cuda_stream)
+ graph_cache[func.__name__] = get_accelerator().create_graph()
+ with get_accelerator().capture_to_graph(graph_cache[func.__name__]):
+ func(*args, **kwargs)
+ if replay_first_step:
+ get_accelerator().replay_graph(graph_cache[func.__name__])
+ else:
+ get_accelerator().replay_graph(graph_cache[func.__name__])
+
+
+def noop_decorator(func):
+ return func
+
+
+class noop_context(object):
+
+ def __init__(self):
+ pass
+
+ def __enter__(self):
+ pass
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ pass
+
+
+def ensure_directory_exists(filename):
+ """Create the directory path to ``filename`` if it does not already exist.
+
+ Args:
+ filename (str): A file path.
+ """
+ dirname = os.path.dirname(filename)
+ os.makedirs(dirname, exist_ok=True)
+
+
+def set_random_seed(seed):
+ """Set the random seed for common PRNGs used during training: random, numpy, and torch.
+
+ Args:
+ seed (int): the seed to use
+ """
+ import numpy
+ import random
+ random.seed(seed)
+ numpy.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def is_model_parallel_parameter(p) -> bool:
+ if hasattr(p, 'model_parallel') and p.model_parallel:
+ return True
+
+ if hasattr(p, 'tensor_model_parallel') and p.tensor_model_parallel:
+ return True
+
+ return False
+
+
+def bwc_tensor_model_parallel_rank(mpu=None):
+ """Backwards-compatible way of querying the tensor model parallel rank from
+ an ``mpu`` object.
+
+ *Tensor* model parallelism means that tensors are physically split across
+ processes. This contrasts with *pipeline* model parallelism, in which the
+ layers are partitioned but tensors left intact.
+
+ The API for tensor model parallelism has changed across versions and this
+ helper provides a best-effort implementation across versions of ``mpu``
+ objects. The preferred mechanism is
+ ``mpu.get_tensor_model_parallel_rank()``.
+
+ This should "just work" with both Megatron-LM and DeepSpeed's pipeline
+ parallelism.
+
+ Args:
+ mpu (model parallel unit, optional): An object exposing the model parallel
+ query API (e.g. Megatron's mpu module). If ``mpu=None``, returns 0.
+ Defaults to ``None``.
+
+ Returns:
+ int: the rank
+ """
+ if mpu is None:
+ # No model parallelism, so the tensor model parallel rank is trivially 0
+ return 0
+
+ if hasattr(mpu, 'get_tensor_model_parallel_rank'):
+ # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
+ return mpu.get_tensor_model_parallel_rank()
+ elif hasattr(mpu, 'get_slice_parallel_rank'):
+ # Some DeepSpeed + pipeline parallelism versions
+ return mpu.get_slice_parallel_rank()
+ else:
+ # Deprecated Megatron and DeepSpeed convention
+ return mpu.get_model_parallel_rank()
+
+
+def copy_to_device(item, device, criterion_func):
+ """
+ Return a copy of tensor on specified device.
+ Works on individual tensors, and tensors contained/nested in lists, tuples, and dicts.
+ Parameters:
+ item: tensor to copy or (possibly nested) container of tensors to copy.
+ device: target device
+ criterion_func: Function restricting the copy to items that meet the criterion
+
+ Returns:
+ A copy of ``item`` with qualifying tensors copied to ``device``; nested containers are rebuilt.
+ """
+ if criterion_func(item):
+ return item.to(device)
+ elif isinstance(item, list):
+ return [copy_to_device(v, device, criterion_func) for v in item]
+ elif isinstance(item, tuple):
+ return tuple([copy_to_device(v, device, criterion_func) for v in item])
+ elif isinstance(item, dict):
+ return {k: copy_to_device(v, device, criterion_func) for k, v in item.items()}
+ else:
+ return item
+
+
+def move_to_device(item, device, criterion_func):
+ """
+ Move tensor on to specified device by changing the storage.
+ Works on individual tensors, and tensors contained/nested in lists, tuples, and dicts.
+ Parameters:
+ item: tensor to move or (possibly nested) container of tensors to move.
+ device: target device
+ criterion_func: Function restricting the move to items that meet the criterion
+
+ Returns:
+ ``item``, with the storage of qualifying tensors moved to ``device`` in place.
+ """
+ if criterion_func(item):
+ device_copy = item.to(device)
+ item.data = device_copy.data
+ return item
+ elif isinstance(item, list):
+ return [move_to_device(v, device, criterion_func) for v in item]
+ elif isinstance(item, tuple):
+ return tuple([move_to_device(v, device, criterion_func) for v in item])
+ elif isinstance(item, dict):
+ return {k: move_to_device(v, device, criterion_func) for k, v in item.items()}
+ else:
+ return item
+
+
+class CheckOverflow(object):
+ '''Checks for gradient overflow across parallel processes'''
+
+ def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False, deepspeed=None):
+ self.mpu = mpu
+ self.params = [] if param_groups else None
+ self.zero_reduce_scatter = zero_reduce_scatter
+ self.deepspeed = deepspeed
+ self.has_moe_params = False
+ if param_groups:
+ for group in param_groups:
+ for param in group:
+ self.params.append(param)
+ if is_moe_param(param):
+ self.has_moe_params = True
+
+ def check_using_norm(self, norm_group, reduce_overflow=True):
+ # TODO: I don't think reduce_overflow is needed if mpu is None
+ overflow = -1 in norm_group
+ overflow_gpu = get_accelerator().FloatTensor([overflow])
+ if self.has_moe_params:
+ # In this case, we need to do an all_reduce across
+ # the expert_parallel_group, so that if there was
+ # an overflow due to expert weights, we detect it
+
+ # Only need to check groups.get_largest_expert_parallel_group()
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group())
+ if self.mpu is not None:
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group())
+ elif reduce_overflow:
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX)
+ dist.barrier()
+ overflow = overflow_gpu[0].item()
+ return bool(overflow)
+
+ def check(self, param_groups=None):
+ params = []
+ has_moe_params = False
+ if param_groups is None:
+ params = self.params
+ has_moe_params = self.has_moe_params
+ else:
+ assert param_groups is not None, \
+ "self.params and param_groups both cannot be none"
+
+ for group in param_groups:
+ for param in group:
+ params.append(param)
+ if is_moe_param(param):
+ has_moe_params = True
+
+ return self.has_overflow(params, has_moe_params=has_moe_params)
+
+ # `params` is a list / generator of torch.Variable
+ def has_overflow_serial(self, params):
+ for i, p in enumerate(params):
+ if p.grad is not None and self._has_inf_or_nan(p.grad.data, i):
+ return True
+ return False
+
+ def has_overflow(self, params, has_moe_params=None):
+ if has_moe_params is None:
+ has_moe_params = self.has_moe_params
+ overflow = self.has_overflow_serial(params)
+ # Since each model parallel GPU carries only part of the model,
+ # make sure overflow flag is synced across all the model parallel GPUs
+ overflow_gpu = get_accelerator().ByteTensor([overflow])
+ # deepspeed.comm.all_reduce(overflow_gpu,
+ # op=deepspeed.comm.ReduceOp.MAX,
+ # group=mpu.get_model_parallel_group())
+ if has_moe_params:
+ # All reduce this across expert_parallel_group, so that if an expert
+ # overflows, we detect it here
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group())
+ if self.zero_reduce_scatter:
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group())
+ elif self.mpu is not None:
+ if self.deepspeed is not None:
+ using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce')
+ if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce is False) or (
+ not using_pipeline and self.deepspeed.enable_backward_allreduce is False):
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group())
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group())
+ elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False:
+ dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group())
+
+ overflow = overflow_gpu[0].item()
+ return bool(overflow)
+
+ # `x` is a torch.Tensor
+ @staticmethod
+ def _has_inf_or_nan(x, i):
+ try:
+ # if x is half, the .float() incurs an additional deep copy, but it's necessary
+ # if PyTorch's .sum() creates a one-element tensor of the same type as x
+ # (which is true for some recent versions of PyTorch).
+ cpu_sum = float(x.float().sum())
+ # More efficient version that can be used if .sum() returns a Python scalar
+ # cpu_sum = float(x.sum())
+ except RuntimeError as instance:
+ # We want to check if inst is actually an overflow exception.
+ # RuntimeError could come from a different error.
+ # If so, we still want the exception to propagate.
+ if "value cannot be converted" not in instance.args[0]:
+ raise
+ return True
+ else:
+ if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+ return True
+ return False
+
+
+def _handle_overflow(cpu_sum, x, i):
+ import math
+ rank = dist.get_rank()
+ if rank == 0:
+ t_i = -1
+ for v_i, v in enumerate(x.data.contiguous().view(-1)):
+ if not math.isfinite(float(v)):
+ t_i = v_i
+ break
+ logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}")
+
+
+def get_global_norm(norm_list):
+ """ Compute total from a list of norms
+ """
+ total_norm = 0.0
+ for norm in norm_list:
+ total_norm += norm**2.0
+ # logger.info(f'norm_list = {norm_list} global = {sqrt(total_norm)}')
+ return sqrt(total_norm)
+
+
+def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None):
+ """Clips gradient norm of an iterable of parameters.
+
+ This has been adapted from Nvidia megatron. We add norm averaging
+ to consider MoE params when calculating norm as they will result
+ in different norms across different ranks.
+
+ This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
+ added functionality to handle model parallel parameters. Note that
+ the gradients are modified in place.
+
+ Arguments:
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+ single Tensor that will have gradients normalized
+ max_norm (float or int): max norm of the gradients
+ norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+ infinity norm.
+
+ Returns:
+ Total norm of the parameters (viewed as a single vector).
+ """
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+ max_norm = float(max_norm)
+ norm_type = float(norm_type)
+ if norm_type == inf:
+ total_norm = max(p.grad.data.abs().max() for p in parameters)
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ # Take max across all GPUs.
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()
+ else:
+ total_norm = 0
+ for p in parameters:
+ if mpu is not None:
+ if (mpu.get_model_parallel_rank() == 0) or is_model_parallel_parameter(p):
+ param_norm = p.grad.data.norm(norm_type)
+ total_norm += param_norm.item()**norm_type
+ else:
+ param_norm = p.grad.data.float().norm(norm_type)
+ total_norm += param_norm.item()**norm_type
+
+ # Sum across all model parallel GPUs.
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()**(1. / norm_type)
+
+ # Need to average total_norm across different GPUs due to the presence of moe params
+ pg = groups._get_data_parallel_group()
+ scaled_norm = total_norm * 1.0 / float(dist.get_world_size(group=pg))
+
+ scaled_norm_tensor = get_accelerator().FloatTensor([float(scaled_norm)])
+ dist.all_reduce(scaled_norm_tensor, group=pg)
+ total_norm = scaled_norm_tensor.item()
+
+ clip_coef = max_norm / (total_norm + 1e-6)
+ if clip_coef < 1:
+ for p in parameters:
+ p.grad.data.mul_(clip_coef)
+ return total_norm
+
+
+def get_grad_norm(parameters, norm_type=2, mpu=None):
+ """Get grad norm of an iterable of parameters.
+
+ This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
+ added functionality to handle model parallel parameters. Note that
+ the gradients are modified in place. Taken from Nvidia Megatron.
+
+ Arguments:
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+ single Tensor that will have gradients normalized
+ norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+ infinity norm.
+
+ Returns:
+ Total norm of the parameters (viewed as a single vector).
+ """
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+
+ norm_type = float(norm_type)
+ if norm_type == inf:
+ total_norm = max(p.grad.data.abs().max() for p in parameters)
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ # Take max across all GPUs.
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()
+ else:
+ total_norm = 0.
+ tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=mpu)
+ for p in parameters:
+ # Pipeline parallelism may replicate parameters. Avoid multi-counting.
+ if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated:
+ continue
+
+ # Filter to avoid over-counting replicated tensors from tensor
+ # model parallelism
+ if (tensor_mp_rank > 0) and not is_model_parallel_parameter(p):
+ continue
+
+ param_norm = p.grad.data.float().norm(norm_type)
+ total_norm += param_norm.item()**norm_type
+
+ # Sum across all model parallel GPUs.
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()**(1. / norm_type)
+
+ if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
+ total_norm = -1
+
+ return total_norm
+
+
+def get_grad_zeros(parameters, mpu=None):
+ """Compute the number of grads with zero values.
+
+ This is adapted from get_grad_norm
+
+ Arguments:
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+ single Tensor that will have gradients normalized
+
+ Returns:
+ Total number of params with zero values (viewed as a single vector).
+ """
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+
+ total_zeros = 0.
+ tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=mpu)
+ for p in parameters:
+ # Pipeline parallelism may replicate parameters. Avoid multi-counting.
+ if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated:
+ continue
+
+ # Filter to avoid over-counting replicated tensors from tensor
+ # model parallelism
+ if (tensor_mp_rank > 0) and not is_model_parallel_parameter(p):
+ continue
+
+ count_zeros = p.grad.numel() - torch.count_nonzero(p.grad)
+ total_zeros += count_zeros.item()
+
+ # Sum across all model parallel GPUs.
+ total_zeros_cuda = get_accelerator().FloatTensor([float(total_zeros)])
+ if mpu is not None:
+ dist.all_reduce(total_zeros_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group())
+ total_zeros = total_zeros_cuda[0].item()
+
+ return total_zeros
+
+
+def get_weight_norm(parameters, norm_type=2, mpu=None):
+ """Get norm of an iterable of parameters.
+
+ This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
+ added functionality to handle model parallel parameters. Note that
+ the gradients are modified in place. Taken from Nvidia Megatron.
+
+ Arguments:
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+ single Tensor that will have gradients normalized
+ norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+ infinity norm.
+
+ Returns:
+ Total norm of the parameters (viewed as a single vector).
+ -1 if the norm value is NaN or Inf.
+ """
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+
+ norm_type = float(norm_type)
+ if norm_type == inf:
+ total_norm = max(p.data.abs().max() for p in parameters)
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ # Take max across all GPUs.
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()
+ else:
+ total_norm = 0.
+ tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=mpu)
+ for p in parameters:
+ # Pipeline parallelism may replicate parameters. Avoid multi-counting.
+ if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated:
+ continue
+
+ # Filter to avoid over-counting replicated tensors from tensor
+ # model parallelism
+ if (tensor_mp_rank > 0) and not is_model_parallel_parameter(p):
+ continue
+
+ param_norm = p.data.float().norm(norm_type)
+ total_norm += param_norm**norm_type
+
+ # Sum across all model parallel GPUs.
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()**(1. / norm_type)
+
+ if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
+ total_norm = -1
+
+ return total_norm
+
+
+def prefix_sum_inc(weights):
+ """ Compute an inclusive prefix sum.
+
+ Example:
+ >>> prefix_sum_inc([3,4,5])
+ [3, 7, 12]
+ """
+ weights_ = [w for w in weights]
+ for x in range(1, len(weights_)):
+ weights_[x] += weights_[x - 1]
+ return weights_
+
+
+def partition_uniform(num_items, num_parts):
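+ """Split ``num_items`` into ``num_parts`` contiguous chunks, returned as
+ ``num_parts + 1`` offsets (CSR-style rowptr), e.g. partition_uniform(10, 3)
+ gives [0, 4, 7, 10].
+ """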
+ import numpy
+ parts = [0] * (num_parts + 1)
+ # First check for the trivial edge case
+ if num_items <= num_parts:
+ for p in range(num_parts + 1):
+ parts[p] = min(p, num_items)
+ return parts
+
+ chunksize = num_items // num_parts
+ residual = num_items - (chunksize * num_parts)
+
+ parts = numpy.arange(0, (num_parts + 1) * chunksize, chunksize)
+
+ for i in range(residual):
+ parts[i + 1:] += 1
+ parts = parts.tolist()
+
+ return parts
+
+
+def partition_balanced(weights, num_parts):
+ """
+ Use dynamic programming to solve the linear partition problem: split
+ ``weights`` into ``num_parts`` contiguous chunks while minimizing the gap
+ between the heaviest and lightest chunk.
+ see https://www8.cs.umu.se/kurser/TDBAfl/VT06/algorithms/BOOK/BOOK2/NODE45.HTM
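+
+ Example:
+ >>> partition_balanced([1, 2, 3, 4], num_parts=2)
+ [0, 3, 4]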
+ """
+ import numpy as np
+ n = len(weights)
+ m = num_parts
+
+ if n <= m:
+ return partition_uniform(n, m)
+
+ dp_max = np.full((n + 1, m + 1), np.inf)
+ dp_min = np.full((n + 1, m + 1), np.inf)
+ dp_cost = np.full((n + 1, m + 1), np.inf)
+ position = np.zeros((n + 1, m + 1), dtype=int)
+ prefix_sum = np.zeros((n + 1))
+ prefix_sum[1:] = np.cumsum(weights)
+
+ dp_max[0, 0] = 0
+ dp_cost[0, 0] = 0
+ for i in range(1, n + 1):
+ for j in range(1, min(i, m) + 1):
+ for k in range(i):
+ max_sum = max(dp_max[k, j - 1], prefix_sum[i] - prefix_sum[k])
+ min_sum = min(dp_min[k, j - 1], prefix_sum[i] - prefix_sum[k])
+ cost = max_sum - min_sum
+ if dp_cost[i, j] >= cost:
+ dp_cost[i, j] = cost
+ dp_max[i, j] = max_sum
+ dp_min[i, j] = min_sum
+ position[i, j] = k
+
+ parts = [n]
+ for i in reversed(range(1, m + 1)):
+ parts.append(position[parts[-1], i])
+ parts.reverse()
+
+ return parts
+
+
+class PartitionedTensor:
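+ """A tensor flattened and sharded 1-D across the ranks of ``group``.
+
+ Each rank keeps only its contiguous shard in ``local_data``; ``full()``
+ re-assembles the original tensor (one all-gather when the split is even,
+ otherwise a broadcast per partition), while ``to_meta()`` / ``from_meta()``
+ let a receiver rebuild the wrapper from a meta tensor plus a local shard.
+ """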
+
+ def __init__(self, tensor, group, partition_meta=None):
+ super().__init__()
+
+ self.group = group
+ self.num_parts = dist.get_world_size(group=self.group)
+ self.rank = dist.get_rank(group=self.group)
+ self.orig_size = list(tensor.size())
+ self.orig_device = tensor.device
+ self.local_data, self.partition = self._partition_tensor(tensor)
+ self.even_split = tensor.numel() % self.num_parts == 0
+
+ @classmethod
+ def from_meta(cls, meta, local_part, group, device=get_accelerator().device_name()):
+ assert meta.dtype == torch.long
+ dummy = torch.ones(dist.get_world_size(group=group))
+ part_obj = cls(tensor=dummy, group=group)
+
+ meta = meta.tolist()
+
+ # [N, list0, ..., listN-1]
+ part_obj.orig_size = meta[1:(1 + meta[0])]
+ meta = meta[1 + meta[0]:]
+
+ part_obj.orig_device = device
+ part_obj.local_data = local_part.detach()
+
+ part_obj.group = group
+
+ # Partition is encoded like the rowptr of a CSR matrix:
+ # [num_parts, rank, 0, part_1, ..., part_num_parts]
+ # TODO: support shuffle between different partition granularities
+ assert part_obj.num_parts == meta[0]
+ assert part_obj.rank == meta[1]
+ part_obj.partition = meta[2:] # length num_parts+1
+
+ return part_obj
+
+ def _partition_tensor(self, tensor):
+ partition = partition_uniform(num_items=tensor.numel(), num_parts=self.num_parts)
+ start = partition[self.rank]
+ length = partition[self.rank + 1] - start
+ tensor_part = tensor.detach().contiguous().view(-1).narrow(0, start=start, length=length).clone()
+
+ return tensor_part, partition
+
+ def full(self, device=None):
+ if device is None:
+ device = self.orig_device
+
+ # Allocate the full tensor as a flat buffer.
+ full_numel = prod(self.full_size())
+ flat_tensor = torch.zeros([full_numel], dtype=self.local_data.dtype, device=device)
+ if self.even_split:
+ # Collect the full tensor
+ dist.all_gather_into_tensor(flat_tensor, self.local_data, group=self.group)
+ else:
+ for part_id in range(self.num_parts):
+ part_size = self.partition[part_id + 1] - self.partition[part_id]
+ buf = flat_tensor.narrow(0, start=self.partition[part_id], length=part_size)
+ if part_id == self.rank:
+ buf.copy_(self.local_data)
+ dist.broadcast(buf, part_id, self.group)
+ return flat_tensor.view(self.full_size()).clone().detach()
+
+ def to_meta(self):
+ """Returns a torch.LongTensor that encodes partitioning information.
+
+ Can be used along with ``data()`` to serialize a ``PartitionedTensor`` for
+ communication.
+
+ Returns:
+ torch.LongTensor: a tensor encoding the meta-information for the partitioning
+ """
+ meta = []
+ meta.append(len(self.orig_size))
+ meta += list(self.orig_size)
+ meta.append(self.num_parts)
+ meta.append(self.rank)
+ meta += self.partition
+ return torch.LongTensor(data=meta).to(self.orig_device)
+
+ def data(self):
+ return self.local_data
+
+ def local_size(self):
+ return self.local_data.size()
+
+ def full_size(self):
+ return self.orig_size
+
+
+mem_alloced = 0
+mem_cached = 0
+
+
+def memory_status(msg, print_rank=-1, reset_max=False):
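+ """Log current, delta-since-last-call, and peak accelerator memory (in GB),
+ either on every rank (``print_rank == -1``) or only on ``print_rank``;
+ ``reset_max`` clears the peak counters before sampling."""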
+ global mem_alloced, mem_cached
+
+ rank = dist.get_rank()
+ if print_rank != -1 and rank != print_rank:
+ return
+
+ get_accelerator().synchronize()
+
+ if reset_max:
+ get_accelerator().reset_max_memory_cached()
+ get_accelerator().reset_max_memory_allocated()
+
+ new_alloced = get_accelerator().memory_allocated()
+ new_cached = get_accelerator().memory_cached()
+
+ delta_alloced = new_alloced - mem_alloced
+ delta_cached = new_cached - mem_cached
+
+ mem_cached = new_cached
+ mem_alloced = new_alloced
+
+ max_alloced = get_accelerator().max_memory_allocated()
+ max_cached = get_accelerator().max_memory_cached()
+
+ # convert to GB for printing
+ new_alloced /= 1024**3
+ new_cached /= 1024**3
+ delta_alloced /= 1024**3
+ delta_cached /= 1024**3
+ max_alloced /= 1024**3
+ max_cached /= 1024**3
+
+ print(
+ f'RANK={rank} MEMSTATS', msg, f'device={get_accelerator().current_device_name()} '
+ f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) '
+ f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)')
+
+
+def get_ma_status():
+ if dist.is_initialized() and not dist.get_rank() == 0:
+ return 0
+ return get_accelerator().memory_allocated()
+
+
+def empty_cache():
+ get_accelerator().empty_cache()
+ get_accelerator().reset_peak_memory_stats()
+
+
+def see_memory_usage(message, force=False):
+ if not force:
+ return
+ if dist.is_initialized() and not dist.get_rank() == 0:
+ return
+
+ # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
+ gc.collect()
+
+ # Print message except when distributed but not rank 0
+ logger.info(message)
+ logger.info(f"MA {round(get_accelerator().memory_allocated() / (1024 * 1024 * 1024),2 )} GB \
+ Max_MA {round(get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \
+ CA {round(torch_memory_reserved() / (1024 * 1024 * 1024),2)} GB \
+ Max_CA {round(torch_max_memory_reserved() / (1024 * 1024 * 1024))} GB ")
+
+ vm_stats = psutil.virtual_memory()
+ used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2)
+ logger.info(f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%')
+
+ # get the peak memory to report correct data, so reset the counter for the next call
+ get_accelerator().reset_peak_memory_stats()
+
+
+def call_to_str(base, *args, **kwargs):
+ """Construct a string representation of a call.
+
+ Args:
+ base (str): name of the call
+ args (tuple, optional): args to ``base``
+ kwargs (dict, optional): kwargs supplied to ``base``
+
+ Returns:
+ str: A string representation of base(*args, **kwargs)
+ """
+ name = f'{base}('
+ if args:
+ name += ', '.join(repr(arg) for arg in args)
+ if kwargs:
+ name += ', '
+ if kwargs:
+ name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items())
+ name += ')'
+ return name
+
+
+def get_only_unique_item(items):
+ item_set = set(items)
+ if len(item_set) != 1:
+ raise RuntimeError(f"expected there to be only one unique element in {items}")
+ unique_item, = item_set
+
+ return unique_item
+
+
+def clip_gradients(parameters, max_norm=1.0, global_grad_norm=None, mpu=None, eps=1e-6):
+ """Clip the gradient of a list of parameters.
+ Args:
+ parameters: List of parameters whose .grad will be clipped.
+ max_norm (float, optional): maximum allowed gradient norm. Defaults to 1.0.
+ global_grad_norm (float, optional): Precomputed gradient norm. Defaults to None.
+ mpu (optional): model parallelism unit. Defaults to None.
+ eps (float, optional): epsilon value added to grad norm. Defaults to 1e-6
+ Returns:
+ float: the global gradient norm
+ """
+ if global_grad_norm is None:
+ global_grad_norm = get_grad_norm(parameters, mpu=mpu)
+ clip_coef = max_norm / (global_grad_norm + eps)
+ if clip_coef < 1:
+ for p in parameters:
+ p.grad.detach().mul_(clip_coef)
+ return global_grad_norm
+
+
+def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=False):
+ """Get norm of an iterable of tensors.
+
+ This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
+ added functionality to handle model parallel parameters. Taken from Nvidia Megatron.
+
+ Arguments:
+ input_tensors (Iterable[Tensor]): an iterable of Tensors will have norm computed
+ norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+ infinity norm.
+
+ Returns:
+ Total norm of the tensors (viewed as a single vector).
+ """
+ assert isinstance(input_tensors, Iterable), f'expected Iterable type not {type(input_tensors)}'
+ assert all([torch.is_tensor(t) for t in input_tensors]), 'expected list of only tensors'
+
+ norm_type = float(norm_type)
+ if norm_type == inf:
+ total_norm = max(t.data.abs().max() for t in input_tensors)
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()
+ else:
+ if use_graph:
+ if 'norm_tensors_compute_buffer' not in graph_cache:
+ graph_cache['norm_tensors_compute_buffer'] = [t.data.float().norm(norm_type) for t in input_tensors]
+ compute_buffer = graph_cache['norm_tensors_compute_buffer']
+
+ def _norm_tensors(tensor_list, _compute_buffer, _norm_type):
+ for i, t in enumerate(tensor_list):
+ _compute_buffer[i].data.copy_(t.data.float().norm(_norm_type)**_norm_type)
+ if i != 0:
+ _compute_buffer[0].data.add_(_compute_buffer[i].data)
+
+ graph_process(False, _norm_tensors, input_tensors, compute_buffer, norm_type)
+
+ total_norm = compute_buffer[0]
+ else:
+ total_norm = sum([t.data.float().norm(norm_type).item()**norm_type for t in input_tensors])
+
+ total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]).detach()
+ if mpu is not None:
+ dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group())
+ total_norm = total_norm_cuda[0].item()**(1. / norm_type)
+
+ if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
+ total_norm = -1
+
+ return total_norm
+
+
+def clip_tensors_by_global_norm(input_tensors, max_norm=1.0, global_norm=None, mpu=None, eps=1e-6, use_graph=False):
+ """Clip list of tensors by global norm.
+ Args:
+ input_tensors: List of tensors to be clipped
+ max_norm (float, optional): maximum allowed norm. Defaults to 1.0.
+ global_norm (float, optional): Precomputed norm. Defaults to None.
+ mpu (optional): model parallelism unit. Defaults to None.
+ eps (float, optional): epsilon value added to grad norm. Defaults to 1e-6
+ Returns:
+ float: the global norm
+ """
+ if global_norm is None:
+ global_norm = get_global_norm_of_tensors(input_tensors, mpu=mpu, use_graph=use_graph)
+ clip_coef = max_norm / (global_norm + eps)
+ if clip_coef < 1:
+ if use_graph:
+
+ def clip_tensors(_tensor_list, _clip_coef_tensor):
+ for t in _tensor_list:
+ t.detach().mul_(_clip_coef_tensor)
+
+ if 'clip_coef_tensor' not in graph_cache:
+ # Alloc memory
+ graph_cache['clip_coef_tensor'] = torch.tensor(clip_coef,
+ dtype=torch.float32).to(get_accelerator().device_name())
+ clip_coef_tensor = graph_cache['clip_coef_tensor']
+ clip_coef_tensor.copy_(torch.tensor(clip_coef, dtype=torch.float32))
+ graph_process(False, clip_tensors, input_tensors, clip_coef_tensor)
+
+ else:
+ for t in input_tensors:
+ t.detach().mul_(clip_coef)
+ return global_norm
+
+
+def align_dense_tensors(tensor_list, alignment):
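+ """Return ``tensor_list``, appending a zero pad tensor if needed so the
+ total numel is a multiple of ``alignment``."""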
+ num_elements = sum(t.numel() for t in tensor_list)
+ remaining = num_elements % alignment
+
+ if remaining:
+ elements_to_add = alignment - remaining
+ pad_tensor = torch.zeros(elements_to_add, device=tensor_list[0].device, dtype=tensor_list[0].dtype)
+ padded_tensor_list = tensor_list + [pad_tensor]
+ else:
+ padded_tensor_list = tensor_list
+
+ return padded_tensor_list
+
+
+def all_gather_into_tensor_dp_groups(groups_flat, partitioned_param_groups, zp_process_group, dp_process_group=None):
+
+ for group_id, (group_flat, partitioned_params) in enumerate(zip(groups_flat, partitioned_param_groups)):
+ partition_id = dist.get_rank(group=zp_process_group[group_id])
+ dp_world_size = dist.get_world_size(group=dp_process_group)
+ if dp_world_size == 1:
+ # no groups share optimizer states
+ # pipeline parallel with bf16 will default call this even if dp size = 1.
+ continue
+ # print("call contiguous for all_gather_into_tensor_dp_groups")
+ dist.all_gather_into_tensor(group_flat, partitioned_params[partition_id].contiguous(), dp_process_group)
+
+
+def all_gather_dp_groups(groups_flat, partitioned_param_groups, zp_process_group, start_alignment_factor,
+ allgather_bucket_size, dp_process_group=None):
+ # if dist.has_all_gather_into_tensor():
+ return all_gather_into_tensor_dp_groups(groups_flat, partitioned_param_groups, zp_process_group, dp_process_group)
+
+ # for group_id, partitioned_params in enumerate(partitioned_param_groups):
+ # # Sequential AllGather Best of both worlds
+ # partition_id = dist.get_rank(group=dp_process_group[group_id])
+ # dp_world_size = dist.get_world_size(group=dp_process_group[group_id])
+ #
+ # if dp_world_size == 1:
+ # # no groups share optimizer states
+ # # pipeline parallel with bf16 will default call this even if dp size = 1.
+ # continue
+ # num_shards = max(1, partitioned_params[partition_id].numel() * dp_world_size // allgather_bucket_size)
+ #
+ # shard_size = partitioned_params[partition_id].numel() // num_shards
+ #
+ # # Enforce nccl/rccl alignment of start location of each shard
+ # shard_size = shard_size - (shard_size % start_alignment_factor)
+ #
+ # num_elements = shard_size
+ #
+ # assert shard_size * num_shards <= partitioned_params[partition_id].numel()
+ #
+ # for shard_id in range(num_shards):
+ #
+ # if shard_id == (num_shards - 1):
+ # num_elements = partitioned_params[partition_id].numel() - shard_id * shard_size
+ #
+ # shard_list = []
+ # for dp_id in range(dp_world_size):
+ # curr_shard = partitioned_params[dp_id].narrow(0, shard_id * shard_size, num_elements).detach()
+ # shard_list.append(curr_shard)
+ # dist.all_gather(shard_list, shard_list[partition_id].contiguous(), dp_process_group[group_id])
+
+
+class TLinear(torch.nn.Linear):
+
+ def __init__(self, orig_layer, name=""):
+ self.name = name
+ super().__init__(orig_layer.weight.shape[1], orig_layer.weight.shape[0], bias=(orig_layer.bias is not None))
+ self.weight.data = transpose(orig_layer.weight.data)
+ self.bias = orig_layer.bias
+ self._fwd_func = self._fwd_bias_add if self.bias is not None else self._fwd
+
+ def _fwd(self, input):
+ return F.linear(input, self.weight)
+
+ def _fwd_bias_add(self, input):
+ return F.linear(input, self.weight, bias=self.bias)
+
+ def forward(self, input):
+ return self._fwd_func(input)
+
+
+def get_inactive_params(param_list):
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+ return [param for param in param_list if (hasattr(param, 'ds_id') and \
+ param.ds_status == ZeroParamStatus.NOT_AVAILABLE)]
+
+
+def required_torch_version(min_version=None, max_version=None):
+ assert min_version or max_version, "Must provide a min_version or max_version argument"
+
+ torch_version = pkg_version.parse(torch.__version__)
+
+ if min_version and pkg_version.parse(str(min_version)) > torch_version:
+ return False
+
+ if max_version and pkg_version.parse(str(max_version)) < torch_version:
+ return False
+
+ return True
diff --git a/opensora/adaptor/zp_manager.py b/opensora/adaptor/zp_manager.py
new file mode 100644
index 000000000..e05bfdbcd
--- /dev/null
+++ b/opensora/adaptor/zp_manager.py
@@ -0,0 +1,31 @@
+import torch
+import os
+import torch.distributed as dist
+
+
+class ZPManager(object):
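+ """Carves the world into consecutive blocks of ``zp_size`` ranks and exposes
+ this rank's block as ``zp_group``, apparently consumed by the adaptor's ZeRO
+ optimizer to shard its states. Rank and world size are read from the
+ ``RANK`` / ``WORLD_SIZE`` environment variables."""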
+ def __init__(self, zp_size=8):
+ self.rank = int(os.getenv('RANK', '0'))
+ self.world_size = int(os.getenv("WORLD_SIZE", '1'))
+ self.zp_size = zp_size
+ self.zp_group = None
+ self.zp_rank = None
+ self.is_initialized = False
+
+ def init_group(self):
+ """Initialize the zp process groups: consecutive blocks of ``zp_size`` ranks."""
+ if self.is_initialized:
+ return
+
+ self.is_initialized = True
+
+ num_zp_groups: int = self.world_size // self.zp_size
+ for i in range(num_zp_groups):
+ ranks = range(i * self.zp_size, (i + 1) * self.zp_size)
+ group = dist.new_group(ranks)
+ if self.rank in ranks:
+ self.zp_group = group
+ self.zp_rank = self.rank % self.zp_size
+
+
+zp_manager = ZPManager()
diff --git a/opensora/dataset/__init__.py b/opensora/dataset/__init__.py
index 5b64611d5..6ce2eea45 100644
--- a/opensora/dataset/__init__.py
+++ b/opensora/dataset/__init__.py
@@ -1,59 +1,129 @@
from torchvision.transforms import Compose
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoImageProcessor
-from .feature_datasets import T2V_Feature_dataset, T2V_T5_Feature_dataset
from torchvision import transforms
from torchvision.transforms import Lambda
-from .t2v_datasets import T2V_dataset
-from .transform import ToTensorVideo, TemporalRandomCrop, RandomHorizontalFlipVideo, CenterCropResizeVideo, LongSideResizeVideo, SpatialStrideCropVideo
+import torch_npu
+from torch_npu.contrib import transfer_to_npu
+from opensora.dataset.t2v_datasets import T2V_dataset
+from opensora.models.causalvideovae import ae_norm, ae_denorm
+from opensora.dataset.transform import ToTensorVideo, TemporalRandomCrop, RandomHorizontalFlipVideo, CenterCropResizeVideo, LongSideResizeVideo, SpatialStrideCropVideo, NormalizeVideo, ToTensorAfterResize
-ae_norm = {
- 'CausalVAEModel_4x8x8': Lambda(lambda x: 2. * x - 1.),
- 'CausalVQVAEModel_4x4x4': Lambda(lambda x: x - 0.5),
- 'CausalVQVAEModel_4x8x8': Lambda(lambda x: x - 0.5),
- 'VQVAEModel_4x4x4': Lambda(lambda x: x - 0.5),
- 'VQVAEModel_4x8x8': Lambda(lambda x: x - 0.5),
- "bair_stride4x2x2": Lambda(lambda x: x - 0.5),
- "ucf101_stride4x4x4": Lambda(lambda x: x - 0.5),
- "kinetics_stride4x4x4": Lambda(lambda x: x - 0.5),
- "kinetics_stride2x4x4": Lambda(lambda x: x - 0.5),
- 'stabilityai/sd-vae-ft-mse': transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- 'stabilityai/sd-vae-ft-ema': transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- 'vqgan_imagenet_f16_1024': Lambda(lambda x: 2. * x - 1.),
- 'vqgan_imagenet_f16_16384': Lambda(lambda x: 2. * x - 1.),
- 'vqgan_gumbel_f8': Lambda(lambda x: 2. * x - 1.),
-
-}
-ae_denorm = {
- 'CausalVAEModel_4x8x8': lambda x: (x + 1.) / 2.,
- 'CausalVQVAEModel_4x4x4': lambda x: x + 0.5,
- 'CausalVQVAEModel_4x8x8': lambda x: x + 0.5,
- 'VQVAEModel_4x4x4': lambda x: x + 0.5,
- 'VQVAEModel_4x8x8': lambda x: x + 0.5,
- "bair_stride4x2x2": lambda x: x + 0.5,
- "ucf101_stride4x4x4": lambda x: x + 0.5,
- "kinetics_stride4x4x4": lambda x: x + 0.5,
- "kinetics_stride2x4x4": lambda x: x + 0.5,
- 'stabilityai/sd-vae-ft-mse': lambda x: 0.5 * x + 0.5,
- 'stabilityai/sd-vae-ft-ema': lambda x: 0.5 * x + 0.5,
- 'vqgan_imagenet_f16_1024': lambda x: (x + 1.) / 2.,
- 'vqgan_imagenet_f16_16384': lambda x: (x + 1.) / 2.,
- 'vqgan_gumbel_f8': lambda x: (x + 1.) / 2.,
-}
+from opensora.dataset.inpaint_datasets import Inpaint_dataset
+from ultralytics import YOLO
def getdataset(args):
- temporal_sample = TemporalRandomCrop(args.num_frames * args.sample_rate) # 16 x
+ temporal_sample = TemporalRandomCrop(args.num_frames)
norm_fun = ae_norm[args.ae]
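+ # Two resize policies: force_resolution center-crops every clip to the fixed
+ # (max_height, max_width); otherwise the long side is resized with the aspect
+ # ratio kept and a stride-aligned crop makes H/W multiples of hw_stride.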
+ if args.force_resolution:
+ resize = [CenterCropResizeVideo((args.max_height, args.max_width)), ]
+ else:
+ resize = [
+ LongSideResizeVideo((args.max_height, args.max_width), skip_low_resolution=True),
+ SpatialStrideCropVideo(stride=args.hw_stride),
+ ]
+ transform = transforms.Compose([
+ ToTensorVideo(),
+ *resize,
+ norm_fun
+ ])
+ inpaint_transform = transforms.Compose([
+ *resize
+ ])
+ tokenizer = AutoTokenizer.from_pretrained("/home/image_data/mt5-xxl", cache_dir=args.cache_dir)
+ YOLOmodel = YOLO("/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt")
+ # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir)
+ # tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
if args.dataset == 't2v':
- transform = transforms.Compose([
- ToTensorVideo(),
- LongSideResizeVideo(args.max_image_size, skip_low_resolution=True),
- SpatialStrideCropVideo(args.stride),
- # RandomHorizontalFlipVideo(p=0.5), # in case their caption have position decription
- norm_fun
- ])
- tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
return T2V_dataset(args, transform=transform, temporal_sample=temporal_sample, tokenizer=tokenizer)
+ elif args.dataset == 'inpaint' or args.dataset == 'i2v':
+ mask_processor = transforms.Compose([*resize])
+ return Inpaint_dataset(args, transform=inpaint_transform, temporal_sample=temporal_sample, tokenizer=tokenizer)
raise NotImplementedError(args.dataset)
+
+
+if __name__ == "__main__":
+ from accelerate import Accelerator
+ from opensora.dataset.t2v_datasets import dataset_prog
+ from opensora.utils.dataset_utils import LengthGroupedSampler, Collate
+ from torch.utils.data import DataLoader
+ import random
+ from torch import distributed as dist
+ from tqdm import tqdm
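+ # Throw-away namespace standing in for the training argparse config; only the
+ # fields consumed by getdataset(), LengthGroupedSampler and Collate are set.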
+ args = type('args', (),
+ {
+ 'ae': 'CausalVAEModel_D8_4x8x8',
+ 'dataset': 't2v',
+ 'attention_mode': 'xformers',
+ 'use_rope': True,
+ 'model_max_length': 300,
+ 'max_height': 320,
+ 'max_width': 320,
+ 'hw_stride': 32,
+ 'skip_low_resolution': True,
+ 'num_frames': 93,
+ 'use_image_num': 0,
+ 'compress_kv_factor': 1,
+ 'interpolation_scale_t': 1,
+ 'interpolation_scale_h': 1,
+ 'interpolation_scale_w': 1,
+ 'cache_dir': '../cache_dir',
+ 'data': 'scripts/train_data/merge_data_debug.txt',
+ 'train_fps': 16,
+ 'drop_short_ratio': 0.0,
+ 'use_img_from_vid': False,
+ 'speed_factor': 1.0,
+ 'cfg': 0.1,
+ 'text_encoder_name': 'google/mt5-xxl',
+ 'dataloader_num_workers': 10,
+ 'use_motion': False,
+ 'force_resolution': False,
+ 'use_decord': True,
+ 'group_data': True,
+ 'train_batch_size': 1,
+ 'gradient_accumulation_steps': 1,
+ 'ae_stride': 8,
+ 'ae_stride_t': 4,
+ 'patch_size': 2,
+ 'patch_size_t': 1,
+ }
+ )
+ accelerator = Accelerator()
+ dataset = getdataset(args)
+ # data = next(iter(dataset))
+ # import ipdb;ipdb.set_trace()
+ # print()
+ sampler = LengthGroupedSampler(
+ args.train_batch_size,
+ world_size=accelerator.num_processes,
+ gradient_accumulation_size=args.gradient_accumulation_steps,
+ initial_global_step=0,
+ lengths=dataset.lengths,
+ group_data=args.group_data,
+ )
+ train_dataloader = DataLoader(
+ dataset,
+ shuffle=False,
+ # pin_memory=True,
+ collate_fn=Collate(args),
+ batch_size=args.train_batch_size,
+ num_workers=args.dataloader_num_workers,
+ sampler=sampler,
+ drop_last=False,
+ prefetch_factor=4
+ )
+ import imageio
+ import numpy as np
+ from einops import rearrange
+ while True:
+ for idx, i in enumerate(tqdm(train_dataloader)):
+ import ipdb;ipdb.set_trace()
+ pixel_values = i[0][0]
+ pixel_values_ = (pixel_values+1)/2
+ pixel_values_ = rearrange(pixel_values_, 'c t h w -> t h w c') * 255.0
+ pixel_values_ = pixel_values_.numpy().astype(np.uint8)
+ imageio.mimwrite(f'output{idx}.mp4', pixel_values_, fps=args.train_fps)
+ dist.barrier()
+ pass
\ No newline at end of file
diff --git a/opensora/dataset/allinpaint_data.py b/opensora/dataset/allinpaint_data.py
new file mode 100644
index 000000000..66815f899
--- /dev/null
+++ b/opensora/dataset/allinpaint_data.py
@@ -0,0 +1,611 @@
+import random
+import torch
+from opensora.dataset.transform import ToTensorVideo
+from opensora.models.causalvideovae import ae_norm
+from torchvision import transforms
+import os
+import av
+from PIL import Image
+import numpy as np
+import cv2
+from enum import Enum, auto
+from ultralytics import YOLO
+import argparse
+import json
+
+# torch_npu is only available on Ascend devices; fall back to CPU/GPU elsewhere.
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+except ImportError:
+    torch_npu = None
+
+
+os.environ['YOLO_VERBOSE'] = 'False'
+
+class MaskType(Enum):
+ Semantic_mask = auto()
+ bbox_mask = auto()
+ background_mask = auto()
+ fixed_mask = auto()
+ Semantic_expansion_mask = auto()
+ fixed_bg_mask = auto()
+ t2iv_mask = auto()
+ i2v_mask = auto()
+ transition_mask = auto()
+ v2v_mask = auto()
+ clear_mask = auto()
+ random_mask = auto()
+
+
+
+
+class single_info:
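+    """Accumulate per-frame tracking info (box, confidence, mask) for one tracked object id."""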
+ def __init__(self, id, label, shape) -> None:
+ self.id = id
+ self.label = label
+ self.shape = shape
+ self.frame_indexes = []
+ self.infos = []
+ def update(self,frame_index,box,conf,mask):
+ self.frame_indexes.append(frame_index)
+ info = dict(
+ box=box,
+ conf=conf,
+ mask=mask,
+ )
+ self.infos.append(info)
+ def return_dict(self,):
+ return dict(
+ id=self.id,
+ label=self.label,
+ frame_size=self.shape,
+ frame_index_list = self.frame_indexes,
+ infos_list = self.infos
+ )
+
+def save_videos_from_pil(pil_images, path, fps=24):
+ """
+ pil_images: list[Image,...]
+ """
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+
+    image = ndarray_to_pil(pil_images[0])
+ width, height = image.size
+
+
+ codec = "libx264"
+ container = av.open(path, "w")
+ stream = container.add_stream(codec, rate=fps)
+
+ stream.width = width
+ stream.height = height
+
+ for pil_image in pil_images:
+ # pil_image = Image.fromarray(image_arr).convert("RGB")
+ pil_image = ndarray_to_pil(pil_image)
+ av_frame = av.VideoFrame.from_image(pil_image)
+ container.mux(stream.encode(av_frame))
+ container.mux(stream.encode())
+ container.close()
+
+def read_frames(video_tensor) -> list:
+    """
+    Read a (T, C, H, W) video tensor and return a list of (H, W, C) ndarrays, one per frame.
+    """
+ # container = av.open(video_path)
+ T = video_tensor.shape[0]
+ frames = []
+ for t in range(T):
+ frame_tensor = video_tensor[t]
+ frame_tensor = frame_tensor.cpu().numpy()
+ frame_tensor = np.transpose(frame_tensor, (1, 2, 0))
+ frames.append(frame_tensor)
+ return frames
+
+
+def get_masked_image(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
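+    # Zero out the pixels where the mask is set; accepts a 2-D or 3-D mask.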
+ mask = mask.astype(bool)
+ if len(mask.shape) == 2:
+ mask = np.expand_dims(mask, axis=2)
+ masked_img = image * (1-mask)
+ return masked_img # shape: [H,W,C]; range: [0, 255]
+
+def get_bbox_image(image: np.ndarray,bbox,obj_id):
+ # cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
+ bbox_image = image.copy()
+ bbox_image[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 0
+ # cv2.putText(image, f'ID: {obj_id}', (int(bbox[0]), int(bbox[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 0), 2)
+ return bbox_image
+
+
+
+def select_bg_from_video(bg_masks, video):
+ new_container = []
+ for index, frame in enumerate(video):
+
+ mask = bg_masks[index]
+ masked_frame = get_masked_image(frame, mask)
+ new_container.append(masked_frame)
+ return new_container
+
+def get_random_box(image_tensor):
+
+    H, W, C = image_tensor.shape
+
+    # random.randint requires integer bounds, so use integer division
+    box_min_size = min(H, W) // 2
+    box_max_size = max(H, W) // 2
+
+    # Randomly pick the box width and height
+    box_width = random.randint(box_min_size, min(box_max_size, W))
+    box_height = random.randint(box_min_size, min(box_max_size, H))
+
+    # Randomly pick the top-left corner of the box
+    x_start = random.randint(0, W - box_width)
+    y_start = random.randint(0, H - box_height)
+
+    box = (x_start, y_start, x_start + box_width, y_start + box_height)
+
+ return box
+
+def combine_masks_and_get_background(masks):
+    """
+    Union all instance masks, then invert to obtain the background mask.
+    """
+ combined_mask = np.any(masks, axis=0)
+ background_mask = np.logical_not(combined_mask)
+ return background_mask
+
+def parser_results_for_ids(results, frame_size=None):
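+    # Regroup per-frame YOLO tracking results by object id; also collect a
+    # per-frame background mask (union of all instance masks, inverted).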
+ id_record = []
+ single_info_ins = {}
+ background_masks = []
+ for frame_index, result in enumerate(results):
+ result = result[0]
+ if frame_index == 0 and frame_size is None:
+ frame_size = result.boxes.orig_shape
+ id = result.boxes.id
+
+        # No objects detected in this frame: treat the whole frame as background
+        if id is None:
+            background_masks.append(np.ones(frame_size) * 255)
+ continue
+
+        id = id.tolist()
+        cls = result.boxes.cls.tolist()  # label for each id
+        conf = result.boxes.conf.tolist()  # prediction confidence for each id
+        box_n = result.boxes.xyxy.tolist()  # bounding box for each id
+        mask = result.masks.data.cpu().detach().numpy()  # segmentation mask for each id
+ background_masks.append(combine_masks_and_get_background(mask))
+
+ for i, iden in enumerate(id):
+ if iden not in id_record:
+ id_record.append(iden)
+ single_info_ins[iden] = single_info(iden, cls[i], frame_size)
+ single_info_ins[iden].update(frame_index, box_n[i], conf[i],mask[i])
+ return_list = []
+ for _, value in single_info_ins.items():
+ return_list.append(value.return_dict())
+ return return_list, background_masks
+
+
+def get_mask(frames, mask_type, yolo_model):
+    """
+    Run YOLO tracking over the frames and build one mask per frame according
+    to mask_type (semantic / bbox / background / fixed variants).
+    """
+    video = frames
+
+    # video_tensor_batch = video_tensor.unsqueeze(1)
+    # T,C,H,W = video_tensor.shape
+
+    # video = video_tensor
+
+    tracker = yolo_model
+
+ results = []
+
+
+ for frame in frames:
+ # frame_tensor = video_tensor[t] # 获取当前帧, (C, H, W)
+ # frame_tensor = frame_tensor.data.cpu().numpy() # 转为numpy
+ # frame_tensor = np.transpose(frame_tensor, (1, 2, 0))
+
+ # 进行推理
+ result = tracker.track(frame,save=False, retina_masks=True, agnostic_nms=True,half=True,verbose=False,nms=False)
+
+ # 保存结果
+ results.append(result)
+
+
+ parser_res, background_masks = parser_results_for_ids(results)
+
+ select_index = -1
+ object_info = []
+ frame_indexes = []
+ infos = []
+
+
+    # Randomly pick one tracked object
+    if len(parser_res) != 0:
+ select_index = random.randint(0, len(parser_res)-1)
+ object_info = parser_res[select_index]
+ frame_indexes = object_info['frame_index_list']
+ infos = object_info['infos_list']
+ # print("infos size",len(infos))
+ # print("frame_indexed",len(frame_indexes))
+ else:
+ mask_type = MaskType.fixed_mask
+
+
+
+ # print("frame_indexed:",frame_indexes)
+ # print("infos:",infos)
+
+ # mask_type = MaskType.Semantic_mask
+
+ if mask_type == MaskType.Semantic_mask or mask_type == MaskType.Semantic_expansion_mask:
+ Semantic_masks = []
+ mask_container = []
+ info_index = 0
+ for index, frame in enumerate(video):
+ if index in frame_indexes:
+
+ mask = infos[info_index]['mask']
+ info_index = info_index + 1
+
+                if mask_type == MaskType.Semantic_expansion_mask:
+                    kernel = np.ones((5, 5), np.uint8)
+                    # Dilate the mask to expand the masked region
+                    mask = cv2.dilate(mask, kernel, iterations=1)
+
+                # Count the foreground pixels in the mask
+                foreground_pixels = np.sum(mask)
+
+                # Total number of pixels in the frame
+                total_pixels = mask.size  # equivalently image.shape[0] * image.shape[1]
+
+                # Fraction of the frame covered by the mask
+                ratio = foreground_pixels / total_pixels
+
+ if ratio < 0.2:
+ if random.random() < 0.5:
+ mask_type = MaskType.fixed_mask
+ break
+
+ # masked_frame = get_masked_image(frame, mask)
+ # mask_container.append(masked_frame)
+ Semantic_masks.append(mask)
+ else:
+ mask_container.append(np.zeros_like(frame))
+ Semantic_masks.append(np.zeros_like(frame)[:,:,0])
+ if mask_type == MaskType.Semantic_mask or mask_type == MaskType.Semantic_expansion_mask:
+ return Semantic_masks
+
+ if mask_type == MaskType.bbox_mask:
+ boxes_masks = []
+ box_container = []
+
+ info_index = 0
+
+ for index, frame in enumerate(video):
+ if index in frame_indexes:
+ bbox = infos[info_index]['box']
+ info_index = info_index + 1
+
+ # boxed_frame = get_bbox_image(frame, bbox, object_info['id'])
+ # box_container.append(boxed_frame)
+ boxmask = np.zeros_like(frame)[:,:,0]
+ boxmask[int(bbox[1]): int(bbox[3]), int(bbox[0]): int(bbox[2])] = 1
+ boxes_masks.append(boxmask)
+ else:
+ # box_container.append(frame)
+ boxes_masks.append(np.zeros_like(frame)[:,:,0])
+
+ return boxes_masks
+
+ if mask_type == MaskType.background_mask:
+ # bg_container = select_bg_from_video(background_masks, video)
+ return background_masks
+
+ if mask_type == MaskType.fixed_mask or mask_type == MaskType.fixed_bg_mask:
+ fixed_mask_container = []
+ fixed_masks = []
+
+ box = get_random_box(video[0])
+ for index , frame in enumerate(video):
+ if mask_type == MaskType.fixed_mask:
+ # boxed_frame = frame.copy()
+ # boxed_frame[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 0
+ # fixed_mask_container.append(boxed_frame)
+
+ fixed_mask = np.zeros_like(frame)[:,:,0]
+ fixed_mask[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 1
+ fixed_masks.append(fixed_mask)
+ if mask_type == MaskType.fixed_bg_mask:
+ boxed_frame = frame.copy()
+
+ fixed_mask = np.zeros_like(frame)[:,:,0]
+ fixed_mask[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 1
+ fixed_mask = 1 - fixed_mask
+ fixed_masks.append(fixed_mask)
+
+ # boxed_bg_frame = get_masked_image(boxed_frame, fixed_mask)
+ # fixed_mask_container.append(boxed_bg_frame)
+
+ return fixed_masks
+
+
+
+def video_to_tensor(video_path):
+    # Open the video file
+ cap = cv2.VideoCapture(video_path)
+
+ frames = []
+
+ while True:
+ ret, frame = cap.read()
+ if not ret:
+ break
+
+        # Convert BGR to RGB
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        # Convert to a tensor and append to the frame list
+ frame_tensor = torch.from_numpy(frame).permute(2, 0, 1) # (H, W, C) -> (C, H, W)
+ frames.append(frame_tensor)
+
+ cap.release()
+
+    # Stack all frames into a single 4-D tensor
+ video_tensor = torch.stack(frames) # (T, C, H, W)
+
+ return video_tensor
+
+
+def ndarray_to_pil(image: np.ndarray) -> Image.Image:
+ if np.max(image) <= 1.1:
+ image = image * 255
+ image = image.astype(np.uint8)
+ return Image.fromarray(image)
+
+
+
+def get_mask_tensor(video_tensor,mask_type,yolomodel):
+
+ # return video_tensor,video_tensor
+
+ masks_container = get_mask(video_tensor,mask_type,yolomodel)
+
+ # masked_frames = [torch.from_numpy(frame.transpose(2,0,1)) for frame in masked_video_container]
+ # masked_video = torch.stack(masked_frames)
+
+ masks = [torch.from_numpy(mask.reshape(1,mask.shape[0],mask.shape[1])) for mask in masks_container]
+ # import ipdb; ipdb.set_trace()
+ mask = torch.stack(masks)
+
+ return mask
+
+
+def video_to_frames(video_path):
+    # Open the video file
+ cap = cv2.VideoCapture(video_path)
+
+    # List to hold all frames
+ frames = []
+
+    # Read the video frame by frame
+ while True:
+ ret, frame = cap.read()
+ if not ret:
+ break
+ frames.append(frame)
+
+    # Release the video capture resource
+ cap.release()
+
+ return frames
+
+
+class MaskProcessor:
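+    """
+    Sample an inpainting mask for a clip: either a frame-level mask (t2iv, i2v,
+    transition, v2v, clear, random) or a YOLO-derived spatial mask (semantic,
+    bbox, background, fixed variants), with probabilities taken from args.
+    """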
+ def __init__(self,args,YOLOmodel):
+ # ratio
+ # transform
+ self.num_frames = args.num_frames
+ if self.num_frames != 1:
+ # inpaint
+ self.t2v_ratio = args.t2v_ratio
+ self.i2v_ratio = args.i2v_ratio
+ self.transition_ratio = args.transition_ratio
+ self.v2v_ratio = args.v2v_ratio
+ self.clear_video_ratio = args.clear_video_ratio
+ self.Semantic_ratio = args.Semantic_ratio
+ self.bbox_ratio = args.bbox_ratio
+ self.background_ratio = args.background_ratio
+ self.fixed_ratio = args.fixed_ratio
+ self.Semantic_expansion_ratio = args.Semantic_expansion_ratio
+ self.fixed_bg_ratio = args.fixed_bg_ratio
+            assert self.t2v_ratio + self.i2v_ratio + self.transition_ratio + self.v2v_ratio + self.clear_video_ratio + self.Semantic_ratio + self.bbox_ratio + self.background_ratio + self.fixed_ratio + self.fixed_bg_ratio + self.Semantic_expansion_ratio < 1, 'The sum of all mask-type ratios should be less than 1.'
+
+ self.min_clear_ratio = 0.0 if args.min_clear_ratio is None else args.min_clear_ratio
+ assert self.min_clear_ratio >= 0 and self.min_clear_ratio <= 1, 'min_clear_ratio should be in the range of [0, 1].'
+
+
+ # self.transform = transforms.Compose([
+ # ToTensorVideo(),
+ # ae_norm[args.ae]
+ # ])
+
+ self.init_mask_func()
+ self.init_ratio()
+
+ self.default_text_ratio = args.default_text_ratio
+
+ self.yolomodel = YOLOmodel
+
+ def init_mask_func(self):
+ # mask: ones_like (t 1 h w)
+ def t2iv(mask):
+ mask[:] = 1
+ return mask
+
+ def i2v(mask):
+ mask[0] = 0
+ return mask
+
+ def transition(mask):
+ mask[0] = 0
+ mask[-1] = 0
+ return mask
+
+ def v2v(mask):
+ end_idx = random.randint(int(mask.shape[0] * self.min_clear_ratio), mask.shape[0])
+ mask[:end_idx] = 0
+ return mask
+
+ def clear(mask):
+ mask[:] = 0
+ return mask
+
+ def random_mask(mask):
+ num_to_select = random.randint(int(mask.shape[0] * self.min_clear_ratio), mask.shape[0])
+ selected_indices = random.sample(range(mask.shape[0]), num_to_select)
+ mask[selected_indices] = 0
+ return mask
+
+ def Semantic_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.Semantic_mask,self.yolomodel)
+
+ def bbox_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.bbox_mask,self.yolomodel)
+
+ def background_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.background_mask,self.yolomodel)
+
+ def fixed_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.fixed_mask,self.yolomodel)
+
+ def Semantic_expansion_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.Semantic_expansion_mask,self.yolomodel)
+
+ def fixed_bg_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+
+
+
+ self.mask_functions = {
+ MaskType.t2iv_mask: t2iv,
+ MaskType.i2v_mask: i2v,
+ MaskType.transition_mask: transition,
+ MaskType.v2v_mask: v2v,
+ MaskType.clear_mask: clear,
+ MaskType.random_mask: random_mask,
+ MaskType.Semantic_mask:Semantic_mask,
+ MaskType.bbox_mask:bbox_mask,
+ MaskType.background_mask:background_mask,
+ MaskType.fixed_mask:fixed_mask,
+ MaskType.Semantic_expansion_mask:Semantic_expansion_mask,
+ MaskType.fixed_bg_mask:fixed_bg_mask
+ }
+
+ def init_ratio(self):
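+        # Sampling weights per mask type; random_mask takes the remaining probability mass.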
+
+ self.mask_func_weights_video = {
+ MaskType.t2iv_mask: self.t2v_ratio,
+ MaskType.i2v_mask: self.i2v_ratio,
+ MaskType.transition_mask: self.transition_ratio,
+ MaskType.v2v_mask: self.v2v_ratio,
+ MaskType.clear_mask: self.clear_video_ratio,
+ MaskType.Semantic_mask:self.Semantic_ratio,
+ MaskType.bbox_mask:self.bbox_ratio,
+ MaskType.background_mask:self.background_ratio,
+ MaskType.fixed_mask:self.fixed_ratio,
+ MaskType.Semantic_expansion_mask:self.Semantic_expansion_ratio,
+ MaskType.fixed_bg_mask:self.fixed_bg_ratio,
+ MaskType.random_mask: 1 - self.t2v_ratio - self.i2v_ratio - self.transition_ratio - self.v2v_ratio - self.clear_video_ratio - self.Semantic_ratio - self.bbox_ratio - self.background_ratio - self.fixed_ratio - self.Semantic_expansion_ratio - self.fixed_bg_ratio
+
+ }
+
+ self.mask_func_weights_image = {
+ 't2iv': 0.9,
+ 'clear': 0.1
+ }
+
+    # Expects a sequence of (H, W, C) frames (e.g. the list produced by video_to_frames)
+    def __call__(self, pixel_values):
+        # 1 means masked, 0 means not masked
+        t = len(pixel_values)
+        h, w, c = pixel_values[0].shape
+        mask = torch.ones([t, h, w, 1])
+
+ mask_func_name = random.choices(list(self.mask_func_weights_video.keys()), list(self.mask_func_weights_video.values()))[0]
+ frame_mask_list = [MaskType.t2iv_mask,MaskType.i2v_mask,MaskType.transition_mask,MaskType.v2v_mask,MaskType.clear_mask,MaskType.random_mask]
+ pos_mask_list = [MaskType.Semantic_mask,MaskType.bbox_mask,MaskType.background_mask,MaskType.fixed_mask,MaskType.Semantic_expansion_mask,MaskType.fixed_bg_mask]
+
+
+        if mask_func_name in frame_mask_list:
+            mask = self.mask_functions[mask_func_name](mask)
+            # masked_pixel_values = pixel_values * (mask < 0.5)
+
+ if mask_func_name in pos_mask_list:
+ mask = self.mask_functions[mask_func_name](pixel_values)
+ # save_video(masked_pixel_values.permute(0, 2, 3, 1).cpu().numpy(), 'masked_video.mp4')
+
+ # import ipdb; ipdb.set_trace()
+
+ # pixel_values = self.transform(pixel_values)
+ # masked_pixel_values = self.transform(masked_pixel_values.to(torch.uint8))
+
+
+
+ return mask
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--num_frames", type=int, default=65)
+ parser.add_argument("--t2v_ratio", type=float, default=0.0) # for inpainting mode
+ parser.add_argument("--i2v_ratio", type=float, default=0.0) # for inpainting mode
+ parser.add_argument("--transition_ratio", type=float, default=0.05) # for inpainting mode
+ parser.add_argument("--v2v_ratio", type=float, default=0.05) # for inpainting mode
+ parser.add_argument("--clear_video_ratio", type=float, default=0.0) # for inpainting mode
+ parser.add_argument("--Semantic_ratio", type=float, default=0.2) # for inpainting mode
+ parser.add_argument("--bbox_ratio", type=float, default=0.2) # for inpainting mode
+ parser.add_argument("--background_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--fixed_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--Semantic_expansion_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--fixed_bg_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--min_clear_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--default_text_ratio", type=float, default=0.5) # for inpainting mode
+ parser.add_argument("--pretrained_transformer_model_path", type=str, default=None)
+ parser.add_argument("--yolomodel_pathorname",type=str,default="/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt")
+ parser.add_argument("--json_path",type=str,default="/home/image_data/hxy/Open-Sora-Plan/opensora/test/sucai_test.json")
+ parser.add_argument("--output_dir", type=str, default=None)
+
+
+ args = parser.parse_args()
+
+ YOLOModel = YOLO(args.yolomodel_pathorname)
+
+ mask_generator = MaskProcessor(args,YOLOModel)
+ with open(args.json_path, 'r', encoding='utf-8') as f:
+ data_list = json.load(f)
+
+ for item in data_list:
+ frames = video_to_frames(item["path"])
+ mask = mask_generator(frames)
+
+ bool_mask = np.array(mask, dtype=bool)
+ packed_arr = np.packbits(bool_mask)
+
+        filename = os.path.splitext(os.path.basename(item["path"]))[0]
+
+        output_path = os.path.join(args.output_dir, filename + ".npy")
+
+ np.save(output_path, packed_arr)
+
+ item["mask_path"] = output_path
+
+
+ update_json_path = args.json_path.replace(".json","_update.json")
+ with open(update_json_path, 'w', encoding='utf-8') as f:
+ json.dump(data_list, f, ensure_ascii=False, indent=4)
+
+
diff --git a/opensora/dataset/feature_datasets.py b/opensora/dataset/feature_datasets.py
deleted file mode 100644
index 74362528a..000000000
--- a/opensora/dataset/feature_datasets.py
+++ /dev/null
@@ -1,213 +0,0 @@
-import json
-import os
-import torch
-import random
-import torch.utils.data as data
-
-import numpy as np
-from glob import glob
-from PIL import Image
-from torch.utils.data import Dataset
-from tqdm import tqdm
-
-from opensora.dataset.transform import center_crop, RandomCropVideo
-from opensora.utils.dataset_utils import DecordInit
-
-
-class T2V_Feature_dataset(Dataset):
- def __init__(self, args, temporal_sample):
-
- self.video_folder = args.video_folder
- self.num_frames = args.video_length
- self.temporal_sample = temporal_sample
-
- print('Building dataset...')
- if os.path.exists('samples_430k.json'):
- with open('samples_430k.json', 'r') as f:
- self.samples = json.load(f)
- else:
- self.samples = self._make_dataset()
- with open('samples_430k.json', 'w') as f:
- json.dump(self.samples, f, indent=2)
-
- self.use_image_num = args.use_image_num
- self.use_img_from_vid = args.use_img_from_vid
- if self.use_image_num != 0 and not self.use_img_from_vid:
- self.img_cap_list = self.get_img_cap_list()
-
- def _make_dataset(self):
- all_mp4 = list(glob(os.path.join(self.video_folder, '**', '*.mp4'), recursive=True))
- # all_mp4 = all_mp4[:1000]
- samples = []
- for i in tqdm(all_mp4):
- video_id = os.path.basename(i).split('.')[0]
- ae = os.path.split(i)[0].replace('data_split_tt', 'lb_causalvideovae444_feature')
- ae = os.path.join(ae, f'{video_id}_causalvideovae444.npy')
- if not os.path.exists(ae):
- continue
-
- t5 = os.path.split(i)[0].replace('data_split_tt', 'lb_t5_feature')
- cond_list = []
- cond_llava = os.path.join(t5, f'{video_id}_t5_llava_fea.npy')
- mask_llava = os.path.join(t5, f'{video_id}_t5_llava_mask.npy')
- if os.path.exists(cond_llava) and os.path.exists(mask_llava):
- llava = dict(cond=cond_llava, mask=mask_llava)
- cond_list.append(llava)
- cond_sharegpt4v = os.path.join(t5, f'{video_id}_t5_sharegpt4v_fea.npy')
- mask_sharegpt4v = os.path.join(t5, f'{video_id}_t5_sharegpt4v_mask.npy')
- if os.path.exists(cond_sharegpt4v) and os.path.exists(mask_sharegpt4v):
- sharegpt4v = dict(cond=cond_sharegpt4v, mask=mask_sharegpt4v)
- cond_list.append(sharegpt4v)
- if len(cond_list) > 0:
- sample = dict(ae=ae, t5=cond_list)
- samples.append(sample)
- return samples
-
- def __len__(self):
- return len(self.samples)
-
- def __getitem__(self, idx):
- # try:
- sample = self.samples[idx]
- ae, t5 = sample['ae'], sample['t5']
- t5 = random.choice(t5)
- video_origin = np.load(ae)[0] # C T H W
- _, total_frames, _, _ = video_origin.shape
- # Sampling video frames
- start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
- assert end_frame_ind - start_frame_ind >= self.num_frames
- select_video_idx = np.linspace(start_frame_ind, end_frame_ind - 1, num=self.num_frames, dtype=int) # start, stop, num=50
- # print('select_video_idx', total_frames, select_video_idx)
- video = video_origin[:, select_video_idx] # C num_frames H W
- video = torch.from_numpy(video)
-
- cond = torch.from_numpy(np.load(t5['cond']))[0] # L
- cond_mask = torch.from_numpy(np.load(t5['mask']))[0] # L D
-
- if self.use_image_num != 0 and self.use_img_from_vid:
- select_image_idx = np.random.randint(0, total_frames, self.use_image_num)
- # print('select_image_idx', total_frames, self.use_image_num, select_image_idx)
- images = video_origin[:, select_image_idx] # c, num_img, h, w
- images = torch.from_numpy(images)
- video = torch.cat([video, images], dim=1) # c, num_frame+num_img, h, w
- cond = torch.stack([cond] * (1+self.use_image_num)) # 1+self.use_image_num, l
- cond_mask = torch.stack([cond_mask] * (1+self.use_image_num)) # 1+self.use_image_num, l
- elif self.use_image_num != 0 and not self.use_img_from_vid:
- images, captions = self.img_cap_list[idx]
- raise NotImplementedError
- else:
- pass
-
- return video, cond, cond_mask
- # except Exception as e:
- # print(f'Error with {e}, {sample}')
- # return self.__getitem__(random.randint(0, self.__len__() - 1))
-
- def get_img_cap_list(self):
- raise NotImplementedError
-
-
-
-
-class T2V_T5_Feature_dataset(Dataset):
- def __init__(self, args, transform, temporal_sample):
-
- self.video_folder = args.video_folder
- self.num_frames = args.num_frames
- self.transform = transform
- self.temporal_sample = temporal_sample
- self.v_decoder = DecordInit()
-
- print('Building dataset...')
- if os.path.exists('samples_430k.json'):
- with open('samples_430k.json', 'r') as f:
- self.samples = json.load(f)
- self.samples = [dict(ae=i['ae'].replace('lb_causalvideovae444_feature', 'data_split_1024').replace('_causalvideovae444.npy', '.mp4'), t5=i['t5']) for i in self.samples]
- else:
- self.samples = self._make_dataset()
- with open('samples_430k.json', 'w') as f:
- json.dump(self.samples, f, indent=2)
-
- self.use_image_num = args.use_image_num
- self.use_img_from_vid = args.use_img_from_vid
- if self.use_image_num != 0 and not self.use_img_from_vid:
- self.img_cap_list = self.get_img_cap_list()
-
- def _make_dataset(self):
- all_mp4 = list(glob(os.path.join(self.video_folder, '**', '*.mp4'), recursive=True))
- # all_mp4 = all_mp4[:1000]
- samples = []
- for i in tqdm(all_mp4):
- video_id = os.path.basename(i).split('.')[0]
- # ae = os.path.split(i)[0].replace('data_split', 'lb_causalvideovae444_feature')
- # ae = os.path.join(ae, f'{video_id}_causalvideovae444.npy')
- ae = i
- if not os.path.exists(ae):
- continue
-
- t5 = os.path.split(i)[0].replace('data_split_1024', 'lb_t5_feature')
- cond_list = []
- cond_llava = os.path.join(t5, f'{video_id}_t5_llava_fea.npy')
- mask_llava = os.path.join(t5, f'{video_id}_t5_llava_mask.npy')
- if os.path.exists(cond_llava) and os.path.exists(mask_llava):
- llava = dict(cond=cond_llava, mask=mask_llava)
- cond_list.append(llava)
- cond_sharegpt4v = os.path.join(t5, f'{video_id}_t5_sharegpt4v_fea.npy')
- mask_sharegpt4v = os.path.join(t5, f'{video_id}_t5_sharegpt4v_mask.npy')
- if os.path.exists(cond_sharegpt4v) and os.path.exists(mask_sharegpt4v):
- sharegpt4v = dict(cond=cond_sharegpt4v, mask=mask_sharegpt4v)
- cond_list.append(sharegpt4v)
- if len(cond_list) > 0:
- sample = dict(ae=ae, t5=cond_list)
- samples.append(sample)
- return samples
-
- def __len__(self):
- return len(self.samples)
-
- def __getitem__(self, idx):
- try:
- sample = self.samples[idx]
- ae, t5 = sample['ae'], sample['t5']
- t5 = random.choice(t5)
-
- video = self.decord_read(ae)
- video = self.transform(video) # T C H W -> T C H W
- video = video.transpose(0, 1) # T C H W -> C T H W
- total_frames = video.shape[1]
- cond = torch.from_numpy(np.load(t5['cond']))[0] # L
- cond_mask = torch.from_numpy(np.load(t5['mask']))[0] # L D
-
- if self.use_image_num != 0 and self.use_img_from_vid:
- select_image_idx = np.random.randint(0, total_frames, self.use_image_num)
- # print('select_image_idx', total_frames, self.use_image_num, select_image_idx)
- images = video.numpy()[:, select_image_idx] # c, num_img, h, w
- images = torch.from_numpy(images)
- video = torch.cat([video, images], dim=1) # c, num_frame+num_img, h, w
- cond = torch.stack([cond] * (1+self.use_image_num)) # 1+self.use_image_num, l
- cond_mask = torch.stack([cond_mask] * (1+self.use_image_num)) # 1+self.use_image_num, l
- elif self.use_image_num != 0 and not self.use_img_from_vid:
- images, captions = self.img_cap_list[idx]
- raise NotImplementedError
- else:
- pass
-
- return video, cond, cond_mask
- except Exception as e:
- print(f'Error with {e}, {sample}')
- return self.__getitem__(random.randint(0, self.__len__() - 1))
-
- def decord_read(self, path):
- decord_vr = self.v_decoder(path)
- total_frames = len(decord_vr)
- # Sampling video frames
- start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
- # assert end_frame_ind - start_frame_ind >= self.num_frames
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.num_frames, dtype=int)
- video_data = decord_vr.get_batch(frame_indice).asnumpy()
- video_data = torch.from_numpy(video_data)
- video_data = video_data.permute(0, 3, 1, 2) # (T, H, W, C) -> (T C H W)
- return video_data
-
- def get_img_cap_list(self):
- raise NotImplementedError
\ No newline at end of file
diff --git a/opensora/dataset/inpaint_datasets.py b/opensora/dataset/inpaint_datasets.py
new file mode 100644
index 000000000..f0eedf85e
--- /dev/null
+++ b/opensora/dataset/inpaint_datasets.py
@@ -0,0 +1,394 @@
+
+from torch.utils.data import Dataset
+
+try:
+    import torch_npu
+    from opensora.npu_config import npu_config
+except ImportError:
+    torch_npu = None
+ npu_config = None
+import glob
+import json
+import os, io, csv, math, random
+import numpy as np
+import torchvision
+from einops import rearrange
+from decord import VideoReader
+from os.path import join as opj
+from collections import Counter
+
+import torch
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader, get_worker_info
+from tqdm import tqdm
+from PIL import Image
+from accelerate.logging import get_logger
+
+from opensora.utils.dataset_utils import DecordInit
+from opensora.utils.utils import text_preprocessing
+from opensora.dataset.transform import add_masking_notice, motion_mapping_fun, add_webvid_watermark_notice, \
+ clean_vidal, add_high_aesthetic_notice_image, add_aesthetic_notice_video, add_high_aesthetic_notice_image_human
+
+from opensora.dataset.t2v_datasets import SingletonMeta, DataSetProg
+from opensora.dataset.t2v_datasets import T2V_dataset
+
+import imageio
+
+from opensora.dataset.inpaint_utils import get_mask_tensor,MaskType
+from ultralytics import YOLO
+
+logger = get_logger(__name__)
+
+dataset_prog = DataSetProg()
+
+def save_video(video, name='video.mp4'):
+ imageio.mimwrite(
+ name, video, fps=24, quality=6) # highest quality is 10, lowest is 0
+
+class Meta_dataset(T2V_dataset):
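+    """T2V_dataset variant shared by the inpainting datasets; adds caption dropping for classifier-free guidance."""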
+ def __init__(self, args, transform, temporal_sample, tokenizer):
+ super().__init__(args, transform, temporal_sample, tokenizer)
+
+ # if self.num_frames != 1:
+ # # inpaint
+ # self.t2v_ratio = args.t2v_ratio
+ # self.i2v_ratio = args.i2v_ratio
+ # self.transition_ratio = args.transition_ratio
+ # self.v2v_ratio = args.v2v_ratio
+ # self.clear_video_ratio = args.clear_video_ratio
+ # self.Semantic_ratio = args.Semantic_ratio
+ # self.bbox_ratio = args.bbox_ratio
+ # self.background_ratio = args.background_ratio
+ # self.fixed_ratio = args.fixed_ratio
+ # self.Semantic_expansion_ratio = args.Semantic_expansion_ratio
+ # self.fixed_bg_ratio = args.fixed_bg_ratio
+ # assert self.t2v_ratio + self.i2v_ratio + self.transition_ratio + self.v2v_ratio + self.clear_video_ratio + self.Semantic_ratio + self.bbox_ratio + self.background_ratio + self.fixed_ratio + self.fixed_bg_ratio + self.Semantic_expansion_ratio < 1, 'The sum of t2v_ratio, i2v_ratio, transition_ratio, v2v_ratio and clear video ratio should be less than 1.'
+
+ # self.min_clear_ratio = 0.0 if args.min_clear_ratio is None else args.min_clear_ratio
+ # assert self.min_clear_ratio >= 0 and self.min_clear_ratio <= 1, 'min_clear_ratio should be in the range of [0, 1].'
+
+ # self.mask_processor = mask_processor
+ # self.init_mask_func()
+
+ self.default_text_ratio = args.default_text_ratio
+
+ # self.yolomodel = modelYOLO
+ # # self.yolomodel = None
+
+ # def init_mask_func(self):
+ # # mask: ones_like (t 1 h w)
+ # def t2iv(mask):
+ # mask[:] = 1
+ # return mask
+
+ # def i2v(mask):
+ # mask[0] = 0
+ # return mask
+
+ # def transition(mask):
+ # mask[0] = 0
+ # mask[-1] = 0
+ # return mask
+
+ # def v2v(mask):
+ # end_idx = random.randint(int(mask.shape[0] * self.min_clear_ratio), mask.shape[0])
+ # mask[:end_idx] = 0
+ # return mask
+
+ # def clear(mask):
+ # mask[:] = 0
+ # return mask
+
+ # def random_mask(mask):
+ # num_to_select = random.randint(int(mask.shape[0] * self.min_clear_ratio), mask.shape[0])
+ # selected_indices = random.sample(range(mask.shape[0]), num_to_select)
+ # mask[selected_indices] = 0
+ # return mask
+
+ # def Semantic_mask(video_tensor):
+ # return get_mask_tensor(video_tensor,MaskType.Semantic_mask,self.yolomodel)
+
+ # def bbox_mask(video_tensor):
+ # return get_mask_tensor(video_tensor,MaskType.bbox_mask,self.yolomodel)
+
+ # def background_mask(video_tensor):
+ # return get_mask_tensor(video_tensor,MaskType.background_mask,self.yolomodel)
+
+ # def fixed_mask(video_tensor):
+ # return get_mask_tensor(video_tensor,MaskType.fixed_mask,self.yolomodel)
+
+ # def Semantic_expansion_mask(video_tensor):
+ # return get_mask_tensor(video_tensor,MaskType.Semantic_expansion_mask,self.yolomodel)
+
+ # def fixed_bg_mask(video_tensor):
+ # return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+
+
+
+ # self.mask_functions = {
+ # 't2iv': t2iv,
+ # 'i2v': i2v,
+ # 'transition': transition,
+ # 'v2v': v2v,
+ # 'clear': clear,
+ # 'random_mask': random_mask,
+ # 'Semantic_mask':Semantic_mask,
+ # 'bbox_mask':bbox_mask,
+ # 'background_mask':background_mask,
+ # 'fixed_mask':fixed_mask,
+ # 'Semantic_expansion_mask':Semantic_expansion_mask,
+ # 'fixed_bg_mask':fixed_bg_mask
+ # }
+
+
+ # def get_mask_masked_pixel_values(self, pixel_values, mask_func_weights):
+ # # pixel_values shape (T, C, H, W)
+ # # 1 means masked, 0 means not masked
+ # t, c, h, w = pixel_values.shape
+ # mask = torch.ones([t, 1, h, w], device=pixel_values.device, dtype=pixel_values.dtype)
+
+ # mask_func_name = random.choices(list(mask_func_weights.keys()), list(mask_func_weights.values()))[0]
+ # frame_mask_list = ['t2iv','i2v','transition','v2v','clear','random_mask']
+ # pos_mask_list = ['Semantic_mask','bbox_mask','background_mask','fixed_mask','Semantic_expansion_mask','fixed_bg_mask']
+
+ # if mask_func_name in frame_mask_list:
+ # mask = self.mask_functions[mask_func_name](mask)
+ # masked_pixel_values = pixel_values * (mask < 0.5)
+
+ # if mask_func_name in pos_mask_list:
+ # masked_pixel_values,mask = self.mask_functions[mask_func_name](pixel_values)
+ # # save_video(masked_pixel_values.permute(0, 2, 3, 1).cpu().numpy(), 'masked_video.mp4')
+ # return dict(mask=mask, masked_pixel_values=masked_pixel_values)
+
+ def drop(self, text, is_video=True):
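+        # With probability cfg, drop the caption for classifier-free guidance:
+        # with probability default_text_ratio use a generic default caption, otherwise an empty string.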
+ rand_num = random.random()
+ rand_num_text = random.random()
+
+ if rand_num < self.cfg:
+ if rand_num_text < self.default_text_ratio:
+ if not is_video:
+ text = "The image showcases a scene with coherent and clear visuals."
+ else:
+ text = "The video showcases a scene with coherent and clear visuals."
+ else:
+ text = ''
+
+ return dict(text=text)
+
+
+class Inpaint_dataset(Meta_dataset):
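+    """Video/image inpainting dataset; the on-the-fly mask generation below is currently commented out."""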
+
+ def __init__(self, args, transform, temporal_sample, tokenizer):
+ super().__init__(args, transform, temporal_sample, tokenizer)
+
+ # self.mask_func_weights_video = {
+ # 't2iv': self.t2v_ratio,
+ # 'i2v': self.i2v_ratio,
+ # 'transition': self.transition_ratio,
+ # 'v2v': self.v2v_ratio,
+ # 'clear': self.clear_video_ratio,
+ # 'Semantic_mask':self.Semantic_ratio,
+ # 'bbox_mask':self.bbox_ratio,
+ # 'background_mask':self.background_ratio,
+ # 'fixed_mask':self.fixed_ratio,
+ # 'Semantic_expansion_mask':self.Semantic_expansion_ratio,
+ # 'fixed_bg_mask':self.fixed_bg_ratio,
+ # 'random_mask': 1 - self.t2v_ratio - self.i2v_ratio - self.transition_ratio - self.v2v_ratio - self.clear_video_ratio - self.Semantic_ratio - self.bbox_ratio - self.background_ratio - self.fixed_ratio - self.Semantic_expansion_ratio - self.fixed_bg_ratio
+
+ # }
+
+ # self.mask_func_weights_image = {
+ # 't2iv': 0.9,
+ # 'clear': 0.1
+ # }
+
+ def get_video(self, idx):
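+        # Decode the pre-sampled frame indices, apply the spatial transform, then tokenize the caption.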
+ video_data = dataset_prog.cap_list[idx]
+ video_path = video_data['path']
+        assert os.path.exists(video_path), f"file {video_path} does not exist!"
+ frame_indice = dataset_prog.cap_list[idx]['sample_frame_index']
+ sample_h = video_data['resolution']['sample_height']
+ sample_w = video_data['resolution']['sample_width']
+ if self.video_reader == 'decord':
+ video = self.decord_read(video_path, predefine_frame_indice=frame_indice)
+ elif self.video_reader == 'opencv':
+ video = self.opencv_read(video_path, predefine_frame_indice=frame_indice)
+        else:
+            raise NotImplementedError(f'Found {self.video_reader}, but only decord and opencv are supported')
+
+ # inpaint_cond_data = self.get_mask_masked_pixel_values(video, self.mask_func_weights_video)
+ # mask, masked_video = inpaint_cond_data['mask'], inpaint_cond_data['masked_pixel_values']
+
+ video = self.transform(video) # T C H W -> T C H W
+ # masked_video = self.transform(masked_video) # T C H W -> T C H W
+ # mask = self.mask_processor(mask) # T 1 H W -> T 1 H W
+ assert video.shape[2] == sample_h and video.shape[3] == sample_w
+
+ # video = torch.cat([video, masked_video, mask], dim=1) # T 2C+1 H W
+ # video = torch.rand(221, 3, 480, 640)
+
+ # video = video.transpose(0, 1) # T C H W -> C T H W
+ text = video_data['cap']
+ if not isinstance(text, list):
+ text = [text]
+ text = [random.choice(text)]
+ if '/VIDAL-10M/' in video_path:
+ text = [clean_vidal(text[0])]
+ if '/Webvid-10M/' in video_path:
+ text = [add_webvid_watermark_notice(text[0])]
+ if not (video_data.get('aesthetic', None) is None):
+ text = [add_aesthetic_notice_video(text[0], video_data['aesthetic'])]
+
+ text = [text[0].replace(' image ', ' video ').replace(' image,', ' video,')]
+ text = text_preprocessing(text, support_Chinese=self.support_Chinese)
+ text = self.drop(text, is_video=True)['text']
+
+ text_tokens_and_mask = self.tokenizer(
+ text,
+ max_length=self.model_max_length,
+ padding='max_length',
+ truncation=True,
+ return_attention_mask=True,
+ add_special_tokens=True,
+ return_tensors='pt'
+ )
+ input_ids = text_tokens_and_mask['input_ids']
+ cond_mask = text_tokens_and_mask['attention_mask']
+ if self.use_motion:
+ motion_score = motion_mapping_fun(video_data['motion_score'])
+ return dict(pixel_values=video, input_ids=input_ids, cond_mask=cond_mask, motion_score=motion_score)
+ else:
+ return dict(pixel_values=video, input_ids=input_ids, cond_mask=cond_mask, motion_score=None)
+
+ def get_image(self, idx):
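+        # Load a single image as a one-frame clip and tokenize its caption.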
+ image_data = dataset_prog.cap_list[idx] # [{'path': path, 'cap': cap}, ...]
+ sample_h = image_data['resolution']['sample_height']
+ sample_w = image_data['resolution']['sample_width']
+
+ # import ipdb;ipdb.set_trace()
+ image = Image.open(image_data['path']).convert('RGB') # [h, w, c]
+ image = torch.from_numpy(np.array(image)) # [h, w, c]
+ image = rearrange(image, 'h w c -> c h w').unsqueeze(0) # [1 c h w]
+
+ # inpaint_cond_data = self.get_mask_masked_pixel_values(image, self.mask_func_weights_image)
+ # mask, masked_image = inpaint_cond_data['mask'], inpaint_cond_data['masked_pixel_values']
+
+ # import ipdb;ipdb.set_trace()
+ image = self.transform(image) # [1 C H W] -> [1 C H W]
+ # masked_image = self.transform(masked_image) # [1 C H W] -> [1 C H W]
+ # mask = self.mask_processor(mask) # [1 1 H W] -> [1 1 H W]
+        assert image.shape[2] == sample_h and image.shape[3] == sample_w
+
+ # image = torch.cat([image, masked_image, mask], dim=1) # [1 2C+1 H W]
+
+ # image = [torch.rand(1, 3, 480, 640) for i in image_data]
+ # image = image.transpose(0, 1) # [1 C H W] -> [C 1 H W]
+
+ caps = image_data['cap'] if isinstance(image_data['cap'], list) else [image_data['cap']]
+ caps = [random.choice(caps)]
+ if '/sam/' in image_data['path']:
+ caps = [add_masking_notice(caps[0])]
+ if 'ideogram' in image_data['path']:
+ caps = [add_high_aesthetic_notice_image(caps[0])]
+ if 'human_images' in image_data['path']:
+ caps = [add_high_aesthetic_notice_image_human(caps[0])]
+ text = text_preprocessing(caps, support_Chinese=self.support_Chinese)
+ input_ids, cond_mask = [], []
+ text = self.drop(text, is_video=False)['text']
+
+ text_tokens_and_mask = self.tokenizer(
+ text,
+ max_length=self.model_max_length,
+ padding='max_length',
+ truncation=True,
+ return_attention_mask=True,
+ add_special_tokens=True,
+ return_tensors='pt'
+ )
+ input_ids = text_tokens_and_mask['input_ids'] # 1, l
+ cond_mask = text_tokens_and_mask['attention_mask'] # 1, l
+ if self.use_motion:
+ motion_score = motion_mapping_fun(image_data['motion_score'])
+ return dict(pixel_values=image, input_ids=input_ids, cond_mask=cond_mask, motion_score=motion_score)
+ else:
+ return dict(pixel_values=image, input_ids=input_ids, cond_mask=cond_mask, motion_score=None)
+
+
+# class AllInpaintDataset(Inpaint_dataset):
+
+
+if __name__ == "__main__":
+ class Args:
+ t2v_ratio = 0.0
+ i2v_ratio = 0.0
+ transition_ratio = 0.0
+ v2v_ratio = 0.00
+ clear_video_ratio = 0.99
+ min_clear_ratio = 0.0
+ Semantic_ratio = 0.0
+ bbox_ratio = 0.0
+ background_ratio = 0.0
+ fixed_ratio = 0.0
+ Semantic_expansion_ratio = 0.0
+ fixed_bg_ratio = 0.0
+ default_text_ratio = 0.1
+ support_Chinese = False
+ model_max_length = 512
+ cfg = 0.1
+ num_frames = 93
+ force_resolution = False
+ max_height = 320
+ max_width = 320
+ hw_stride = 32
+ data = "/storage/gyy/hw/Open-Sora-Plan/scripts/train_data/merge_data_debug.txt"
+ train_fps = 16
+ use_image_num = 0
+ use_img_from_vid = False
+ speed_factor = 1.0
+ drop_short_ratio = 0.0
+ dataloader_num_workers = 4
+ use_motion = True
+ skip_low_resolution = True
+ text_encoder_name = 'google/mt5-xxl'
+
+
+ from transformers import AutoTokenizer
+
+ from opensora.dataset.transform import ToTensorVideo, TemporalRandomCrop, RandomHorizontalFlipVideo, CenterCropResizeVideo, LongSideResizeVideo, SpatialStrideCropVideo, NormalizeVideo, ToTensorAfterResize
+ from torchvision.transforms import Lambda
+
+ args = Args()
+
+ temporal_sample = TemporalRandomCrop(args.num_frames) # 16 x
+ norm_fun = Lambda(lambda x: 2. * x - 1.)
+ if args.force_resolution:
+ resize = [CenterCropResizeVideo((args.max_height, args.max_width)), ]
+ else:
+ resize = [
+ LongSideResizeVideo((args.max_height, args.max_width), skip_low_resolution=True),
+ SpatialStrideCropVideo(stride=args.hw_stride),
+ ]
+ transform = transforms.Compose([
+ ToTensorVideo(),
+ *resize,
+ norm_fun
+ ])
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl")
+
+    dataset = Inpaint_dataset(args, transform=transform, temporal_sample=temporal_sample, tokenizer=tokenizer)
+
+ data = next(iter(dataset))
+
+ # print(data['pixel_values'].shape)
+ # print(data['input_ids'].shape)
+ # print(data['cond_mask'].shape)
+ # print(data['motion_score'])
+
+ # print(data['pixel_values'])
+ # print(data['input_ids'])
+ # print(data['cond_mask'])
diff --git a/opensora/dataset/inpaint_utils.py b/opensora/dataset/inpaint_utils.py
new file mode 100644
index 000000000..a9bf895c2
--- /dev/null
+++ b/opensora/dataset/inpaint_utils.py
@@ -0,0 +1,408 @@
+from enum import Enum, auto
+import numpy as np
+from ultralytics import YOLO
+import os
+import av
+from PIL import Image
+import random
+import cv2
+import torch
+import torchvision.transforms as transforms
+# torch_npu is only available on Ascend devices; fall back to CPU/GPU elsewhere.
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+except ImportError:
+    torch_npu = None
+
+
+os.environ['YOLO_VERBOSE'] = 'False'
+
+class MaskType(Enum):
+ Semantic_mask = 1
+ bbox_mask = 2
+ background_mask = 3
+ fixed_mask = 4
+ Semantic_expansion_mask = 5
+ fixed_bg_mask = 6
+
+
+
+class single_info:
+ def __init__(self, id, label, shape) -> None:
+ self.id = id
+ self.label = label
+ self.shape = shape
+ self.frame_indexes = []
+ self.infos = []
+ def update(self,frame_index,box,conf,mask):
+ self.frame_indexes.append(frame_index)
+ info = dict(
+ box=box,
+ conf=conf,
+ mask=mask,
+ )
+ self.infos.append(info)
+ def return_dict(self,):
+ return dict(
+ id=self.id,
+ label=self.label,
+ frame_size=self.shape,
+ frame_index_list = self.frame_indexes,
+ infos_list = self.infos
+ )
+
+def save_videos_from_pil(pil_images, path, fps=24):
+    """
+    pil_images: list of frames (ndarray or PIL.Image); ndarrays are converted
+    via ndarray_to_pil before encoding.
+    """
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+
+    image = ndarray_to_pil(pil_images[0])
+    width, height = image.size
+
+    codec = "libx264"
+    container = av.open(path, "w")
+    stream = container.add_stream(codec, rate=fps)
+
+    stream.width = width
+    stream.height = height
+
+    for pil_image in pil_images:
+        pil_image = ndarray_to_pil(pil_image)
+        av_frame = av.VideoFrame.from_image(pil_image)
+        container.mux(stream.encode(av_frame))
+    container.mux(stream.encode())  # flush the encoder
+    container.close()
+
+def read_frames(video_tensor) -> list:
+    """
+    Read a (T, C, H, W) video tensor and return a list of (H, W, C) ndarrays.
+    """
+    T = video_tensor.shape[0]
+    frames = []
+    for t in range(T):
+        frame = video_tensor[t].cpu().numpy()
+        frame = np.transpose(frame, (1, 2, 0))
+        frames.append(frame)
+    return frames
+
+
+def get_masked_image(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
+ mask = mask.astype(bool)
+ if len(mask.shape) == 2:
+ mask = np.expand_dims(mask, axis=2)
+ masked_img = image * (1-mask)
+ return masked_img # shape: [H,W,C]; range: [0, 255]
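+
+# Illustrative example: regions where the mask is 1 are zeroed out of the frame:
+#   img = np.full((2, 2, 3), 255, dtype=np.uint8)
+#   m = np.array([[1, 0], [0, 0]])
+#   get_masked_image(img, m)[0, 0]  # -> array([0, 0, 0])
+#   get_masked_image(img, m)[0, 1]  # -> array([255, 255, 255])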
+
+def get_bbox_image(image: np.ndarray,bbox,obj_id):
+ # cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
+ bbox_image = image.copy()
+ bbox_image[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 0
+ # cv2.putText(image, f'ID: {obj_id}', (int(bbox[0]), int(bbox[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 0), 2)
+ return bbox_image
+
+
+
+def select_bg_from_video(bg_masks, video):
+ new_container = []
+ for index, frame in enumerate(video):
+
+ mask = bg_masks[index]
+ masked_frame = get_masked_image(frame, mask)
+ new_container.append(masked_frame)
+ return new_container
+
+def get_random_box(image_tensor, box_min_size, box_max_size):
+
+    H, W, C = image_tensor.shape
+
+    # randomly choose the box width and height
+    box_width = random.randint(box_min_size, min(box_max_size, W))
+    box_height = random.randint(box_min_size, min(box_max_size, H))
+
+    # randomly choose the top-left corner
+    x_start = random.randint(0, W - box_width)
+    y_start = random.randint(0, H - box_height)
+
+    box = (x_start, y_start, x_start + box_width, y_start + box_height)
+
+    return box
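+
+# Usage sketch (hypothetical frame): boxes are (x1, y1, x2, y2) and always fit
+# inside the frame:
+#   frame = np.zeros((320, 320, 3))
+#   x1, y1, x2, y2 = get_random_box(frame, box_min_size=50, box_max_size=100)
+#   assert 50 <= x2 - x1 <= 100 and 0 <= x1 and x2 <= 320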
+
+def combine_masks_and_get_background(masks):
+    """
+    Merge all instance masks and invert the union to get the background mask.
+    """
+    combined_mask = np.any(masks, axis=0)
+    background_mask = np.logical_not(combined_mask)
+    return background_mask
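+
+# Example (hypothetical shapes): per-object masks of shape (N, H, W) collapse
+# into one (H, W) boolean background mask:
+#   masks = np.array([[[1, 0]], [[0, 0]]])   # two objects, H=1, W=2
+#   combine_masks_and_get_background(masks)  # -> array([[False, True]])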
+
+def parser_results_for_ids(results, frame_size=None):
+ id_record = []
+ single_info_ins = {}
+ background_masks = []
+ for frame_index, result in enumerate(results):
+ result = result[0]
+ if frame_index == 0 and frame_size is None:
+ frame_size = result.boxes.orig_shape
+ id = result.boxes.id
+
+        # no objects detected in this frame
+        if id is None:
+            background_masks.append(np.ones(frame_size) * 255)
+            continue
+
+        id = id.tolist()
+        cls = result.boxes.cls.tolist()  # label for each id
+        conf = result.boxes.conf.tolist()  # prediction confidence for each id
+        box_n = result.boxes.xyxy.tolist()  # xyxy box for each id
+        mask = result.masks.data.cpu().detach().numpy()  # mask for each id
+        background_masks.append(combine_masks_and_get_background(mask))
+
+        for i, iden in enumerate(id):
+            if iden not in id_record:
+                id_record.append(iden)
+                single_info_ins[iden] = single_info(iden, cls[i], frame_size)
+            single_info_ins[iden].update(frame_index, box_n[i], conf[i], mask[i])
+ return_list = []
+ for _, value in single_info_ins.items():
+ return_list.append(value.return_dict())
+ return return_list, background_masks
+
+
+def get_mask(video_tensor, mask_type, yolo_model):
+
+    video = read_frames(video_tensor=video_tensor)
+
+    T, C, H, W = video_tensor.shape
+
+    tracker = yolo_model.to("cuda")
+
+    results = []
+
+    for t in range(T):
+        frame_tensor = video_tensor[t]  # current frame, (C, H, W)
+        frame_tensor = frame_tensor.data.cpu().numpy()  # convert to numpy
+        frame_tensor = np.transpose(frame_tensor, (1, 2, 0))  # (H, W, C)
+
+        # run tracking inference on this frame
+        result = tracker.track(frame_tensor, save=False, retina_masks=True, agnostic_nms=True, half=True, verbose=False, nms=False)
+
+        # store the per-frame result
+        results.append(result)
+
+ parser_res, background_masks = parser_results_for_ids(results)
+
+ select_index = -1
+ object_info = []
+ frame_indexes = []
+ infos = []
+
+
+    # randomly select one tracked object
+    if len(parser_res) != 0:
+        select_index = random.randint(0, len(parser_res) - 1)
+        object_info = parser_res[select_index]
+        frame_indexes = object_info['frame_index_list']
+        infos = object_info['infos_list']
+    else:
+        # nothing was tracked, fall back to a fixed rectangular mask
+        mask_type = MaskType.fixed_mask
+
+    # NOTE: hard-coded override: every call currently produces a fixed
+    # rectangular mask; restore the line below to sample mask types randomly.
+    # mask_type = get_random_type()
+    mask_type = MaskType.fixed_mask
+
+ if mask_type == MaskType.Semantic_mask or mask_type == MaskType.Semantic_expansion_mask:
+ Semantic_masks = []
+ mask_container = []
+ info_index = 0
+
+
+ for index, frame in enumerate(video):
+ if index in frame_indexes:
+ mask = infos[info_index]['mask']
+ info_index = info_index + 1
+
+                if mask_type == MaskType.Semantic_expansion_mask:
+                    kernel = np.ones((5, 5), np.uint8)
+                    # dilate the mask to expand it
+                    mask = cv2.dilate(mask, kernel, iterations=1)
+
+                # count the foreground pixels in the mask
+                foreground_pixels = np.sum(mask)
+
+                # total number of pixels in the mask
+                total_pixels = mask.size  # equivalently image.shape[0] * image.shape[1]
+
+                # foreground-to-total ratio
+                ratio = foreground_pixels / total_pixels
+
+                # small objects fall back to a fixed mask half the time
+                if ratio < 0.2:
+                    if random.random() < 0.5:
+                        mask_type = MaskType.fixed_mask
+                        break
+
+ masked_frame = get_masked_image(frame, mask)
+ mask_container.append(masked_frame)
+ Semantic_masks.append(mask)
+ else:
+ mask_container.append(np.zeros_like(frame))
+ Semantic_masks.append(np.zeros_like(frame))
+ if mask_type == MaskType.Semantic_mask or mask_type == MaskType.Semantic_expansion_mask:
+ return mask_container, Semantic_masks
+
+ if mask_type == MaskType.bbox_mask:
+ boxes_masks = []
+ box_container = []
+
+ info_index = 0
+
+ for index, frame in enumerate(video):
+ if index in frame_indexes:
+ bbox = infos[info_index]['box']
+ info_index = info_index + 1
+
+
+ boxed_frame = get_bbox_image(frame, bbox, object_info['id'])
+ box_container.append(boxed_frame)
+ boxmask = np.zeros_like(frame)
+ boxmask[int(bbox[1]): int(bbox[3]), int(bbox[0]): int(bbox[2])] = 1
+ boxes_masks.append(boxmask)
+ else:
+ box_container.append(frame)
+ boxes_masks.append(np.zeros_like(frame))
+
+ return box_container, boxes_masks
+
+ if mask_type == MaskType.background_mask:
+ bg_container = select_bg_from_video(background_masks, video)
+ return bg_container, background_masks
+
+ if mask_type == MaskType.fixed_mask or mask_type == MaskType.fixed_bg_mask:
+ fixed_mask_container = []
+ fixed_masks = []
+ box_min_size = 50
+ box_max_size = 100
+ box = get_random_box(video[0],box_min_size=box_min_size, box_max_size=box_max_size)
+        for index, frame in enumerate(video):
+ if mask_type == MaskType.fixed_mask:
+ boxed_frame = frame.copy()
+ boxed_frame[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 0
+ fixed_mask_container.append(boxed_frame)
+
+ fixed_mask = np.zeros_like(frame)
+ fixed_mask[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 1
+ fixed_masks.append(fixed_mask)
+ if mask_type == MaskType.fixed_bg_mask:
+ boxed_frame = frame.copy()
+
+ fixed_mask = np.zeros_like(frame)
+ fixed_mask[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 1
+ fixed_mask = 1 - fixed_mask
+ fixed_masks.append(fixed_mask)
+
+ boxed_bg_frame = get_masked_image(boxed_frame, fixed_mask)
+ fixed_mask_container.append(boxed_bg_frame)
+
+ return fixed_mask_container, fixed_masks
+
+
+
+def video_to_tensor(video_path):
+    # open the video file
+    cap = cv2.VideoCapture(video_path)
+
+    frames = []
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # convert BGR to RGB
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        # convert to a tensor and append to the frame list
+        frame_tensor = torch.from_numpy(frame).permute(2, 0, 1)  # (H, W, C) -> (C, H, W)
+        frames.append(frame_tensor)
+
+    cap.release()
+
+    # stack all frames into a single 4D tensor
+    video_tensor = torch.stack(frames)  # (T, C, H, W)
+
+    return video_tensor
+
+
+def ndarray_to_pil(image: np.ndarray) -> Image.Image:
+    if np.max(image) <= 1.1:
+        image = image * 255
+    image = image.astype(np.uint8)
+    return Image.fromarray(image)
+
+def get_random_type():
+    # candidate mask types
+    mask_type = [MaskType.Semantic_mask, MaskType.bbox_mask, MaskType.background_mask, MaskType.fixed_mask, MaskType.Semantic_expansion_mask, MaskType.fixed_bg_mask]
+
+    # sampling weights (need not sum to 1); e.g. Semantic_mask is drawn with probability 0.3
+    weights = [0.3, 0.2, 0.1, 0.1, 0.2, 0.1]
+
+    # draw one mask type according to the weights
+    chosen_number = random.choices(mask_type, weights=weights)[0]
+
+    return chosen_number
+
+def get_mask_tensor(video_tensor, mask_type, yolo_model):
+
+    masked_video_container, masks_container = get_mask(video_tensor, mask_type, yolo_model)
+
+    # frames come back as (H, W, C) ndarrays; convert to (C, H, W) tensors before stacking
+    masked_frames = [torch.from_numpy(frame.transpose(2, 0, 1)) for frame in masked_video_container]
+    masked_video = torch.stack(masked_frames)
+
+    masks = [torch.from_numpy(np.asarray(mask)).unsqueeze(0) for mask in masks_container]
+    mask = torch.stack(masks)
+
+    return masked_video, mask
+
+
+if __name__ == "__main__":
+
+ video_path = "/home/image_data/hxy/data/video/000184.mp4"
+ model_name_or_path = "/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt"
+ device = "cuda"
+ mask_video_save_path = "/home/image_data/hxy/data/video/000001_mask_video.mp4"
+ mask_save_path = "/home/image_data/hxy/data/video/000001_mask.mp4"
+ box_save_path = "/root/Open-Sora-Plan/opensora/dataset/inpaint_dataset/000184_box.mp4"
+ background_save_path = "/root/Open-Sora-Plan/opensora/dataset/inpaint_dataset/000184_background.mp4"
+ expansion_mask_path = "/root/Open-Sora-Plan/opensora/dataset/inpaint_dataset/000184_expansion.mp4"
+ fixed_bg_mask_path = "/root/Open-Sora-Plan/opensora/dataset/inpaint_dataset/000184_fixed_bg.mp4"
+
+
+ video_tensor = video_to_tensor(video_path)
+ tracker = YOLO(model_name_or_path)
+ tracker = tracker.to(device)
+ Semantic_mask_container, Semantic_masks = get_mask(video_tensor, MaskType.fixed_mask, tracker)
+ # save_videos_from_pil(Semantic_mask_container, mask_video_save_path)
+ # save_videos_from_pil(Semantic_masks, mask_save_path)
+
+ # print(len(Semantic_mask_container))
+
+ # H,W,C = Semantic_mask_container[0].shape
+ # fourcc = cv2.VideoWriter_fourcc(*'mp4v') # MPEG-4 编码器
+ # video = cv2.VideoWriter(mask_save_path, fourcc, 30, (W, H))
+
+ # for frame in Semantic_mask_container:
+ # video.write(frame)
+ # video.release()
diff --git a/opensora/dataset/t2v_datasets.py b/opensora/dataset/t2v_datasets.py
index c548a453c..3b71fb04e 100644
--- a/opensora/dataset/t2v_datasets.py
+++ b/opensora/dataset/t2v_datasets.py
@@ -1,19 +1,54 @@
+import time
+import traceback
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except:
+ torch_npu = None
+ npu_config = None
+import glob
import json
+import pickle
import os, io, csv, math, random
import numpy as np
import torchvision
from einops import rearrange
-from decord import VideoReader
from os.path import join as opj
+from collections import Counter
+import cv2
+import pandas as pd
+import time
import torch
import torchvision.transforms as transforms
from torch.utils.data.dataset import Dataset
+from torch.utils.data import DataLoader, Dataset, get_worker_info
from tqdm import tqdm
from PIL import Image
+from accelerate.logging import get_logger
+import gc
from opensora.utils.dataset_utils import DecordInit
from opensora.utils.utils import text_preprocessing
+from opensora.dataset.transform import get_params, longsideresize, add_masking_notice, motion_mapping_fun, calculate_statistics, \
+ add_webvid_watermark_notice, clean_vidal, add_high_aesthetic_notice_image, add_aesthetic_notice_video, add_high_aesthetic_notice_image_human
+
+import decord
+logger = get_logger(__name__)
+
+def filter_json_by_existed_files(directory, data, postfix=".mp4"):
+    # build a search pattern matching files with the given postfix
+    pattern = os.path.join(directory, '**', f'*{postfix}')
+    mp4_files = glob.glob(pattern, recursive=True)  # find all matching files recursively
+
+    # build a set of absolute file paths
+    mp4_files_set = set(os.path.abspath(path) for path in mp4_files)
+
+    # keep only entries whose path is in the set of existing files
+    filtered_items = [item for item in data if item['path'] in mp4_files_set]
+
+    return filtered_items
def random_video_noise(t, c, h, w):
@@ -21,57 +56,206 @@ def random_video_noise(t, c, h, w):
vid = vid.to(torch.uint8)
return vid
+
+class SingletonMeta(type):
+ """
+    Metaclass that turns any class using it into a singleton.
+ """
+ _instances = {}
+
+ def __call__(cls, *args, **kwargs):
+ if cls not in cls._instances:
+ instance = super().__call__(*args, **kwargs)
+ cls._instances[cls] = instance
+ return cls._instances[cls]
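+
+# Singleton behaviour sketch (illustrative class name):
+#   class Cfg(metaclass=SingletonMeta): ...
+#   assert Cfg() is Cfg()  # repeated construction returns the same instance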
+
+
+class DataSetProg(metaclass=SingletonMeta):
+ def __init__(self):
+ self.cap_list = []
+ self.elements = []
+ self.num_workers = 1
+ self.n_elements = 0
+ self.worker_elements = dict()
+ self.n_used_elements = dict()
+
+ def set_cap_list(self, num_workers, cap_list, n_elements):
+ self.num_workers = num_workers
+ self.cap_list = cap_list
+ self.n_elements = n_elements
+ self.elements = list(range(n_elements))
+
+ print(f"n_elements: {len(self.elements)}", flush=True)
+ # if torch_npu is not None:
+ # random.shuffle(self.elements)
+ # for i in range(self.num_workers):
+ # self.n_used_elements[i] = 0
+ # per_worker = int(math.ceil(len(self.elements) / float(self.num_workers)))
+ # start = i * per_worker
+ # end = min(start + per_worker, len(self.elements))
+ # self.worker_elements[i] = self.elements[start: end]
+
+ def get_item(self, work_info):
+ if work_info is None:
+ worker_id = 0
+ else:
+ worker_id = work_info.id
+
+ idx = self.worker_elements[worker_id][self.n_used_elements[worker_id] % len(self.worker_elements[worker_id])]
+ self.n_used_elements[worker_id] += 1
+ return idx
+
+
+dataset_prog = DataSetProg()
+
+def find_closest_y(x, vae_stride_t=4, model_ds_t=4):
+ if x < 29:
+ return -1
+ for y in range(x, 12, -1):
+ if (y - 1) % vae_stride_t == 0 and ((y - 1) // vae_stride_t + 1) % model_ds_t == 0:
+ # 4, 8: y in [29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, ...]
+ # 4, 4: y in [29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, ...]
+ return y
+ return -1
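+
+# Worked examples (following the comments above, with vae_stride_t=4, model_ds_t=4):
+#   find_closest_y(100)  # -> 93, the largest valid length <= 100
+#   find_closest_y(29)   # -> 29
+#   find_closest_y(28)   # -> -1, too short to encode exactly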
+
+def filter_resolution(h, w, max_h_div_w_ratio=17 / 16, min_h_div_w_ratio=8 / 16):
+    return min_h_div_w_ratio <= h / w <= max_h_div_w_ratio
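+
+# Examples with the default thresholds (h/w must lie in [0.5, 1.0625]):
+#   filter_resolution(720, 1280)   # -> True  (h/w = 0.5625)
+#   filter_resolution(1920, 1080)  # -> False (h/w ~ 1.78)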
+
+def read_parquet(path):
+ df = pd.read_parquet(path)
+ data = df.to_dict(orient='records')
+ return data
+
class T2V_dataset(Dataset):
def __init__(self, args, transform, temporal_sample, tokenizer):
- self.video_folder = args.video_folder
+ self.data = args.data
self.num_frames = args.num_frames
+ self.train_fps = args.train_fps
+ self.use_image_num = args.use_image_num
+ self.use_img_from_vid = args.use_img_from_vid
self.transform = transform
self.temporal_sample = temporal_sample
self.tokenizer = tokenizer
self.model_max_length = args.model_max_length
- self.video_folder = args.video_folder
- self.v_decoder = DecordInit()
+ self.cfg = args.cfg
+ self.speed_factor = args.speed_factor
+ self.max_height = args.max_height
+ self.max_width = args.max_width
+ self.drop_short_ratio = args.drop_short_ratio
+ self.hw_stride = args.hw_stride
+ self.skip_low_resolution = args.skip_low_resolution
+ self.force_resolution = args.force_resolution
+ self.use_motion = args.use_motion
+ assert self.speed_factor >= 1
+ self.video_reader = 'decord' if args.use_decord else 'opencv'
+ # self.v_decoder = DecordInit() if self.video_reader == 'decord' else None
- with open(args.video_data_path, 'r') as f:
- self.vid_cap_list = json.load(f)
- self.use_image_num = args.use_image_num
- self.use_img_from_vid = args.use_img_from_vid
- if self.use_image_num != 0 and not self.use_img_from_vid:
- self.image_folder = args.image_folder
- self.image_data_path = args.image_data_path
- self.img_cap_list = self.get_img_cap_list()
+        self.support_Chinese = 'mt5' in args.text_encoder_name
+
+ s = time.time()
+
+ # self.cache_pkl = f"/storage/dataset/{os.path.basename(self.data).replace('.txt', '_cache.pkl')}"
+ # self.cache_json = f"/storage/dataset/{os.path.basename(self.data).replace('.txt', '_cache.json')}"
+
+ # if os.path.exists(self.cache_pkl):
+ # print(f"Load cache from {self.cache_pkl}")
+ # with open(self.cache_pkl, 'rb') as f:
+ # load_data = pickle.load(f)
+ # cap_list, self.sample_size = load_data['cap_list'], load_data['sample_size']
+ # if os.path.exists(self.cache_json):
+ # print(f"Load cache from {self.cache_json}")
+ # with open(self.cache_json, 'r') as f:
+ # load_data = json.load(f)
+ # cap_list, self.sample_size = load_data['cap_list'], load_data['sample_size']
+ # else:
+ # data_root, cap_list = self.get_cap_list()
+ # assert len(cap_list) > 0
+ cap_list, self.sample_size, _ = self.define_frame_index(self.data)
+ # save_data = dict(cap_list=cap_list, sample_size=self.sample_size)
+ # print(f"Save cache to {self.cache_pkl}")
+ # with open(self.cache_pkl, 'wb') as f:
+ # pickle.dump(save_data, f)
+ # print(f"Save cache to {self.cache_json}")
+ # with open(self.cache_json, 'w') as f:
+ # json.dump(save_data, f)
+ e = time.time()
+        print(f'define_frame_index time: {e - s:.2f}s')
+ self.lengths = self.sample_size
+
+ n_elements = len(cap_list)
+ dataset_prog.set_cap_list(args.dataloader_num_workers, cap_list, n_elements)
+ print(f"data length: {len(dataset_prog.cap_list)}")
+
+ def set_checkpoint(self, n_used_elements):
+ for i in range(len(dataset_prog.n_used_elements)):
+ dataset_prog.n_used_elements[i] = n_used_elements
def __len__(self):
- return len(self.vid_cap_list)
+ return dataset_prog.n_elements
def __getitem__(self, idx):
+ # if npu_config is not None:
+ # worker_info = get_worker_info()
+ # idx = dataset_prog.get_item(worker_info)
try:
- video_data = self.get_video(idx)
- image_data = {}
- if self.use_image_num != 0 and self.use_img_from_vid:
- image_data = self.get_image_from_video(video_data)
- elif self.use_image_num != 0 and not self.use_img_from_vid:
- image_data = self.get_image(idx)
- else:
- raise NotImplementedError
- return dict(video_data=video_data, image_data=image_data)
+ data = self.get_data(idx)
+ return data
except Exception as e:
- print(f'Error with {e}, {self.vid_cap_list[idx]}')
+            logger.info(f'Error with {e}, retrying with a random sample')
return self.__getitem__(random.randint(0, self.__len__() - 1))
+ def get_data(self, idx):
+ path = dataset_prog.cap_list[idx]['path']
+ if path.endswith('.mp4'):
+ return self.get_video(idx)
+ else:
+ return self.get_image(idx)
+
def get_video(self, idx):
- # video = random.choice([random_video_noise(65, 3, 720, 360) * 255, random_video_noise(65, 3, 1024, 1024), random_video_noise(65, 3, 360, 720)])
- # print('random shape', video.shape)
+ # npu_config.print_msg(f"current idx is {idx}")
+ # video = random.choice([random_video_noise(65, 3, 336, 448), random_video_noise(65, 3, 1024, 1024), random_video_noise(65, 3, 360, 480)])
+ # # print('random shape', video.shape)
# input_ids = torch.ones(1, 120).to(torch.long).squeeze(0)
# cond_mask = torch.cat([torch.ones(1, 60).to(torch.long), torch.ones(1, 60).to(torch.long)], dim=1).squeeze(0)
-
- video_path = opj(self.video_folder, self.vid_cap_list[idx]['path'])
- video = self.decord_read(video_path, self.vid_cap_list[idx]['frame_idx'])
+        logger.info(f'Loading t2v video sample {idx}')
+ video_data = dataset_prog.cap_list[idx]
+ video_path = video_data['path']
+        assert os.path.exists(video_path), f"file {video_path} does not exist!"
+ frame_indice = dataset_prog.cap_list[idx]['sample_frame_index']
+ sample_h = video_data['resolution']['sample_height']
+ sample_w = video_data['resolution']['sample_width']
+ if self.video_reader == 'decord':
+ video = self.decord_read(video_path, predefine_frame_indice=frame_indice)
+ elif self.video_reader == 'opencv':
+ video = self.opencv_read(video_path, predefine_frame_indice=frame_indice)
+ else:
+            raise NotImplementedError(f'Found {self.video_reader}, but only decord and opencv are supported')
video = self.transform(video) # T C H W -> T C H W
+ assert video.shape[2] == sample_h and video.shape[3] == sample_w
+
+ # video = torch.rand(221, 3, 480, 640)
+
video = video.transpose(0, 1) # T C H W -> C T H W
- text = self.vid_cap_list[idx]['cap'][0]
+ text = video_data['cap']
+ if not isinstance(text, list):
+ text = [text]
+ text = [random.choice(text)]
+ if '/VIDAL-10M/' in video_path:
+ text = [clean_vidal(text[0])]
+ if '/Webvid-10M/' in video_path:
+ text = [add_webvid_watermark_notice(text[0])]
+ if not (video_data.get('aesthetic', None) is None):
+ text = [add_aesthetic_notice_video(text[0], video_data['aesthetic'])]
- text = text_preprocessing(text)
+ text = [text[0].replace(' image ', ' video ').replace(' image,', ' video,')]
+ text = text_preprocessing(text, support_Chinese=self.support_Chinese) if random.random() > self.cfg else ""
text_tokens_and_mask = self.tokenizer(
text,
max_length=self.model_max_length,
@@ -83,77 +267,341 @@ def get_video(self, idx):
)
input_ids = text_tokens_and_mask['input_ids']
cond_mask = text_tokens_and_mask['attention_mask']
- return dict(video=video, input_ids=input_ids, cond_mask=cond_mask)
-
- def get_image_from_video(self, video_data):
- select_image_idx = np.linspace(0, self.num_frames-1, self.use_image_num, dtype=int)
- assert self.num_frames >= self.use_image_num
- image = [video_data['video'][:, i:i+1] for i in select_image_idx] # num_img [c, 1, h, w]
- input_ids = video_data['input_ids'].repeat(self.use_image_num, 1) # self.use_image_num, l
- cond_mask = video_data['cond_mask'].repeat(self.use_image_num, 1) # self.use_image_num, l
- return dict(image=image, input_ids=input_ids, cond_mask=cond_mask)
+ if self.use_motion:
+ motion_score = motion_mapping_fun(video_data['motion_score'])
+ return dict(pixel_values=video, input_ids=input_ids, cond_mask=cond_mask, motion_score=motion_score)
+ else:
+ return dict(pixel_values=video, input_ids=input_ids, cond_mask=cond_mask, motion_score=None)
def get_image(self, idx):
- idx = idx % len(self.img_cap_list) # out of range
- image_data = self.img_cap_list[idx] # [{'path': path, 'cap': cap}, ...]
-
- image = [Image.open(os.path.join(self.image_folder, i['path'])).convert('RGB') for i in image_data] # num_img [h, w, c]
- image = [torch.from_numpy(np.array(i)) for i in image] # num_img [h, w, c]
- image = [rearrange(i, 'h w c -> c h w').unsqueeze(0) for i in image] # num_img [1 c h w]
- image = [self.transform(i) for i in image] # num_img [1 C H W] -> num_img [1 C H W]
- image = [i.transpose(0, 1) for i in image] # num_img [1 C H W] -> num_img [C 1 H W]
-
- caps = [i['cap'] for i in image_data]
- text = [text_preprocessing(cap) for cap in caps]
+ image_data = dataset_prog.cap_list[idx] # [{'path': path, 'cap': cap}, ...]
+ sample_h = image_data['resolution']['sample_height']
+ sample_w = image_data['resolution']['sample_width']
+
+ image = Image.open(image_data['path']).convert('RGB') # [h, w, c]
+ image = torch.from_numpy(np.array(image)) # [h, w, c]
+ image = rearrange(image, 'h w c -> c h w').unsqueeze(0) # [1 c h w]
+
+ image = self.transform(image) # [1 C H W] -> num_img [1 C H W]
+        assert image.shape[2] == sample_h and image.shape[3] == sample_w
+ # image = [torch.rand(1, 3, 480, 640) for i in image_data]
+ image = image.transpose(0, 1) # [1 C H W] -> [C 1 H W]
+
+ caps = image_data['cap'] if isinstance(image_data['cap'], list) else [image_data['cap']]
+ caps = [random.choice(caps)]
+ if '/sam/' in image_data['path']:
+ caps = [add_masking_notice(caps[0])]
+ if 'ideogram' in image_data['path']:
+ caps = [add_high_aesthetic_notice_image(caps[0])]
+ if 'civitai' in image_data['path']:
+ caps = [add_high_aesthetic_notice_image(caps[0])]
+ if 'human_images' in image_data['path']:
+ caps = [add_high_aesthetic_notice_image_human(caps[0])]
+ text = text_preprocessing(caps, support_Chinese=self.support_Chinese)
input_ids, cond_mask = [], []
- for t in text:
- text_tokens_and_mask = self.tokenizer(
- t,
- max_length=self.model_max_length,
- padding='max_length',
- truncation=True,
- return_attention_mask=True,
- add_special_tokens=True,
- return_tensors='pt'
- )
- input_ids.append(text_tokens_and_mask['input_ids'])
- cond_mask.append(text_tokens_and_mask['attention_mask'])
- input_ids = torch.cat(input_ids) # self.use_image_num, l
- cond_mask = torch.cat(cond_mask) # self.use_image_num, l
- return dict(image=image, input_ids=input_ids, cond_mask=cond_mask)
-
- def tv_read(self, path):
- vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit='sec', output_format='TCHW')
- total_frames = len(vframes)
-
- # Sampling video frames
- start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
- # assert end_frame_ind - start_frame_ind >= self.num_frames
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.num_frames, dtype=int)
-
- video = vframes[frame_indice] # (T, C, H, W)
-
- return video
-
- def decord_read(self, path, frame_idx=None):
- decord_vr = self.v_decoder(path)
- total_frames = len(decord_vr)
- # Sampling video frames
- if frame_idx is None:
- start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
+ text = text if random.random() > self.cfg else ""
+ text_tokens_and_mask = self.tokenizer(
+ text,
+ max_length=self.model_max_length,
+ padding='max_length',
+ truncation=True,
+ return_attention_mask=True,
+ add_special_tokens=True,
+ return_tensors='pt'
+ )
+ input_ids = text_tokens_and_mask['input_ids'] # 1, l
+ cond_mask = text_tokens_and_mask['attention_mask'] # 1, l
+ if self.use_motion:
+ motion_score = motion_mapping_fun(image_data['motion_score'])
+ return dict(pixel_values=image, input_ids=input_ids, cond_mask=cond_mask, motion_score=motion_score)
else:
- start_frame_ind, end_frame_ind = frame_idx.split(':')
- start_frame_ind, end_frame_ind = int(start_frame_ind), int(end_frame_ind)
- # assert end_frame_ind - start_frame_ind >= self.num_frames
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.num_frames, dtype=int)
+ return dict(pixel_values=image, input_ids=input_ids, cond_mask=cond_mask, motion_score=None)
+
+ def define_frame_index(self, data):
+
+ new_cap_list = []
+ sample_size = []
+ motion_score = []
+ aesthetic_score = []
+ cnt_too_long = 0
+ cnt_too_short = 0
+ cnt_no_cap = 0
+ cnt_no_resolution = 0
+ cnt_no_motion = 0
+ cnt_no_aesthetic = 0
+ cnt_resolution_mismatch = 0
+ cnt_movie = 0
+ cnt_img = 0
+ cnt = 0
+
+ with open(data, 'r') as f:
+ folder_anno = [i.strip().split(',') for i in f.readlines() if len(i.strip()) > 0]
+ for sub_root, anno in tqdm(folder_anno):
+ logger.info(f'Building {anno}...')
+ if anno.endswith('.json'):
+ with open(anno, 'r') as f:
+ sub_list = json.load(f)
+ elif anno.endswith('.pkl'):
+ with open(anno, 'rb') as f:
+ sub_list = pickle.load(f)
+ elif anno.endswith('.parquet'):
+ sub_list = read_parquet(anno)
+ else:
+ raise NotImplementedError
+ # with jsonlines.open(anno) as sub_list:
+ for i in tqdm(sub_list):
+ cnt += 1
+ path = os.path.join(sub_root, i['path'])
+ i['path'] = path
+ cap = i.get('cap', None)
+
+ if i.get('aesthetic', None) is None:
+ cnt_no_aesthetic += 1
+ else:
+ aesthetic_score.append(i['aesthetic'])
+
+ # ======no caption=====
+ if cap is None:
+ cnt_no_cap += 1
+ continue
+ # ======no motion=====
+ if self.use_motion:
+ if '.mp4' in path and i.get('motion_average', None) is None and i.get('motion', None) is None:
+ cnt_no_motion += 1
+ continue
+
+ # ======resolution mismatch=====
+ if i.get('resolution', None) is None:
+ cnt_no_resolution += 1
+ continue
+ else:
+ if i['resolution'].get('height', None) is None or i['resolution'].get('width', None) is None:
+ cnt_no_resolution += 1
+ continue
+ else:
+ height, width = i['resolution']['height'], i['resolution']['width']
+ if not self.force_resolution:
+ if height <= 0 or width <= 0:
+ cnt_no_resolution += 1
+ continue
+ tr_h, tr_w = longsideresize(height, width, (self.max_height, self.max_width), self.skip_low_resolution)
+ _, _, sample_h, sample_w = get_params(tr_h, tr_w, self.hw_stride)
+ if sample_h <= 0 or sample_w <= 0:
+ cnt_resolution_mismatch += 1
+ continue
+ i['resolution'].update(dict(sample_height=sample_h, sample_width=sample_w))
+ else:
+ aspect = self.max_height / self.max_width
+ hw_aspect_thr = 1.85
+ is_pick = filter_resolution(height, width, max_h_div_w_ratio=hw_aspect_thr*aspect,
+ min_h_div_w_ratio=1/hw_aspect_thr*aspect)
+ if not is_pick:
+ cnt_resolution_mismatch += 1
+ continue
+ sample_h, sample_w = self.max_height, self.max_width
+ i['resolution'].update(dict(sample_height=sample_h, sample_width=sample_w))
+
+
+ if path.endswith('.mp4'):
+ # ======no fps and duration=====
+ duration = i.get('duration', None)
+ fps = i.get('fps', None)
+ if fps is None or duration is None:
+ continue
+
+ i['num_frames'] = int(fps * duration)
+                    # 6.0x num_frames is just a threshold to filter out videos far longer than this training stage needs
+                    if i['num_frames'] > 6.0 * (self.num_frames * fps / self.train_fps * self.speed_factor):  # too-long videos are not suitable for this training stage (self.num_frames)
+ cnt_too_long += 1
+ continue
+ # if i['num_frames'] < 1.0/1 * (self.num_frames * fps / self.train_fps * self.speed_factor): # too short video is not suitable for this training stage
+ # cnt_too_short += 1
+ # continue
+
+                    # resample in case of high fps, such as 50/60/90/144 -> train_fps (e.g., 24)
+ frame_interval = fps / self.train_fps
+ start_frame_idx = 10 if '/storage/dataset/movie' in i['path'] else 0 # special video
+ frame_indices = np.arange(start_frame_idx, i['num_frames'], frame_interval).astype(int)
+ frame_indices = frame_indices[frame_indices < i['num_frames']]
- video_data = decord_vr.get_batch(frame_indice).asnumpy()
+ # comment out it to enable dynamic frames training
+ if len(frame_indices) < self.num_frames and random.random() < self.drop_short_ratio:
+ cnt_too_short += 1
+ continue
+
+ # too long video will be temporal-crop randomly
+ if len(frame_indices) > self.num_frames:
+ begin_index, end_index = self.temporal_sample(len(frame_indices))
+ frame_indices = frame_indices[begin_index: end_index]
+ # frame_indices = frame_indices[:self.num_frames] # head crop
+ # to find a suitable end_frame_idx, to ensure we do not need pad video
+ end_frame_idx = find_closest_y(len(frame_indices), vae_stride_t=4, model_ds_t=4)
+                    if end_frame_idx == -1:  # too short to be encoded exactly by the video VAE
+ cnt_too_short += 1
+ continue
+ frame_indices = frame_indices[:end_frame_idx]
+
+ if '/storage/dataset/movie' in i['path']:
+ cnt_movie += 1
+
+ i['sample_frame_index'] = frame_indices.tolist()
+ i['motion_score'] = i.get('motion_average', None) or i.get('motion')
+
+ new_cap_list.append(i)
+ # i['sample_num_frames'] = len(i['sample_frame_index']) # will use in dataloader(group sampler)
+
+
+ elif path.endswith('.jpg'): # image
+ cnt_img += 1
+ i['sample_frame_index'] = [0]
+ i['motion_score'] = 1.0
+ new_cap_list.append(i)
+ # i['sample_num_frames'] = len(i['sample_frame_index']) # will use in dataloader(group sampler)
+
+ else:
+                    raise NameError(f"Unknown file extension {path.split('.')[-1]}, only .mp4 video and .jpg image are supported")
+
+ sample_size.append(f"{len(i['sample_frame_index'])}x{sample_h}x{sample_w}")
+ if self.use_motion:
+ motion_score.append(i['motion_score'])
+
+ logger.info(f'no_cap: {cnt_no_cap}, too_long: {cnt_too_long}, too_short: {cnt_too_short}, '
+ f'no_resolution: {cnt_no_resolution}, resolution_mismatch: {cnt_resolution_mismatch}, '
+ f'Counter(sample_size): {Counter(sample_size)}, cnt_movie: {cnt_movie}, cnt_img: {cnt_img}, '
+ f'before filter: {cnt}, after filter: {len(new_cap_list)}')
+ if self.use_motion:
+ stats_motion = calculate_statistics(motion_score)
+ logger.info(f"before filter: {cnt}, after filter: {len(new_cap_list)} | "
+ f"motion_score: {len(motion_score)}, cnt_no_motion: {cnt_no_motion} | "
+ f"{len([i for i in motion_score if i>=0.95])} > 0.95, 0.7 > {len([i for i in motion_score if i<=0.7])} "
+ f"Mean: {stats_motion['mean']}, Var: {stats_motion['variance']}, Std: {stats_motion['std_dev']}, "
+ f"Min: {stats_motion['min']}, Max: {stats_motion['max']}")
+
+ if len(aesthetic_score) > 0:
+ stats_aesthetic = calculate_statistics(aesthetic_score)
+ logger.info(f"before filter: {cnt}, after filter: {len(new_cap_list)} | "
+ f"aesthetic_score: {len(aesthetic_score)}, cnt_no_aesthetic: {cnt_no_aesthetic} | "
+ f"{len([i for i in aesthetic_score if i>=5.75])} > 5.75, 4.5 > {len([i for i in aesthetic_score if i<=4.5])} "
+ f"Mean: {stats_aesthetic['mean']}, Var: {stats_aesthetic['variance']}, Std: {stats_aesthetic['std_dev']}, "
+ f"Min: {stats_aesthetic['min']}, Max: {stats_aesthetic['max']}")
+
+
+ return new_cap_list, sample_size, motion_score
+
+ def decord_read(self, path, predefine_frame_indice):
+ predefine_num_frames = len(predefine_frame_indice)
+ # decord_vr = self.v_decoder(path)
+ decord_vr = decord.VideoReader(path, ctx=decord.cpu(0), num_threads=1)
+ # with open(path, 'rb') as f:
+ # decord_vr = decord.VideoReader(f, ctx=decord.cpu(0), num_threads=1)
+ total_frames = len(decord_vr)
+ fps = decord_vr.get_avg_fps() if decord_vr.get_avg_fps() > 0 else 24.0
+
+ frame_indices = self.get_actual_frame(fps, total_frames, path, predefine_num_frames, predefine_frame_indice)
+
+ video_data = decord_vr.get_batch(frame_indices).asnumpy()
video_data = torch.from_numpy(video_data)
video_data = video_data.permute(0, 3, 1, 2) # (T, H, W, C) -> (T C H W)
+ # del decord_vr
+ # gc.collect()
+ return video_data
+
+ def opencv_read(self, path, predefine_frame_indice):
+ predefine_num_frames = len(predefine_frame_indice)
+ cv2_vr = cv2.VideoCapture(path)
+ if not cv2_vr.isOpened():
+ print(f'can not open {path}')
+ raise ValueError(f'can not open {path}')
+ total_frames = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
+ fps = cv2_vr.get(cv2.CAP_PROP_FPS) if cv2_vr.get(cv2.CAP_PROP_FPS) > 0 else 24.0
+ frame_indices = self.get_actual_frame(fps, total_frames, path, predefine_num_frames, predefine_frame_indice)
+
+ video_data = []
+ for frame_idx in frame_indices:
+            cv2_vr.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)  # seek to the target frame
+ _, frame = cv2_vr.read()
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+ video_data.append(torch.from_numpy(frame).permute(2, 0, 1))
+ cv2_vr.release()
+ video_data = torch.stack(video_data) # (T C H W)
return video_data
- def get_img_cap_list(self):
- with open(self.image_data_path, 'r') as f:
- image_data = json.load(f)
- image_data = [image_data[i: i+self.use_image_num] for i in range(0, len(image_data), self.use_image_num)]
- return image_data[:-1] # drop last to avoid error length
\ No newline at end of file
+ def get_actual_frame(self, fps, total_frames, path, predefine_num_frames, predefine_frame_indice):
+        # resample in case of high fps, such as 50/60/90/144 -> train_fps (e.g., 24)
+ frame_interval = 1.0 if abs(fps - self.train_fps) < 0.1 else fps / self.train_fps
+ start_frame_idx = 10 if '/storage/dataset/movie' in path else 0 # special video
+ frame_indices = np.arange(start_frame_idx, total_frames, frame_interval).astype(int)
+ frame_indices = frame_indices[frame_indices < total_frames]
+
+ # speed up
+ max_speed_factor = len(frame_indices) / self.num_frames
+ if self.speed_factor > 1 and max_speed_factor > 1:
+ # speed_factor = random.uniform(1.0, min(self.speed_factor, max_speed_factor))
+ speed_factor = min(self.speed_factor, max_speed_factor)
+ target_frame_count = int(len(frame_indices) / speed_factor)
+ speed_frame_idx = np.linspace(0, len(frame_indices) - 1, target_frame_count, dtype=int)
+ frame_indices = frame_indices[speed_frame_idx]
+
+ # too long video will be temporal-crop randomly
+ if len(frame_indices) > self.num_frames:
+ begin_index, end_index = self.temporal_sample(len(frame_indices))
+ frame_indices = frame_indices[begin_index: end_index]
+ # frame_indices = frame_indices[:self.num_frames] # head crop
+
+ # to find a suitable end_frame_idx, to ensure we do not need pad video
+ end_frame_idx = find_closest_y(len(frame_indices), vae_stride_t=4, model_ds_t=4)
+        if end_frame_idx == -1:  # too short to be encoded exactly by the video VAE
+ raise IndexError(f'video ({path}) has {total_frames} frames, but need to sample {len(frame_indices)} frames ({frame_indices})')
+ frame_indices = frame_indices[:end_frame_idx]
+ if predefine_num_frames != len(frame_indices):
+ raise ValueError(f'video ({path}) predefine_num_frames ({predefine_num_frames}) ({predefine_frame_indice}) is not equal with frame_indices ({len(frame_indices)}) ({frame_indices})')
+ if len(frame_indices) < self.num_frames and self.drop_short_ratio >= 1:
+ raise IndexError(f'video ({path}) has {total_frames} frames, but need to sample {len(frame_indices)} frames ({frame_indices})')
+ return frame_indices
+
+ def read_jsons(self, data, postfix=".jpg"):
+ data_roots = []
+ cap_lists = []
+ with open(data, 'r') as f:
+ folder_anno = [i.strip().split(',') for i in f.readlines() if len(i.strip()) > 0]
+ for folder, anno in tqdm(folder_anno):
+ logger.info(f'Building {anno}...')
+ if anno.endswith('.json'):
+ with open(anno, 'r') as f:
+ sub_list = json.load(f)
+ elif anno.endswith('.pkl'):
+ with open(anno, 'rb') as f:
+ sub_list = pickle.load(f)
+ elif anno.endswith('.parquet'):
+ sub_list = read_parquet(anno)
+ else:
+ raise NotImplementedError
+ # for i in tqdm(range(len(sub_list))):
+ # sub_list[i]['path'] = opj(folder, sub_list[i]['path'])
+ # if npu_config is not None:
+ # if "civitai" in anno or "ideogram" in anno or "human" in anno:
+ # sub_list = sub_list[npu_config.get_node_id()::npu_config.get_node_size()]
+ # else:
+ # sub_list = filter_json_by_existed_files(folder, sub_list, postfix=postfix)
+ data_roots.append(folder)
+ cap_lists.append(sub_list)
+ return data_roots, cap_lists
+
+ # def get_img_cap_list(self):
+ # use_image_num = self.use_image_num if self.use_image_num != 0 else 1
+ # if npu_config is None:
+ # img_cap_lists = self.read_jsons(self.image_data, postfix=".jpg")
+ # img_cap_lists = [img_cap_lists[i: i + use_image_num] for i in range(0, len(img_cap_lists), use_image_num)]
+ # else:
+ # img_cap_lists = npu_config.try_load_pickle("img_cap_lists_all",
+ # lambda: self.read_jsons(self.image_data, postfix=".jpg"))
+ # img_cap_lists = [img_cap_lists[i: i + use_image_num] for i in range(0, len(img_cap_lists), use_image_num)]
+ # img_cap_lists = img_cap_lists[npu_config.get_local_rank()::npu_config.N_NPU_PER_NODE]
+ # return img_cap_lists[:-1] # drop last to avoid error length
+
+ def get_cap_list(self):
+ data_roots, cap_lists = self.read_jsons(self.data, postfix=".mp4")
+ return data_roots, cap_lists
diff --git a/opensora/dataset/transform.py b/opensora/dataset/transform.py
index bb89c2c85..d2e9018a0 100644
--- a/opensora/dataset/transform.py
+++ b/opensora/dataset/transform.py
@@ -2,6 +2,11 @@
import random
import numbers
from torchvision.transforms import RandomCrop, RandomResizedCrop
+import statistics
+import numpy as np
+import ftfy
+import regex as re
+import html
def _is_tensor_video_clip(clip):
@@ -107,6 +112,25 @@ def center_crop_using_short_edge(clip):
return crop(clip, i, j, th, tw)
+
+def center_crop_th_tw(clip, th, tw, top_crop):
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+
+ h, w = clip.size(-2), clip.size(-1)
+ tr = th / tw
+ if h / w > tr:
+ new_h = int(w * tr)
+ new_w = w
+ else:
+ new_h = h
+ new_w = int(h / tr)
+
+ i = 0 if top_crop else int(round((h - new_h) / 2.0))
+ j = int(round((w - new_w) / 2.0))
+ return crop(clip, i, j, new_h, new_w)
+
def random_shift_crop(clip):
'''
Slide along the long edge, with the short edge as crop size
@@ -145,6 +169,18 @@ def to_tensor(clip):
return clip.float() / 255.0
+def to_tensor_after_resize(clip):
+ """
+ Convert resized tensor to [0, 1]
+ Args:
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+ Return:
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W), but in [0, 1]
+ """
+ _is_tensor_video_clip(clip)
+ # return clip.float().permute(3, 0, 1, 2) / 255.0
+ return clip.float() / 255.0
+
def normalize(clip, mean, std, inplace=False):
"""
Args:
@@ -214,9 +250,18 @@ def __repr__(self) -> str:
return f"{self.__class__.__name__}(size={self.size})"
+def get_params(h, w, stride):
+
+ th, tw = h // stride * stride, w // stride * stride
+
+ i = (h - th) // 2
+ j = (w - tw) // 2
+
+ return i, j, th, tw
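+
+# Example: get_params(237, 412, 32) -> (6, 14, 224, 384): a centered crop whose
+# sides are the largest multiples of the stride that still fit.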
+
class SpatialStrideCropVideo:
def __init__(self, stride):
- self.stride = stride
+ self.stride = stride
def __call__(self, clip):
"""
@@ -226,18 +271,24 @@ def __call__(self, clip):
torch.tensor: cropped video clip by stride.
size is (T, C, OH, OW)
"""
- i, j, h, w = self.get_params(clip)
+ h, w = clip.shape[-2:]
+ i, j, h, w = get_params(h, w, self.stride)
return crop(clip, i, j, h, w)
- def get_params(self, clip):
- h, w = clip.shape[-2:]
-
- th, tw = h // self.stride * self.stride, w // self.stride * self.stride
-
- return 0, 0, th, tw # from top-left
def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size})"
+ return f"{self.__class__.__name__}(stride={self.stride})"
+
+def longsideresize(h, w, size, skip_low_resolution):
+ if h <= size[0] and w <= size[1] and skip_low_resolution:
+ return h, w
+ if h / w > size[0] / size[1]:
+ w = int(w * size[0] / h)
+ h = size[0]
+ else:
+ h = int(h * size[1] / w)
+ w = size[1]
+ return h, w
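+
+# Example: longsideresize(1080, 1920, (320, 320), skip_low_resolution=True)
+# -> (180, 320): the long side shrinks to the target while keeping the aspect
+# ratio; frames already within the target size are returned unchanged.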
class LongSideResizeVideo:
'''
@@ -261,24 +312,20 @@ def __call__(self, clip):
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
Returns:
torch.tensor: scale resized video clip.
- size is (T, C, 512, *) or (T, C, *, 512)
"""
_, _, h, w = clip.shape
- if self.skip_low_resolution and max(h, w) <= self.size:
+ tr_h, tr_w = longsideresize(h, w, self.size, self.skip_low_resolution)
+ if h == tr_h and w == tr_w:
return clip
- if h > w:
- w = int(w * self.size / h)
- h = self.size
- else:
- h = int(h * self.size / w)
- w = self.size
- resize_clip = resize(clip, target_size=(h, w),
+ resize_clip = resize(clip, target_size=(tr_h, tr_w),
interpolation_mode=self.interpolation_mode)
return resize_clip
def __repr__(self) -> str:
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+
+
class CenterCropResizeVideo:
'''
First use the short side for cropping length,
@@ -288,15 +335,13 @@ class CenterCropResizeVideo:
def __init__(
self,
size,
+ top_crop=False,
interpolation_mode="bilinear",
):
- if isinstance(size, tuple):
- if len(size) != 2:
- raise ValueError(f"size should be tuple (height, width), instead got {size}")
- self.size = size
- else:
- self.size = (size, size)
-
+ if len(size) != 2:
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
+ self.size = size
+ self.top_crop = top_crop
self.interpolation_mode = interpolation_mode
def __call__(self, clip):
@@ -307,7 +352,7 @@ def __call__(self, clip):
torch.tensor: scale resized / center cropped video clip.
size is (T, C, crop_size, crop_size)
"""
- clip_center_crop = center_crop_using_short_edge(clip)
+ clip_center_crop = center_crop_th_tw(clip, self.size[0], self.size[1], top_crop=self.top_crop)
clip_center_crop_resize = resize(clip_center_crop, target_size=self.size,
interpolation_mode=self.interpolation_mode)
return clip_center_crop_resize
@@ -452,6 +497,29 @@ def __call__(self, clip):
def __repr__(self) -> str:
return self.__class__.__name__
+
+
+class ToTensorAfterResize:
+ """
+ Convert tensor data type from uint8 to float, divide value by 255.0 and
+ permute the dimensions of clip tensor
+ """
+
+ def __init__(self):
+ pass
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+ Return:
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W), but in [0, 1]
+ """
+ return to_tensor_after_resize(clip)
+
+ def __repr__(self) -> str:
+ return self.__class__.__name__
+
class RandomHorizontalFlipVideo:
@@ -518,6 +586,195 @@ def __call__(self, t, h, w):
truncate_t = truncate_t + 1
return 0, truncate_t
+keywords = [
+    ' man ', ' woman ', ' person ', ' people ', 'human',
+    ' individual ', ' child ', ' kid ', ' girl ', ' boy ',
+    ]
+# pluralize only the space-delimited entries ('human' already matches 'humans' as a substring)
+keywords += [i[:-1] + 's ' for i in keywords if i.endswith(' ')]
+
+masking_notices = [
+ "Note: The faces in this image are blurred.",
+ "This image contains faces that have been pixelated.",
+ "Notice: Faces in this image are masked.",
+ "Please be aware that the faces in this image are obscured.",
+ "The faces in this image are hidden.",
+ "This is an image with blurred faces.",
+ "The faces in this image have been processed.",
+ "Attention: Faces in this image are not visible.",
+ "The faces in this image are partially blurred.",
+ "This image has masked faces.",
+ "Notice: The faces in this picture have been altered.",
+ "This is a picture with obscured faces.",
+ "The faces in this image are pixelated.",
+ "Please note, the faces in this image have been blurred.",
+ "The faces in this photo are hidden.",
+ "The faces in this picture have been masked.",
+ "Note: The faces in this picture are altered.",
+ "This is an image where faces are not clear.",
+ "Faces in this image have been obscured.",
+ "This picture contains masked faces.",
+ "The faces in this image are processed.",
+ "The faces in this picture are not visible.",
+ "Please be aware, the faces in this photo are pixelated.",
+ "The faces in this picture have been blurred.",
+]
+
+webvid_watermark_notices = [
+ "This video has a faint Shutterstock watermark in the center.",
+ "There is a slight Shutterstock watermark in the middle of this video.",
+ "The video contains a subtle Shutterstock watermark in the center.",
+ "This video features a light Shutterstock watermark at its center.",
+ "A faint Shutterstock watermark is present in the middle of this video.",
+ "There is a mild Shutterstock watermark at the center of this video.",
+ "This video has a slight Shutterstock watermark in the middle.",
+ "You can see a faint Shutterstock watermark in the center of this video.",
+ "A subtle Shutterstock watermark appears in the middle of this video.",
+ "This video includes a light Shutterstock watermark at its center.",
+]
+
+
+high_aesthetic_score_notices_video = [
+ "This video has a high aesthetic quality.",
+ "The beauty of this video is exceptional.",
+ "This video scores high in aesthetic value.",
+ "With its harmonious colors and balanced composition.",
+ "This video ranks highly for aesthetic quality",
+ "The artistic quality of this video is excellent.",
+ "This video is rated high for beauty.",
+ "The aesthetic quality of this video is impressive.",
+ "This video has a top aesthetic score.",
+ "The visual appeal of this video is outstanding.",
+]
+
+low_aesthetic_score_notices_video = [
+ "This video has a low aesthetic quality.",
+ "The beauty of this video is minimal.",
+ "This video scores low in aesthetic appeal.",
+ "The aesthetic quality of this video is below average.",
+ "This video ranks low for beauty.",
+ "The artistic quality of this video is lacking.",
+ "This video has a low score for aesthetic value.",
+ "The visual appeal of this video is low.",
+ "This video is rated low for beauty.",
+ "The aesthetic quality of this video is poor.",
+]
+
+
+high_aesthetic_score_notices_image = [
+ "This image has a high aesthetic quality.",
+ "The beauty of this image is exceptional",
+ "This photo scores high in aesthetic value.",
+ "With its harmonious colors and balanced composition.",
+ "This image ranks highly for aesthetic quality.",
+ "The artistic quality of this photo is excellent.",
+ "This image is rated high for beauty.",
+ "The aesthetic quality of this image is impressive.",
+ "This photo has a top aesthetic score.",
+ "The visual appeal of this image is outstanding.",
+]
+
+high_aesthetic_score_notices_image_human = [
+ "High-quality image with visible human features and high aesthetic score.",
+ "Clear depiction of an individual in a high-quality image with top aesthetics.",
+ "High-resolution photo showcasing visible human details and high beauty rating.",
+ "Detailed, high-quality image with well-defined human subject and strong aesthetic appeal.",
+ "Sharp, high-quality portrait with clear human features and high aesthetic value.",
+ "High-quality image featuring a well-defined human presence and exceptional aesthetics.",
+ "Visible human details in a high-resolution photo with a high aesthetic score.",
+ "Clear, high-quality image with prominent human subject and superior aesthetic rating.",
+ "High-quality photo capturing a visible human with excellent aesthetics.",
+ "Detailed, high-quality image of a human with high visual appeal and aesthetic value.",
+]
+
+
+def add_masking_notice(caption):
+ if any(keyword in caption for keyword in keywords):
+ notice = random.choice(masking_notices)
+ return random.choice([caption + ' ' + notice, notice + ' ' + caption])
+ return caption
+
+def add_webvid_watermark_notice(caption):
+ notice = random.choice(webvid_watermark_notices)
+ return random.choice([caption + ' ' + notice, notice + ' ' + caption])
+
+def add_aesthetic_notice_video(caption, aesthetic_score):
+    assert aesthetic_score is not None
+ if aesthetic_score <= 4.5:
+ notice = random.choice(low_aesthetic_score_notices_video)
+ return random.choice([caption + ' ' + notice, notice + ' ' + caption])
+ if aesthetic_score >= 5.75:
+ notice = random.choice(high_aesthetic_score_notices_video)
+ return random.choice([caption + ' ' + notice, notice + ' ' + caption])
+ return caption
+
+def add_high_aesthetic_notice_image(caption):
+ notice = random.choice(high_aesthetic_score_notices_image)
+ return random.choice([caption + ' ' + notice, notice + ' ' + caption])
+
+def add_high_aesthetic_notice_image_human(caption):
+ notice = random.choice(high_aesthetic_score_notices_image_human)
+ return random.choice([caption + ' ' + notice, notice + ' ' + caption])
+
+def basic_clean(text):
+ text = ftfy.fix_text(text)
+ text = html.unescape(html.unescape(text))
+ return text.strip()
+
+
+def whitespace_clean(text):
+ text = re.sub(r"\s+", " ", text)
+ text = text.strip()
+ return text
+
+
+def clean_youtube(text, is_tags=False):
+ text = text.lower() + ' '
+ text = re.sub(
+ r'#video|video|#shorts|shorts| shorts|#short| short|#youtubeshorts|youtubeshorts|#youtube| youtube|#shortsyoutube|#ytshorts|ytshorts|#ytshort|#shortvideo|shortvideo|#shortsfeed|#tiktok|tiktok|#tiktokchallenge|#myfirstshorts|#myfirstshort|#viral|viralvideo|viral|#viralshorts|#virlshort|#ytviralshorts|#instagram',
+ ' ', text)
+ text = re.sub(r' s |short|youtube|virlshort|#', ' ', text)
+ pattern = r'[^a-zA-Z0-9\s\.,;:?!\'\"|]'
+ if is_tags:
+ pattern = r'[^a-zA-Z0-9\s]'
+ text = re.sub(pattern, '', text)
+ text = whitespace_clean(basic_clean(text))
+ return text
+
+def clean_vidal(text):
+ title_hashtags = text.split('#')
+ title, hashtags = title_hashtags[0], '#' + '#'.join(title_hashtags[1:])
+ title = clean_youtube(title)
+ hashtags = clean_youtube(hashtags, is_tags=True)
+ text = title + ', ' + hashtags
+ if text == '' or text.isspace():
+ raise ValueError('text is empty')
+ return text
+
+
+
+def motion_mapping_fun(motion_score, n=3):
+ assert motion_score is not None
+ return max(motion_score, 0.0) ** n
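+
+# Example with the default n=3: motion_mapping_fun(0.9) -> 0.729; negative
+# scores are clamped to 0.0 before being cubed.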
+
+
+def calculate_statistics(data):
+ if len(data) == 0:
+ return None
+ data = np.array(data)
+ mean = np.mean(data)
+ variance = np.var(data)
+ std_dev = np.std(data)
+ minimum = np.min(data)
+ maximum = np.max(data)
+
+ return {
+ 'mean': mean,
+ 'variance': variance,
+ 'std_dev': std_dev,
+ 'min': minimum,
+ 'max': maximum
+ }
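+
+# Example: calculate_statistics([1, 2, 3]) -> mean 2.0, variance ~0.667
+# (population variance), std_dev ~0.816, min 1, max 3; an empty list gives None.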
+
if __name__ == '__main__':
from torchvision import transforms
import torchvision.io as io
diff --git a/opensora/dataset/ucf101.py b/opensora/dataset/ucf101.py
deleted file mode 100644
index c368976fa..000000000
--- a/opensora/dataset/ucf101.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import math
-import os
-
-import decord
-import numpy as np
-import torch
-import torchvision
-from decord import VideoReader, cpu
-from torch.utils.data import Dataset
-from torchvision.transforms import Compose, Lambda, ToTensor
-from torchvision.transforms._transforms_video import NormalizeVideo, RandomCropVideo, RandomHorizontalFlipVideo
-from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
-from torch.nn import functional as F
-import random
-
-from opensora.utils.dataset_utils import DecordInit
-
-
-class UCF101(Dataset):
- def __init__(self, args, transform, temporal_sample):
- self.data_path = args.data_path
- self.num_frames = args.num_frames
- self.transform = transform
- self.temporal_sample = temporal_sample
- self.v_decoder = DecordInit()
-
- self.classes = sorted(os.listdir(self.data_path))
- self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}
- self.samples = self._make_dataset()
-
-
- def _make_dataset(self):
- dataset = []
- for class_name in self.classes:
- class_path = os.path.join(self.data_path, class_name)
- for fname in os.listdir(class_path):
- if fname.endswith('.avi'):
- item = (os.path.join(class_path, fname), self.class_to_idx[class_name])
- dataset.append(item)
- return dataset
-
- def __len__(self):
- return len(self.samples)
-
- def __getitem__(self, idx):
- video_path, label = self.samples[idx]
- try:
- video = self.tv_read(video_path)
- video = self.transform(video) # T C H W -> T C H W
- video = video.transpose(0, 1) # T C H W -> C T H W
- return video, label
- except Exception as e:
- print(f'Error with {e}, {video_path}')
- return self.__getitem__(random.randint(0, self.__len__()-1))
-
- def tv_read(self, path):
- vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit='sec', output_format='TCHW')
- total_frames = len(vframes)
-
- # Sampling video frames
- start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
- # assert end_frame_ind - start_frame_ind >= self.num_frames
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.num_frames, dtype=int)
- video = vframes[frame_indice] # (T, C, H, W)
-
- return video
-
- def decord_read(self, path):
- decord_vr = self.v_decoder(path)
- total_frames = len(decord_vr)
- # Sampling video frames
- start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
- # assert end_frame_ind - start_frame_ind >= self.num_frames
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.num_frames, dtype=int)
-
- video_data = decord_vr.get_batch(frame_indice).asnumpy()
- video_data = torch.from_numpy(video_data)
- video_data = video_data.permute(0, 3, 1, 2) # (T, H, W, C) -> (T C H W)
- return video_data
-
diff --git a/opensora/dataset/virtual_disk.py b/opensora/dataset/virtual_disk.py
new file mode 100644
index 000000000..5865a31b9
--- /dev/null
+++ b/opensora/dataset/virtual_disk.py
@@ -0,0 +1,230 @@
+import subprocess
+import json
+import pickle
+from collections import OrderedDict
+from opensora.npu_config import npu_config
+
+import sys
+import os
+
+class SuppressStdout:
+ _instance = None
+
+ def __new__(cls, *args, **kwargs):
+ if cls._instance is None:
+ cls._instance = super(SuppressStdout, cls).__new__(cls, *args, **kwargs)
+ return cls._instance
+
+ def __enter__(self):
+ self._original_stdout = sys.stdout
+ sys.stdout = open(os.devnull, 'w')
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ sys.stdout.close()
+ sys.stdout = self._original_stdout
+
+# create the singleton
+
+
+class ObsConnection:
+    """
+    AK/SK/STS_TOKEN temporary credentials from the cloud platform are valid for at most 24h.
+    buckets & object: https://uconsole.ccaicc.com/#/mgt/modelarts -> object console
+    keys & tokens: https://uconsole.ccaicc.com/#/mgt/modelarts -> object console -> get access keys (AK and SK)
+    """
+ def __init__(self):
+ with open(f"{npu_config.work_path}/scripts/train_data/key.json", "r") as f:
+ key = json.load(f)
+ self.AK = key["AK"]
+ self.SK = key["SK"]
+ self.endpoint = key["EP"]
+ self.bucket = "sora"
+ self.suppress_stdout = SuppressStdout()
+
+ def connect(self, obs):
+ config_command = [
+ obs, 'config',
+ '-i=' + self.AK,
+ '-k=' + self.SK,
+ '-e=' + self.endpoint
+ ]
+ result = subprocess.run(config_command, capture_output=True, text=True)
+ if result.returncode != 0:
+ print(f"Failed to configure obsutil: {result.stderr}")
+ else:
+ print("Successfully configured obsutil")
+
+class VirtualDisk:
+    """
+    :param storage_dir: mount point of the in-memory virtual disk.
+    :param size: size of the in-memory virtual disk, e.g. '1G'.
+    :param obs: path to the obsutil binary on the Linux system.
+    :param connection: abstraction over OBS connection management.
+    """
+ def __init__(self, storage_dir, size="1G", obs="/home/opensora/obsutil_linux_arm64_5.5.12/obsutil"):
+ self.obs = obs
+ self.connection = ObsConnection()
+ self.connection.connect(obs)
+ os.makedirs(storage_dir, exist_ok=True)
+ self.storage_dir = storage_dir
+ self.size = self._convert_size_to_bytes(size)
+ if not self.is_tmpfs_mounted():
+ self.create_ramdisk()
+ else:
+ print(f"{self.storage_dir} is already mounted as tmpfs.")
+ self.index_file = os.path.join(self.storage_dir, 'index.pkl')
+ self.index = self.load_index()
+ self.lru = OrderedDict()
+        self.current_size = self.get_total_storage_size() # compute the total size at init time
+
+ def _convert_size_to_bytes(self, size):
+ unit = size[-1].upper()
+ size_value = int(size[:-1])
+ if unit == 'K':
+ return size_value * 1024
+ elif unit == 'M':
+ return size_value * 1024 ** 2
+ elif unit == 'G':
+ return size_value * 1024 ** 3
+ else:
+ raise ValueError("Invalid size unit. Use K, M, or G.")
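+
+    # Examples (sketch): _convert_size_to_bytes("512K") == 524288 and
+    # _convert_size_to_bytes("1G") == 1073741824; any other unit raises ValueError.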
+
+ """
+ 创建并挂载一个 tmpfs 类型的内存虚拟磁盘。
+ """
+ def create_ramdisk(self):
+ try:
+ # 如果挂载点目录不存在,创建它
+ if not os.path.exists(self.storage_dir):
+ os.makedirs(self.storage_dir)
+ # 挂载 tmpfs 到挂载点
+ subprocess.run(['sudo', 'mount', '-t', 'tmpfs', '-o', f'size={self.size}', 'tmpfs', self.storage_dir], check=True)
+ print(f"Successfully mounted tmpfs on {self.storage_dir} with size {self.size}.")
+
+ except subprocess.CalledProcessError as e:
+ print(f"Failed to mount tmpfs: {e}")
+ except Exception as e:
+ print(f"An error occurred: {e}")
+
+ def load_index(self):
+ """
+ 加载索引文件。
+ :return: 索引字典。
+ """
+ if os.path.exists(self.index_file):
+ with open(self.index_file, 'rb') as f:
+ return pickle.load(f)
+ return {}
+
+ def save_index(self):
+ """
+ 保存索引文件。
+ """
+ with open(self.index_file, 'wb') as f:
+ pickle.dump(self.index, f)
+
+ """
+ 取消挂载内存虚拟磁盘。
+
+ :param storage_dir: 内存虚拟磁盘的挂载点路径。
+ """
+ def unmount_ramdisk(self):
+ try:
+ # 确保没有进程在使用挂载点后取消挂载
+ subprocess.run(['sudo', 'umount', self.storage_dir], check=True)
+ print(f"Successfully unmounted tmpfs from {self.storage_dir}.")
+ except subprocess.CalledProcessError as e:
+ print(f"Failed to unmount tmpfs: {e}")
+ except Exception as e:
+ print(f"An error occurred: {e}")
+
+ """
+ 检查挂载点是否已经被挂载为 tmpfs。
+ :param storage_dir: 挂载点路径。
+ :return: 如果已挂载为 tmpfs,返回 True;否则返回 False。
+ """
+ def is_tmpfs_mounted(self):
+ try:
+ result = subprocess.run(['mountpoint', '-q', self.storage_dir], check=False)
+            return result.returncode == 0
+ except Exception as e:
+ print(f"An error occurred while checking if tmpfs is mounted: {e}")
+ return False
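+
+    # Note on the CLI used above: `mountpoint -q <dir>` exits 0 when <dir> is
+    # a mount point and non-zero otherwise; it does not verify that the
+    # mounted filesystem is actually tmpfs.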
+
+ def get_data(self, key):
+ """
+ 获取存储在本地磁盘上的数据。如果数据不存在,通过 obsutil 从远端获取并存储。
+ :param key: 数据的唯一键。
+ :return: 数据。
+ """
+ # if key in self.index:
+ # data_file = self.index[key]
+ # if os.path.exists(data_file):
+ # self.lru.move_to_end(key)
+ # with open(data_file, 'rb') as f:
+ # # print(f"Successfully get {key} from local")
+ # return pickle.load(f)
+
+
+        # If the data is not cached, fetch it from the remote side via obsutil
+        object_name = key # assume the key matches the remote object name
+ local_path = os.path.join(self.storage_dir, key)
+
+ with self.connection.suppress_stdout:
+ self.download_and_convert_to_pickle(self.connection.bucket, object_name, local_path)
+
+        # Record where the data is stored
+ # self.index[key] = local_path
+ # self.save_index()
+ # self.lru[key] = local_path
+ #
+ # file_size = os.path.getsize(local_path)
+ # self.current_size += file_size
+
+ # self.ensure_storage_limit()
+
+ return local_path
+
+ def del_data(self, local_path):
+ os.remove(local_path)
+
+ def download_and_convert_to_pickle(self, bucket, object_name, local_path):
+ """
+ 使用 obsutil 从 OBS 下载文件并转换为 pickle 格式存储到本地路径。
+ :param bucket: OBS 存储桶名称。
+ :param object_name: OBS 中的对象名称。
+ :param local_path: 本地文件路径。
+ """
+ # try:
+ # 下载文件到local_path路径
+ subprocess.run([self.obs, 'cp', f'obs://{bucket}/{object_name}', local_path], check=True)
+ # print(f"Successfully downloaded obs://{bucket}/{object_name} to {local_path}.")
+
+ # except subprocess.CalledProcessError as e:
+ # print(f"Failed to download obs://{bucket}/{object_name} to {local_path}: {e}")
+
+ def ensure_storage_limit(self):
+ """
+ 确保存储总大小不超过虚拟磁盘大小,超出时根据LRU策略删除最旧的文件。
+ """
+ while self.current_size > self.size:
+ oldest_key, oldest_path = self.lru.popitem(last=False)
+ file_size = os.path.getsize(oldest_path)
+ os.remove(oldest_path)
+ del self.index[oldest_key]
+ self.save_index()
+ print(f"Removed {oldest_key} to free up {file_size} bytes.")
+ self.current_size -= file_size
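+
+    # Eviction sketch: self.lru is an OrderedDict ordered from least- to
+    # most-recently used, so popitem(last=False) drops the stalest entry
+    # until current_size fits within the configured disk size.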
+
+ def get_total_storage_size(self):
+ """
+ 获取当前所有存储文件的总大小。
+ :return: 总大小(字节)。
+ """
+ total_size = 0
+ for path in self.lru.values():
+ if os.path.exists(path):
+ total_size += os.path.getsize(path)
+ return total_size
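+
+# Minimal usage sketch (assumes obsutil is installed, key.json exists, and the
+# process can sudo-mount tmpfs; the path, key, and pickle payload below are
+# illustrative, not part of the file's API):
+#
+#     disk = VirtualDisk("/mnt/opensora_ramdisk", size="2G")
+#     local_path = disk.get_data("videos/sample.pkl")  # downloads via obsutil
+#     with open(local_path, 'rb') as f:
+#         payload = pickle.load(f)
+#     disk.del_data(local_path)      # free tmpfs space
+#     disk.unmount_ramdisk()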
\ No newline at end of file
diff --git a/opensora/eval/eval_common_metric.py b/opensora/eval/eval_common_metric.py
deleted file mode 100644
index 452c03209..000000000
--- a/opensora/eval/eval_common_metric.py
+++ /dev/null
@@ -1,224 +0,0 @@
-"""Calculates the CLIP Scores
-
-The CLIP model is a contrasitively learned language-image model. There is
-an image encoder and a text encoder. It is believed that the CLIP model could
-measure the similarity of cross modalities. Please find more information from
-https://github.com/openai/CLIP.
-
-The CLIP Score measures the Cosine Similarity between two embedded features.
-This repository utilizes the pretrained CLIP Model to calculate
-the mean average of cosine similarities.
-
-See --help to see further details.
-
-Code adapted from https://github.com/mseitzer/pytorch-fid and https://github.com/openai/CLIP.
-
-Copyright 2023 The Hong Kong Polytechnic University
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-import os
-import os.path as osp
-from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
-import numpy as np
-import torch
-from torch.utils.data import Dataset, DataLoader, Subset
-from decord import VideoReader, cpu
-import random
-from pytorchvideo.transforms import ShortSideScale
-from torchvision.io import read_video
-from torchvision.transforms import Lambda, Compose
-from torchvision.transforms._transforms_video import CenterCropVideo
-import sys
-sys.path.append(".")
-from opensora.eval.cal_lpips import calculate_lpips
-from opensora.eval.cal_fvd import calculate_fvd
-from opensora.eval.cal_psnr import calculate_psnr
-from opensora.eval.cal_flolpips import calculate_flolpips
-from opensora.eval.cal_ssim import calculate_ssim
-
-try:
- from tqdm import tqdm
-except ImportError:
- # If tqdm is not available, provide a mock version of it
- def tqdm(x):
- return x
-
-class VideoDataset(Dataset):
- def __init__(self,
- real_video_dir,
- generated_video_dir,
- num_frames,
- sample_rate = 1,
- crop_size=None,
- resolution=128,
- ) -> None:
- super().__init__()
- self.real_video_files = self._combine_without_prefix(real_video_dir)
- self.generated_video_files = self._combine_without_prefix(generated_video_dir)
- self.num_frames = num_frames
- self.sample_rate = sample_rate
- self.crop_size = crop_size
- self.short_size = resolution
-
-
- def __len__(self):
- return len(self.real_video_files)
-
- def __getitem__(self, index):
- if index >= len(self):
- raise IndexError
- real_video_file = self.real_video_files[index]
- generated_video_file = self.generated_video_files[index]
- print(real_video_file, generated_video_file)
- real_video_tensor = self._load_video(real_video_file)
- generated_video_tensor = self._load_video(generated_video_file)
- return {'real': real_video_tensor, 'generated':generated_video_tensor }
-
-
- def _load_video(self, video_path):
- num_frames = self.num_frames
- sample_rate = self.sample_rate
- decord_vr = VideoReader(video_path, ctx=cpu(0))
- total_frames = len(decord_vr)
- sample_frames_len = sample_rate * num_frames
-
- if total_frames >= sample_frames_len:
- s = 0
- e = s + sample_frames_len
- num_frames = num_frames
- else:
- s = 0
- e = total_frames
- num_frames = int(total_frames / sample_frames_len * num_frames)
- print(f'sample_frames_len {sample_frames_len}, only can sample {num_frames * sample_rate}', video_path,
- total_frames)
-
-
- frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
- video_data = decord_vr.get_batch(frame_id_list).asnumpy()
- video_data = torch.from_numpy(video_data)
-        video_data = video_data.permute(0, 3, 1, 2) # (T, H, W, C) -> (T, C, H, W)
- return _preprocess(video_data, short_size=self.short_size, crop_size = self.crop_size)
-
-
- def _combine_without_prefix(self, folder_path, prefix='.'):
- folder = []
- os.makedirs(folder_path, exist_ok=True)
- for name in os.listdir(folder_path):
- if name[0] == prefix:
- continue
- if osp.isfile(osp.join(folder_path, name)):
- folder.append(osp.join(folder_path, name))
- folder.sort()
- return folder
-
-def _preprocess(video_data, short_size=128, crop_size=None):
- transform = Compose(
- [
- Lambda(lambda x: x / 255.0),
- ShortSideScale(size=short_size),
- CenterCropVideo(crop_size=crop_size),
- ]
- )
- video_outputs = transform(video_data)
- # video_outputs = torch.unsqueeze(video_outputs, 0) # (bz,c,t,h,w)
- return video_outputs
-
-
-def calculate_common_metric(args, dataloader, device):
-
- score_list = []
- for batch_data in tqdm(dataloader): # {'real': real_video_tensor, 'generated':generated_video_tensor }
- real_videos = batch_data['real']
- generated_videos = batch_data['generated']
- assert real_videos.shape[2] == generated_videos.shape[2]
- if args.metric == 'fvd':
- tmp_list = list(calculate_fvd(real_videos, generated_videos, args.device, method=args.fvd_method)['value'].values())
- elif args.metric == 'ssim':
- tmp_list = list(calculate_ssim(real_videos, generated_videos)['value'].values())
- elif args.metric == 'psnr':
- tmp_list = list(calculate_psnr(real_videos, generated_videos)['value'].values())
- elif args.metric == 'flolpips':
- result = calculate_flolpips(real_videos, generated_videos, args.device)
- tmp_list = list(result['value'].values())
- else:
- tmp_list = list(calculate_lpips(real_videos, generated_videos, args.device)['value'].values())
- score_list += tmp_list
- return np.mean(score_list)
-
-def main():
- parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
- parser.add_argument('--batch_size', type=int, default=2,
- help='Batch size to use')
-    parser.add_argument('--real_video_dir', type=str,
-                        help='the path of real videos')
-    parser.add_argument('--generated_video_dir', type=str,
-                        help='the path of generated videos')
- parser.add_argument('--device', type=str, default=None,
- help='Device to use. Like cuda, cuda:0 or cpu')
- parser.add_argument('--num_workers', type=int, default=8,
- help=('Number of processes to use for data loading. '
- 'Defaults to `min(8, num_cpus)`'))
- parser.add_argument('--sample_fps', type=int, default=30)
- parser.add_argument('--resolution', type=int, default=336)
- parser.add_argument('--crop_size', type=int, default=None)
- parser.add_argument('--num_frames', type=int, default=100)
- parser.add_argument('--sample_rate', type=int, default=1)
- parser.add_argument('--subset_size', type=int, default=None)
- parser.add_argument("--metric", type=str, default="fvd",choices=['fvd','psnr','ssim','lpips', 'flolpips'])
- parser.add_argument("--fvd_method", type=str, default='styleganv',choices=['styleganv','videogpt'])
-
-
- args = parser.parse_args()
-
- if args.device is None:
- device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
- else:
- device = torch.device(args.device)
-
- if args.num_workers is None:
- try:
- num_cpus = len(os.sched_getaffinity(0))
- except AttributeError:
- # os.sched_getaffinity is not available under Windows, use
- # os.cpu_count instead (which may not return the *available* number
- # of CPUs).
- num_cpus = os.cpu_count()
-
- num_workers = min(num_cpus, 8) if num_cpus is not None else 0
- else:
- num_workers = args.num_workers
-
-
- dataset = VideoDataset(args.real_video_dir,
- args.generated_video_dir,
- num_frames = args.num_frames,
- sample_rate = args.sample_rate,
- crop_size=args.crop_size,
- resolution=args.resolution)
-
- if args.subset_size:
- indices = range(args.subset_size)
- dataset = Subset(dataset, indices=indices)
-
- dataloader = DataLoader(dataset, args.batch_size,
- num_workers=num_workers, pin_memory=True)
-
-
- metric_score = calculate_common_metric(args, dataloader,device)
- print('metric: ', args.metric, " ",metric_score)
-
-if __name__ == '__main__':
- main()
diff --git a/opensora/models/__init__.py b/opensora/models/__init__.py
index e69de29bb..f6913d2f4 100644
--- a/opensora/models/__init__.py
+++ b/opensora/models/__init__.py
@@ -0,0 +1 @@
+from .causalvideovae import CausalVAEModelWrapper, WFVAEModelWrapper
\ No newline at end of file
diff --git a/opensora/models/ae/__init__.py b/opensora/models/ae/__init__.py
deleted file mode 100644
index 43a6ef0d4..000000000
--- a/opensora/models/ae/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from .imagebase import imagebase_ae, imagebase_ae_stride, imagebase_ae_channel
-from .videobase import videobase_ae, videobase_ae_stride, videobase_ae_channel
-from .videobase import (
- VQVAEConfiguration,
- VQVAEModel,
- VQVAETrainer,
- CausalVQVAEModel,
- CausalVQVAEConfiguration,
- CausalVQVAETrainer
-)
-
-ae_stride_config = {}
-ae_stride_config.update(imagebase_ae_stride)
-ae_stride_config.update(videobase_ae_stride)
-
-ae_channel_config = {}
-ae_channel_config.update(imagebase_ae_channel)
-ae_channel_config.update(videobase_ae_channel)
-
-def getae(args):
- """deprecation"""
- ae = imagebase_ae.get(args.ae, None) or videobase_ae.get(args.ae, None)
- assert ae is not None
- return ae(args.ae)
-
-def getae_wrapper(ae):
- """deprecation"""
- ae = imagebase_ae.get(ae, None) or videobase_ae.get(ae, None)
- assert ae is not None
- return ae
\ No newline at end of file
diff --git a/opensora/models/ae/imagebase/__init__.py b/opensora/models/ae/imagebase/__init__.py
deleted file mode 100644
index 12eeb327f..000000000
--- a/opensora/models/ae/imagebase/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from .vae.vae import HFVAEWrapper
-from .vae.vae import SDVAEWrapper
-from .vqvae.vqvae import SDVQVAEWrapper
-
-vae = ['stabilityai/sd-vae-ft-mse', 'stabilityai/sd-vae-ft-ema']
-vqvae = ['vqgan_imagenet_f16_1024', 'vqgan_imagenet_f16_16384', 'vqgan_gumbel_f8']
-
-imagebase_ae_stride = {
- 'stabilityai/sd-vae-ft-mse': [1, 8, 8],
- 'stabilityai/sd-vae-ft-ema': [1, 8, 8],
- 'vqgan_imagenet_f16_1024': [1, 16, 16],
- 'vqgan_imagenet_f16_16384': [1, 16, 16],
- 'vqgan_gumbel_f8': [1, 8, 8],
-}
-
-imagebase_ae_channel = {
- 'stabilityai/sd-vae-ft-mse': 4,
- 'stabilityai/sd-vae-ft-ema': 4,
- 'vqgan_imagenet_f16_1024': -1,
- 'vqgan_imagenet_f16_16384': -1,
- 'vqgan_gumbel_f8': -1,
-}
-
-imagebase_ae = {
- 'stabilityai/sd-vae-ft-mse': HFVAEWrapper,
- 'stabilityai/sd-vae-ft-ema': HFVAEWrapper,
- 'vqgan_imagenet_f16_1024': SDVQVAEWrapper,
- 'vqgan_imagenet_f16_16384': SDVQVAEWrapper,
- 'vqgan_gumbel_f8': SDVQVAEWrapper,
-}
\ No newline at end of file
diff --git a/opensora/models/ae/imagebase/vae/vae.py b/opensora/models/ae/imagebase/vae/vae.py
deleted file mode 100644
index 4f197ae12..000000000
--- a/opensora/models/ae/imagebase/vae/vae.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from einops import rearrange
-from torch import nn
-from diffusers.models import AutoencoderKL
-
-
-class HFVAEWrapper(nn.Module):
- def __init__(self, hfvae='mse'):
- super(HFVAEWrapper, self).__init__()
- self.vae = AutoencoderKL.from_pretrained(hfvae, cache_dir='cache_dir')
- def encode(self, x): # b c h w
- t = 0
- if x.ndim == 5:
- b, c, t, h, w = x.shape
- x = rearrange(x, 'b c t h w -> (b t) c h w').contiguous()
- x = self.vae.encode(x).latent_dist.sample().mul_(0.18215)
- if t != 0:
- x = rearrange(x, '(b t) c h w -> b c t h w', t=t).contiguous()
- return x
- def decode(self, x):
- t = 0
- if x.ndim == 5:
- b, c, t, h, w = x.shape
- x = rearrange(x, 'b c t h w -> (b t) c h w').contiguous()
- x = self.vae.decode(x / 0.18215).sample
- if t != 0:
- x = rearrange(x, '(b t) c h w -> b t c h w', t=t).contiguous()
- return x
-
-class SDVAEWrapper(nn.Module):
- def __init__(self):
- super(SDVAEWrapper, self).__init__()
- raise NotImplementedError
-
- def encode(self, x): # b c h w
- raise NotImplementedError
-
- def decode(self, x):
- raise NotImplementedError
\ No newline at end of file
diff --git a/opensora/models/ae/imagebase/vqvae/model.py b/opensora/models/ae/imagebase/vqvae/model.py
deleted file mode 100644
index 7c9a757f7..000000000
--- a/opensora/models/ae/imagebase/vqvae/model.py
+++ /dev/null
@@ -1,775 +0,0 @@
-# pytorch_diffusion + derived encoder decoder
-import math
-import torch
-import torch.nn as nn
-import numpy as np
-
-
-def get_timestep_embedding(timesteps, embedding_dim):
- """
-    Build sinusoidal timestep embeddings. This matches the implementation in
-    Denoising Diffusion Probabilistic Models (from Fairseq) and in
-    tensor2tensor, but differs slightly from the description in Section 3.5
-    of "Attention Is All You Need".
- """
- assert len(timesteps.shape) == 1
-
- half_dim = embedding_dim // 2
- emb = math.log(10000) / (half_dim - 1)
- emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
- emb = emb.to(device=timesteps.device)
- emb = timesteps.float()[:, None] * emb[None, :]
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
- if embedding_dim % 2 == 1: # zero pad
- emb = torch.nn.functional.pad(emb, (0,1,0,0))
- return emb
-
-
-def nonlinearity(x):
- # swish
- return x*torch.sigmoid(x)
-
-
-def Normalize(in_channels):
- return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-class Upsample(nn.Module):
- def __init__(self, in_channels, with_conv):
- super().__init__()
- self.with_conv = with_conv
- if self.with_conv:
- self.conv = torch.nn.Conv2d(in_channels,
- in_channels,
- kernel_size=3,
- stride=1,
- padding=1)
-
- def forward(self, x):
- x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
- if self.with_conv:
- x = self.conv(x)
- return x
-
-
-class Downsample(nn.Module):
- def __init__(self, in_channels, with_conv):
- super().__init__()
- self.with_conv = with_conv
- if self.with_conv:
- # no asymmetric padding in torch conv, must do it ourselves
- self.conv = torch.nn.Conv2d(in_channels,
- in_channels,
- kernel_size=3,
- stride=2,
- padding=0)
-
- def forward(self, x):
- if self.with_conv:
- pad = (0,1,0,1)
- x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
- x = self.conv(x)
- else:
- x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
- return x
-
-
-class ResnetBlock(nn.Module):
- def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
- dropout, temb_channels=512):
- super().__init__()
- self.in_channels = in_channels
- out_channels = in_channels if out_channels is None else out_channels
- self.out_channels = out_channels
- self.use_conv_shortcut = conv_shortcut
-
- self.norm1 = Normalize(in_channels)
- self.conv1 = torch.nn.Conv2d(in_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
- if temb_channels > 0:
- self.temb_proj = torch.nn.Linear(temb_channels,
- out_channels)
- self.norm2 = Normalize(out_channels)
- self.dropout = torch.nn.Dropout(dropout)
- self.conv2 = torch.nn.Conv2d(out_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
- if self.in_channels != self.out_channels:
- if self.use_conv_shortcut:
- self.conv_shortcut = torch.nn.Conv2d(in_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
- else:
- self.nin_shortcut = torch.nn.Conv2d(in_channels,
- out_channels,
- kernel_size=1,
- stride=1,
- padding=0)
-
- def forward(self, x, temb):
- h = x
- h = self.norm1(h)
- h = nonlinearity(h)
- h = self.conv1(h)
-
- if temb is not None:
- h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
-
- h = self.norm2(h)
- h = nonlinearity(h)
- h = self.dropout(h)
- h = self.conv2(h)
-
- if self.in_channels != self.out_channels:
- if self.use_conv_shortcut:
- x = self.conv_shortcut(x)
- else:
- x = self.nin_shortcut(x)
-
- return x+h
-
-
-class AttnBlock(nn.Module):
- def __init__(self, in_channels):
- super().__init__()
- self.in_channels = in_channels
-
- self.norm = Normalize(in_channels)
- self.q = torch.nn.Conv2d(in_channels,
- in_channels,
- kernel_size=1,
- stride=1,
- padding=0)
- self.k = torch.nn.Conv2d(in_channels,
- in_channels,
- kernel_size=1,
- stride=1,
- padding=0)
- self.v = torch.nn.Conv2d(in_channels,
- in_channels,
- kernel_size=1,
- stride=1,
- padding=0)
- self.proj_out = torch.nn.Conv2d(in_channels,
- in_channels,
- kernel_size=1,
- stride=1,
- padding=0)
-
-
- def forward(self, x):
- h_ = x
- h_ = self.norm(h_)
- q = self.q(h_)
- k = self.k(h_)
- v = self.v(h_)
-
- # compute attention
- b,c,h,w = q.shape
- q = q.reshape(b,c,h*w)
- q = q.permute(0,2,1) # b,hw,c
- k = k.reshape(b,c,h*w) # b,c,hw
- w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
- w_ = w_ * (int(c)**(-0.5))
- w_ = torch.nn.functional.softmax(w_, dim=2)
-
- # attend to values
- v = v.reshape(b,c,h*w)
- w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
- h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
- h_ = h_.reshape(b,c,h,w)
-
- h_ = self.proj_out(h_)
-
- return x+h_
-
-
-class Model(nn.Module):
- def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
- attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
- resolution, use_timestep=True):
- super().__init__()
- self.ch = ch
- self.temb_ch = self.ch*4
- self.num_resolutions = len(ch_mult)
- self.num_res_blocks = num_res_blocks
- self.resolution = resolution
- self.in_channels = in_channels
-
- self.use_timestep = use_timestep
- if self.use_timestep:
- # timestep embedding
- self.temb = nn.Module()
- self.temb.dense = nn.ModuleList([
- torch.nn.Linear(self.ch,
- self.temb_ch),
- torch.nn.Linear(self.temb_ch,
- self.temb_ch),
- ])
-
- # downsampling
- self.conv_in = torch.nn.Conv2d(in_channels,
- self.ch,
- kernel_size=3,
- stride=1,
- padding=1)
-
- curr_res = resolution
- in_ch_mult = (1,)+tuple(ch_mult)
- self.down = nn.ModuleList()
- for i_level in range(self.num_resolutions):
- block = nn.ModuleList()
- attn = nn.ModuleList()
- block_in = ch*in_ch_mult[i_level]
- block_out = ch*ch_mult[i_level]
- for i_block in range(self.num_res_blocks):
- block.append(ResnetBlock(in_channels=block_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- if curr_res in attn_resolutions:
- attn.append(AttnBlock(block_in))
- down = nn.Module()
- down.block = block
- down.attn = attn
- if i_level != self.num_resolutions-1:
- down.downsample = Downsample(block_in, resamp_with_conv)
- curr_res = curr_res // 2
- self.down.append(down)
-
- # middle
- self.mid = nn.Module()
- self.mid.block_1 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
- self.mid.attn_1 = AttnBlock(block_in)
- self.mid.block_2 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
-
- # upsampling
- self.up = nn.ModuleList()
- for i_level in reversed(range(self.num_resolutions)):
- block = nn.ModuleList()
- attn = nn.ModuleList()
- block_out = ch*ch_mult[i_level]
- skip_in = ch*ch_mult[i_level]
- for i_block in range(self.num_res_blocks+1):
- if i_block == self.num_res_blocks:
- skip_in = ch*in_ch_mult[i_level]
- block.append(ResnetBlock(in_channels=block_in+skip_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- if curr_res in attn_resolutions:
- attn.append(AttnBlock(block_in))
- up = nn.Module()
- up.block = block
- up.attn = attn
- if i_level != 0:
- up.upsample = Upsample(block_in, resamp_with_conv)
- curr_res = curr_res * 2
- self.up.insert(0, up) # prepend to get consistent order
-
- # end
- self.norm_out = Normalize(block_in)
- self.conv_out = torch.nn.Conv2d(block_in,
- out_ch,
- kernel_size=3,
- stride=1,
- padding=1)
-
-
- def forward(self, x, t=None):
- #assert x.shape[2] == x.shape[3] == self.resolution
-
- if self.use_timestep:
- # timestep embedding
- assert t is not None
- temb = get_timestep_embedding(t, self.ch)
- temb = self.temb.dense[0](temb)
- temb = nonlinearity(temb)
- temb = self.temb.dense[1](temb)
- else:
- temb = None
-
- # downsampling
- hs = [self.conv_in(x)]
- for i_level in range(self.num_resolutions):
- for i_block in range(self.num_res_blocks):
- h = self.down[i_level].block[i_block](hs[-1], temb)
- if len(self.down[i_level].attn) > 0:
- h = self.down[i_level].attn[i_block](h)
- hs.append(h)
- if i_level != self.num_resolutions-1:
- hs.append(self.down[i_level].downsample(hs[-1]))
-
- # middle
- h = hs[-1]
- h = self.mid.block_1(h, temb)
- h = self.mid.attn_1(h)
- h = self.mid.block_2(h, temb)
-
- # upsampling
- for i_level in reversed(range(self.num_resolutions)):
- for i_block in range(self.num_res_blocks+1):
- h = self.up[i_level].block[i_block](
- torch.cat([h, hs.pop()], dim=1), temb)
- if len(self.up[i_level].attn) > 0:
- h = self.up[i_level].attn[i_block](h)
- if i_level != 0:
- h = self.up[i_level].upsample(h)
-
- # end
- h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h)
- return h
-
-
-class Encoder(nn.Module):
- def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
- attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
- resolution, z_channels, double_z=True, **ignore_kwargs):
- super().__init__()
- self.ch = ch
- self.temb_ch = 0
- self.num_resolutions = len(ch_mult)
- self.num_res_blocks = num_res_blocks
- self.resolution = resolution
- self.in_channels = in_channels
-
- # downsampling
- self.conv_in = torch.nn.Conv2d(in_channels,
- self.ch,
- kernel_size=3,
- stride=1,
- padding=1)
-
- curr_res = resolution
- in_ch_mult = (1,)+tuple(ch_mult)
- self.down = nn.ModuleList()
- for i_level in range(self.num_resolutions):
- block = nn.ModuleList()
- attn = nn.ModuleList()
- block_in = ch*in_ch_mult[i_level]
- block_out = ch*ch_mult[i_level]
- for i_block in range(self.num_res_blocks):
- block.append(ResnetBlock(in_channels=block_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- if curr_res in attn_resolutions:
- attn.append(AttnBlock(block_in))
- down = nn.Module()
- down.block = block
- down.attn = attn
- if i_level != self.num_resolutions-1:
- down.downsample = Downsample(block_in, resamp_with_conv)
- curr_res = curr_res // 2
- self.down.append(down)
-
- # middle
- self.mid = nn.Module()
- self.mid.block_1 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
- self.mid.attn_1 = AttnBlock(block_in)
- self.mid.block_2 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
-
- # end
- self.norm_out = Normalize(block_in)
- self.conv_out = torch.nn.Conv2d(block_in,
- 2*z_channels if double_z else z_channels,
- kernel_size=3,
- stride=1,
- padding=1)
-
-
- def forward(self, x):
- #assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
-
- # timestep embedding
- temb = None
-
- # downsampling
- hs = [self.conv_in(x)]
- for i_level in range(self.num_resolutions):
- for i_block in range(self.num_res_blocks):
- h = self.down[i_level].block[i_block](hs[-1], temb)
- if len(self.down[i_level].attn) > 0:
- h = self.down[i_level].attn[i_block](h)
- hs.append(h)
- if i_level != self.num_resolutions-1:
- hs.append(self.down[i_level].downsample(hs[-1]))
-
- # middle
- h = hs[-1]
- h = self.mid.block_1(h, temb)
- h = self.mid.attn_1(h)
- h = self.mid.block_2(h, temb)
-
- # end
- h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h)
- return h
-
-
-class Decoder(nn.Module):
- def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
- attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
- resolution, z_channels, give_pre_end=False, **ignorekwargs):
- super().__init__()
- self.ch = ch
- self.temb_ch = 0
- self.num_resolutions = len(ch_mult)
- self.num_res_blocks = num_res_blocks
- self.resolution = resolution
- self.in_channels = in_channels
- self.give_pre_end = give_pre_end
-
- # compute in_ch_mult, block_in and curr_res at lowest res
- in_ch_mult = (1,)+tuple(ch_mult)
- block_in = ch*ch_mult[self.num_resolutions-1]
- curr_res = resolution // 2**(self.num_resolutions-1)
- self.z_shape = (1,z_channels,curr_res,curr_res)
- print("Working with z of shape {} = {} dimensions.".format(
- self.z_shape, np.prod(self.z_shape)))
-
- # z to block_in
- self.conv_in = torch.nn.Conv2d(z_channels,
- block_in,
- kernel_size=3,
- stride=1,
- padding=1)
-
- # middle
- self.mid = nn.Module()
- self.mid.block_1 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
- self.mid.attn_1 = AttnBlock(block_in)
- self.mid.block_2 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
-
- # upsampling
- self.up = nn.ModuleList()
- for i_level in reversed(range(self.num_resolutions)):
- block = nn.ModuleList()
- attn = nn.ModuleList()
- block_out = ch*ch_mult[i_level]
- for i_block in range(self.num_res_blocks+1):
- block.append(ResnetBlock(in_channels=block_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- if curr_res in attn_resolutions:
- attn.append(AttnBlock(block_in))
- up = nn.Module()
- up.block = block
- up.attn = attn
- if i_level != 0:
- up.upsample = Upsample(block_in, resamp_with_conv)
- curr_res = curr_res * 2
- self.up.insert(0, up) # prepend to get consistent order
-
- # end
- self.norm_out = Normalize(block_in)
- self.conv_out = torch.nn.Conv2d(block_in,
- out_ch,
- kernel_size=3,
- stride=1,
- padding=1)
-
- def forward(self, z):
- #assert z.shape[1:] == self.z_shape[1:]
- self.last_z_shape = z.shape
-
- # timestep embedding
- temb = None
-
- # z to block_in
- h = self.conv_in(z)
-
- # middle
- h = self.mid.block_1(h, temb)
- h = self.mid.attn_1(h)
- h = self.mid.block_2(h, temb)
-
- # upsampling
- for i_level in reversed(range(self.num_resolutions)):
- for i_block in range(self.num_res_blocks+1):
- h = self.up[i_level].block[i_block](h, temb)
- if len(self.up[i_level].attn) > 0:
- h = self.up[i_level].attn[i_block](h)
- if i_level != 0:
- h = self.up[i_level].upsample(h)
-
- # end
- if self.give_pre_end:
- return h
-
- h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h)
- return h
-
-
-class VUNet(nn.Module):
- def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
- attn_resolutions, dropout=0.0, resamp_with_conv=True,
- in_channels, c_channels,
- resolution, z_channels, use_timestep=False, **ignore_kwargs):
- super().__init__()
- self.ch = ch
- self.temb_ch = self.ch*4
- self.num_resolutions = len(ch_mult)
- self.num_res_blocks = num_res_blocks
- self.resolution = resolution
-
- self.use_timestep = use_timestep
- if self.use_timestep:
- # timestep embedding
- self.temb = nn.Module()
- self.temb.dense = nn.ModuleList([
- torch.nn.Linear(self.ch,
- self.temb_ch),
- torch.nn.Linear(self.temb_ch,
- self.temb_ch),
- ])
-
- # downsampling
- self.conv_in = torch.nn.Conv2d(c_channels,
- self.ch,
- kernel_size=3,
- stride=1,
- padding=1)
-
- curr_res = resolution
- in_ch_mult = (1,)+tuple(ch_mult)
- self.down = nn.ModuleList()
- for i_level in range(self.num_resolutions):
- block = nn.ModuleList()
- attn = nn.ModuleList()
- block_in = ch*in_ch_mult[i_level]
- block_out = ch*ch_mult[i_level]
- for i_block in range(self.num_res_blocks):
- block.append(ResnetBlock(in_channels=block_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- if curr_res in attn_resolutions:
- attn.append(AttnBlock(block_in))
- down = nn.Module()
- down.block = block
- down.attn = attn
- if i_level != self.num_resolutions-1:
- down.downsample = Downsample(block_in, resamp_with_conv)
- curr_res = curr_res // 2
- self.down.append(down)
-
- self.z_in = torch.nn.Conv2d(z_channels,
- block_in,
- kernel_size=1,
- stride=1,
- padding=0)
- # middle
- self.mid = nn.Module()
- self.mid.block_1 = ResnetBlock(in_channels=2*block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
- self.mid.attn_1 = AttnBlock(block_in)
- self.mid.block_2 = ResnetBlock(in_channels=block_in,
- out_channels=block_in,
- temb_channels=self.temb_ch,
- dropout=dropout)
-
- # upsampling
- self.up = nn.ModuleList()
- for i_level in reversed(range(self.num_resolutions)):
- block = nn.ModuleList()
- attn = nn.ModuleList()
- block_out = ch*ch_mult[i_level]
- skip_in = ch*ch_mult[i_level]
- for i_block in range(self.num_res_blocks+1):
- if i_block == self.num_res_blocks:
- skip_in = ch*in_ch_mult[i_level]
- block.append(ResnetBlock(in_channels=block_in+skip_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- if curr_res in attn_resolutions:
- attn.append(AttnBlock(block_in))
- up = nn.Module()
- up.block = block
- up.attn = attn
- if i_level != 0:
- up.upsample = Upsample(block_in, resamp_with_conv)
- curr_res = curr_res * 2
- self.up.insert(0, up) # prepend to get consistent order
-
- # end
- self.norm_out = Normalize(block_in)
- self.conv_out = torch.nn.Conv2d(block_in,
- out_ch,
- kernel_size=3,
- stride=1,
- padding=1)
-
-
-    def forward(self, x, z, t=None):
- #assert x.shape[2] == x.shape[3] == self.resolution
-
- if self.use_timestep:
- # timestep embedding
- assert t is not None
- temb = get_timestep_embedding(t, self.ch)
- temb = self.temb.dense[0](temb)
- temb = nonlinearity(temb)
- temb = self.temb.dense[1](temb)
- else:
- temb = None
-
- # downsampling
- hs = [self.conv_in(x)]
- for i_level in range(self.num_resolutions):
- for i_block in range(self.num_res_blocks):
- h = self.down[i_level].block[i_block](hs[-1], temb)
- if len(self.down[i_level].attn) > 0:
- h = self.down[i_level].attn[i_block](h)
- hs.append(h)
- if i_level != self.num_resolutions-1:
- hs.append(self.down[i_level].downsample(hs[-1]))
-
- # middle
- h = hs[-1]
- z = self.z_in(z)
- h = torch.cat((h,z),dim=1)
- h = self.mid.block_1(h, temb)
- h = self.mid.attn_1(h)
- h = self.mid.block_2(h, temb)
-
- # upsampling
- for i_level in reversed(range(self.num_resolutions)):
- for i_block in range(self.num_res_blocks+1):
- h = self.up[i_level].block[i_block](
- torch.cat([h, hs.pop()], dim=1), temb)
- if len(self.up[i_level].attn) > 0:
- h = self.up[i_level].attn[i_block](h)
- if i_level != 0:
- h = self.up[i_level].upsample(h)
-
- # end
- h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h)
- return h
-
-
-class SimpleDecoder(nn.Module):
- def __init__(self, in_channels, out_channels, *args, **kwargs):
- super().__init__()
- self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
- ResnetBlock(in_channels=in_channels,
- out_channels=2 * in_channels,
- temb_channels=0, dropout=0.0),
- ResnetBlock(in_channels=2 * in_channels,
- out_channels=4 * in_channels,
- temb_channels=0, dropout=0.0),
- ResnetBlock(in_channels=4 * in_channels,
- out_channels=2 * in_channels,
- temb_channels=0, dropout=0.0),
- nn.Conv2d(2*in_channels, in_channels, 1),
- Upsample(in_channels, with_conv=True)])
- # end
- self.norm_out = Normalize(in_channels)
- self.conv_out = torch.nn.Conv2d(in_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
-
- def forward(self, x):
- for i, layer in enumerate(self.model):
- if i in [1,2,3]:
- x = layer(x, None)
- else:
- x = layer(x)
-
- h = self.norm_out(x)
- h = nonlinearity(h)
- x = self.conv_out(h)
- return x
-
-
-class UpsampleDecoder(nn.Module):
- def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
- ch_mult=(2,2), dropout=0.0):
- super().__init__()
- # upsampling
- self.temb_ch = 0
- self.num_resolutions = len(ch_mult)
- self.num_res_blocks = num_res_blocks
- block_in = in_channels
- curr_res = resolution // 2 ** (self.num_resolutions - 1)
- self.res_blocks = nn.ModuleList()
- self.upsample_blocks = nn.ModuleList()
- for i_level in range(self.num_resolutions):
- res_block = []
- block_out = ch * ch_mult[i_level]
- for i_block in range(self.num_res_blocks + 1):
- res_block.append(ResnetBlock(in_channels=block_in,
- out_channels=block_out,
- temb_channels=self.temb_ch,
- dropout=dropout))
- block_in = block_out
- self.res_blocks.append(nn.ModuleList(res_block))
- if i_level != self.num_resolutions - 1:
- self.upsample_blocks.append(Upsample(block_in, True))
- curr_res = curr_res * 2
-
- # end
- self.norm_out = Normalize(block_in)
- self.conv_out = torch.nn.Conv2d(block_in,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
-
- def forward(self, x):
- # upsampling
- h = x
- for k, i_level in enumerate(range(self.num_resolutions)):
- for i_block in range(self.num_res_blocks + 1):
- h = self.res_blocks[i_level][i_block](h, None)
- if i_level != self.num_resolutions - 1:
- h = self.upsample_blocks[k](h)
- h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h)
- return h
\ No newline at end of file
diff --git a/opensora/models/ae/imagebase/vqvae/quantize.py b/opensora/models/ae/imagebase/vqvae/quantize.py
deleted file mode 100644
index 148062e9e..000000000
--- a/opensora/models/ae/imagebase/vqvae/quantize.py
+++ /dev/null
@@ -1,447 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-from torch import einsum
-from einops import rearrange
-
-
-class VectorQuantizer(nn.Module):
- """
- see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
- ____________________________________________
- Discretization bottleneck part of the VQ-VAE.
- Inputs:
- - n_e : number of embeddings
- - e_dim : dimension of embedding
- - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
- _____________________________________________
- """
-
- # NOTE: this class contains a bug regarding beta; see VectorQuantizer2 for
- # a fix and use legacy=False to apply that fix. VectorQuantizer2 can be
- # used wherever VectorQuantizer has been used before and is additionally
- # more efficient.
- def __init__(self, n_e, e_dim, beta):
- super(VectorQuantizer, self).__init__()
- self.n_e = n_e
- self.e_dim = e_dim
- self.beta = beta
-
- self.embedding = nn.Embedding(self.n_e, self.e_dim)
- self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
-
- def forward(self, z):
- """
- Inputs the output of the encoder network z and maps it to a discrete
- one-hot vector that is the index of the closest embedding vector e_j
- z (continuous) -> z_q (discrete)
- z.shape = (batch, channel, height, width)
- quantization pipeline:
- 1. get encoder input (B,C,H,W)
- 2. flatten input to (B*H*W,C)
- """
- # reshape z -> (batch, height, width, channel) and flatten
- z = z.permute(0, 2, 3, 1).contiguous()
- z_flattened = z.view(-1, self.e_dim)
- # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-
- d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
- torch.sum(self.embedding.weight ** 2, dim=1) - 2 * \
- torch.matmul(z_flattened, self.embedding.weight.t())
-
-        ## could possibly replace this here
- # #\start...
- # find closest encodings
- min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
-
- min_encodings = torch.zeros(
- min_encoding_indices.shape[0], self.n_e).to(z)
- min_encodings.scatter_(1, min_encoding_indices, 1)
-
- # dtype min encodings: torch.float32
- # min_encodings shape: torch.Size([2048, 512])
- # min_encoding_indices.shape: torch.Size([2048, 1])
-
- # get quantized latent vectors
- z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
- # .........\end
-
- # with:
- # .........\start
- # min_encoding_indices = torch.argmin(d, dim=1)
- # z_q = self.embedding(min_encoding_indices)
- # ......\end......... (TODO)
-
- # compute loss for embedding
- loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * \
- torch.mean((z_q - z.detach()) ** 2)
-
- # preserve gradients
- z_q = z + (z_q - z).detach()
-
- # perplexity
- e_mean = torch.mean(min_encodings, dim=0)
- perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
-
- # reshape back to match original input shape
- z_q = z_q.permute(0, 3, 1, 2).contiguous()
-
- return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
-
- def get_codebook_entry(self, indices, shape):
- # shape specifying (batch, height, width, channel)
- # TODO: check for more easy handling with nn.Embedding
- min_encodings = torch.zeros(indices.shape[0], self.n_e).to(indices)
- min_encodings.scatter_(1, indices[:, None], 1)
-
- # get quantized latent vectors
- z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
-
- if shape is not None:
- z_q = z_q.view(shape)
-
- # reshape back to match original input shape
- z_q = z_q.permute(0, 3, 1, 2).contiguous()
-
- return z_q
-
-
-class GumbelQuantize(nn.Module):
- """
- credit to @karpathy: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py (thanks!)
- Gumbel Softmax trick quantizer
- Categorical Reparameterization with Gumbel-Softmax, Jang et al. 2016
- https://arxiv.org/abs/1611.01144
- """
-
- def __init__(self, num_hiddens, embedding_dim, n_embed, straight_through=True,
- kl_weight=5e-4, temp_init=1.0, use_vqinterface=True,
- remap=None, unknown_index="random"):
- super().__init__()
-
- self.embedding_dim = embedding_dim
- self.n_embed = n_embed
-
- self.straight_through = straight_through
- self.temperature = temp_init
- self.kl_weight = kl_weight
-
- self.proj = nn.Conv2d(num_hiddens, n_embed, 1)
- self.embed = nn.Embedding(n_embed, embedding_dim)
-
- self.use_vqinterface = use_vqinterface
-
- self.remap = remap
- if self.remap is not None:
- self.register_buffer("used", torch.tensor(np.load(self.remap)))
- self.re_embed = self.used.shape[0]
- self.unknown_index = unknown_index # "random" or "extra" or integer
- if self.unknown_index == "extra":
- self.unknown_index = self.re_embed
- self.re_embed = self.re_embed + 1
- print(f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
- f"Using {self.unknown_index} for unknown indices.")
- else:
- self.re_embed = n_embed
-
- def remap_to_used(self, inds):
- ishape = inds.shape
- assert len(ishape) > 1
- inds = inds.reshape(ishape[0], -1)
- used = self.used.to(inds)
- match = (inds[:, :, None] == used[None, None, ...]).long()
- new = match.argmax(-1)
- unknown = match.sum(2) < 1
- if self.unknown_index == "random":
- new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
- else:
- new[unknown] = self.unknown_index
- return new.reshape(ishape)
-
- def unmap_to_all(self, inds):
- ishape = inds.shape
- assert len(ishape) > 1
- inds = inds.reshape(ishape[0], -1)
- used = self.used.to(inds)
- if self.re_embed > self.used.shape[0]: # extra token
- inds[inds >= self.used.shape[0]] = 0 # simply set to zero
- back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
- return back.reshape(ishape)
-
- def forward(self, z, temp=None, return_logits=False):
- # force hard = True when we are in eval mode, as we must quantize. actually, always true seems to work
- hard = self.straight_through if self.training else True
- temp = self.temperature if temp is None else temp
-
- logits = self.proj(z)
- if self.remap is not None:
- # continue only with used logits
- full_zeros = torch.zeros_like(logits)
- logits = logits[:, self.used, ...]
-
- soft_one_hot = F.gumbel_softmax(logits, tau=temp, dim=1, hard=hard)
- if self.remap is not None:
- # go back to all entries but unused set to zero
- full_zeros[:, self.used, ...] = soft_one_hot
- soft_one_hot = full_zeros
- z_q = einsum('b n h w, n d -> b d h w', soft_one_hot, self.embed.weight)
-
- # + kl divergence to the prior loss
- qy = F.softmax(logits, dim=1)
- diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.n_embed + 1e-10), dim=1).mean()
-
- ind = soft_one_hot.argmax(dim=1)
- if self.remap is not None:
- ind = self.remap_to_used(ind)
- if self.use_vqinterface:
- if return_logits:
- return z_q, diff, (None, None, ind), logits
- return z_q, diff, (None, None, ind)
- return z_q, diff, ind
-
- def get_codebook_entry(self, indices, shape):
- b, h, w, c = shape
- assert b * h * w == indices.shape[0]
- indices = rearrange(indices, '(b h w) -> b h w', b=b, h=h, w=w)
- if self.remap is not None:
- indices = self.unmap_to_all(indices)
- one_hot = F.one_hot(indices, num_classes=self.n_embed).permute(0, 3, 1, 2).float()
- z_q = einsum('b n h w, n d -> b d h w', one_hot, self.embed.weight)
- return z_q
-
-
-class VectorQuantizer2(nn.Module):
- """
- Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
- avoids costly matrix multiplications and allows for post-hoc remapping of indices.
- """
-
- # NOTE: due to a bug the beta term was applied to the wrong term. for
- # backwards compatibility we use the buggy version by default, but you can
- # specify legacy=False to fix it.
- def __init__(self, n_e, e_dim, beta, remap=None, unknown_index="random",
- sane_index_shape=False, legacy=True):
- super().__init__()
- self.n_e = n_e
- self.e_dim = e_dim
- self.beta = beta
- self.legacy = legacy
-
- self.embedding = nn.Embedding(self.n_e, self.e_dim)
- self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
-
- self.remap = remap
- if self.remap is not None:
- self.register_buffer("used", torch.tensor(np.load(self.remap)))
- self.re_embed = self.used.shape[0]
- self.unknown_index = unknown_index # "random" or "extra" or integer
- if self.unknown_index == "extra":
- self.unknown_index = self.re_embed
- self.re_embed = self.re_embed + 1
- print(f"Remapping {self.n_e} indices to {self.re_embed} indices. "
- f"Using {self.unknown_index} for unknown indices.")
- else:
- self.re_embed = n_e
-
- self.sane_index_shape = sane_index_shape
-
- def remap_to_used(self, inds):
- ishape = inds.shape
- assert len(ishape) > 1
- inds = inds.reshape(ishape[0], -1)
- used = self.used.to(inds)
- match = (inds[:, :, None] == used[None, None, ...]).long()
- new = match.argmax(-1)
- unknown = match.sum(2) < 1
- if self.unknown_index == "random":
- new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
- else:
- new[unknown] = self.unknown_index
- return new.reshape(ishape)
-
- def unmap_to_all(self, inds):
- ishape = inds.shape
- assert len(ishape) > 1
- inds = inds.reshape(ishape[0], -1)
- used = self.used.to(inds)
- if self.re_embed > self.used.shape[0]: # extra token
- inds[inds >= self.used.shape[0]] = 0 # simply set to zero
- back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
- return back.reshape(ishape)
-
- def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
- assert temp is None or temp == 1.0, "Only for interface compatible with Gumbel"
- assert rescale_logits == False, "Only for interface compatible with Gumbel"
- assert return_logits == False, "Only for interface compatible with Gumbel"
- # reshape z -> (batch, height, width, channel) and flatten
- z = rearrange(z, 'b c h w -> b h w c').contiguous()
- z_flattened = z.view(-1, self.e_dim)
- # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-
- d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
- torch.sum(self.embedding.weight ** 2, dim=1) - 2 * \
- torch.einsum('bd,dn->bn', z_flattened, rearrange(self.embedding.weight, 'n d -> d n'))
-
- min_encoding_indices = torch.argmin(d, dim=1)
- z_q = self.embedding(min_encoding_indices).view(z.shape)
- perplexity = None
- min_encodings = None
-
- # compute loss for embedding
- if not self.legacy:
- loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + \
- torch.mean((z_q - z.detach()) ** 2)
- else:
- loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * \
- torch.mean((z_q - z.detach()) ** 2)
-
- # preserve gradients
- z_q = z + (z_q - z).detach()
-
- # reshape back to match original input shape
- z_q = rearrange(z_q, 'b h w c -> b c h w').contiguous()
-
- if self.remap is not None:
- min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis
- min_encoding_indices = self.remap_to_used(min_encoding_indices)
- min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten
-
- if self.sane_index_shape:
- min_encoding_indices = min_encoding_indices.reshape(
- z_q.shape[0], z_q.shape[2], z_q.shape[3])
-
- return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
-
- def get_codebook_entry(self, indices, shape):
- # shape specifying (batch, height, width, channel)
- if self.remap is not None:
- indices = indices.reshape(shape[0], -1) # add batch axis
- indices = self.unmap_to_all(indices)
- indices = indices.reshape(-1) # flatten again
-
- # get quantized latent vectors
- z_q = self.embedding(indices)
-
- if shape is not None:
- z_q = z_q.view(shape)
- # reshape back to match original input shape
- z_q = z_q.permute(0, 3, 1, 2).contiguous()
-
- return z_q
-
-
-class EmbeddingEMA(nn.Module):
- def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5):
- super().__init__()
- self.decay = decay
- self.eps = eps
- weight = torch.randn(num_tokens, codebook_dim)
- self.weight = nn.Parameter(weight, requires_grad=False)
- self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad=False)
- self.embed_avg = nn.Parameter(weight.clone(), requires_grad=False)
- self.update = True
-
- def forward(self, embed_id):
- return F.embedding(embed_id, self.weight)
-
- def cluster_size_ema_update(self, new_cluster_size):
- self.cluster_size.data.mul_(self.decay).add_(new_cluster_size, alpha=1 - self.decay)
-
- def embed_avg_ema_update(self, new_embed_avg):
- self.embed_avg.data.mul_(self.decay).add_(new_embed_avg, alpha=1 - self.decay)
-
- def weight_update(self, num_tokens):
- n = self.cluster_size.sum()
- smoothed_cluster_size = (
- (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n
- )
- # normalize embedding average with smoothed cluster size
- embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1)
- self.weight.data.copy_(embed_normalized)
-
-
-class EMAVectorQuantizer(nn.Module):
- def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5,
- remap=None, unknown_index="random"):
- super().__init__()
- self.codebook_dim = embedding_dim
- self.num_tokens = n_embed
- self.beta = beta
- self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps)
-
- self.remap = remap
- if self.remap is not None:
- self.register_buffer("used", torch.tensor(np.load(self.remap)))
- self.re_embed = self.used.shape[0]
- self.unknown_index = unknown_index # "random" or "extra" or integer
- if self.unknown_index == "extra":
- self.unknown_index = self.re_embed
- self.re_embed = self.re_embed + 1
- print(f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
- f"Using {self.unknown_index} for unknown indices.")
- else:
- self.re_embed = n_embed
-
- def remap_to_used(self, inds):
- ishape = inds.shape
- assert len(ishape) > 1
- inds = inds.reshape(ishape[0], -1)
- used = self.used.to(inds)
- match = (inds[:, :, None] == used[None, None, ...]).long()
- new = match.argmax(-1)
- unknown = match.sum(2) < 1
- if self.unknown_index == "random":
- new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
- else:
- new[unknown] = self.unknown_index
- return new.reshape(ishape)
-
- def unmap_to_all(self, inds):
- ishape = inds.shape
- assert len(ishape) > 1
- inds = inds.reshape(ishape[0], -1)
- used = self.used.to(inds)
- if self.re_embed > self.used.shape[0]: # extra token
- inds[inds >= self.used.shape[0]] = 0 # simply set to zero
- back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
- return back.reshape(ishape)
-
- def forward(self, z):
- # reshape z -> (batch, height, width, channel) and flatten
- # z, 'b c h w -> b h w c'
- z = rearrange(z, 'b c h w -> b h w c')
- z_flattened = z.reshape(-1, self.codebook_dim)
-
- # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
- d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
- self.embedding.weight.pow(2).sum(dim=1) - 2 * \
- torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n'
-
- encoding_indices = torch.argmin(d, dim=1)
-
- z_q = self.embedding(encoding_indices).view(z.shape)
- encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
- avg_probs = torch.mean(encodings, dim=0)
- perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
-
- if self.training and self.embedding.update:
- # EMA cluster size
- encodings_sum = encodings.sum(0)
- self.embedding.cluster_size_ema_update(encodings_sum)
- # EMA embedding average
- embed_sum = encodings.transpose(0, 1) @ z_flattened
- self.embedding.embed_avg_ema_update(embed_sum)
- # normalize embed_avg and update weight
- self.embedding.weight_update(self.num_tokens)
-
- # compute loss for embedding
- loss = self.beta * F.mse_loss(z_q.detach(), z)
-
- # preserve gradients
- z_q = z + (z_q - z).detach()
-
- # reshape back to match original input shape
- # z_q, 'b h w c -> b c h w'
- z_q = rearrange(z_q, 'b h w c -> b c h w')
- return z_q, loss, (perplexity, encodings, encoding_indices)
\ No newline at end of file
diff --git a/opensora/models/ae/imagebase/vqvae/vqgan.py b/opensora/models/ae/imagebase/vqvae/vqgan.py
deleted file mode 100644
index 9e9125be1..000000000
--- a/opensora/models/ae/imagebase/vqvae/vqgan.py
+++ /dev/null
@@ -1,419 +0,0 @@
-import torch
-import torch.nn.functional as F
-import pytorch_lightning as pl
-import argparse, os, sys, datetime, glob, importlib
-
-from .model import Encoder, Decoder
-from .quantize import VectorQuantizer2 as VectorQuantizer
-from .quantize import GumbelQuantize
-from .quantize import EMAVectorQuantizer
-
-
-
-def get_obj_from_str(string, reload=False):
- module, cls = string.rsplit(".", 1)
- if reload:
- module_imp = importlib.import_module(module)
- importlib.reload(module_imp)
- return getattr(importlib.import_module(module, package=None), cls)
-
-
-def instantiate_from_config(config):
- if not "target" in config:
- raise KeyError("Expected key `target` to instantiate.")
- return get_obj_from_str(config["target"])(**config.get("params", dict()))
-
-
-class VQModel(pl.LightningModule):
- def __init__(self,
- ddconfig,
- lossconfig,
- n_embed,
- embed_dim,
- ckpt_path=None,
- ignore_keys=[],
- image_key="image",
- colorize_nlabels=None,
- monitor=None,
- remap=None,
- sane_index_shape=False, # tell vector quantizer to return indices as bhw
- ):
- super().__init__()
- self.image_key = image_key
- self.encoder = Encoder(**ddconfig)
- self.decoder = Decoder(**ddconfig)
- self.loss = instantiate_from_config(lossconfig)
- self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
- remap=remap, sane_index_shape=sane_index_shape)
- self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
- self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
- if ckpt_path is not None:
- self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
- self.image_key = image_key
- if colorize_nlabels is not None:
- assert type(colorize_nlabels)==int
- self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
- if monitor is not None:
- self.monitor = monitor
-
- def init_from_ckpt(self, path, ignore_keys=list()):
- sd = torch.load(path, map_location="cpu")["state_dict"]
- keys = list(sd.keys())
- for k in keys:
- for ik in ignore_keys:
- if k.startswith(ik):
- print("Deleting key {} from state_dict.".format(k))
- del sd[k]
- self.load_state_dict(sd, strict=False)
- print(f"Restored from {path}")
-
- def encode(self, x):
- h = self.encoder(x)
- h = self.quant_conv(h)
- quant, emb_loss, info = self.quantize(h)
- return quant, emb_loss, info
-
- def decode(self, quant):
- quant = self.post_quant_conv(quant)
- dec = self.decoder(quant)
- return dec
-
- def decode_code(self, code_b):
- quant_b = self.quantize.embed_code(code_b)
- dec = self.decode(quant_b)
- return dec
-
- def forward(self, input):
- quant, diff, _ = self.encode(input)
- dec = self.decode(quant)
- return dec, diff
-
- def get_input(self, batch, k):
- x = batch[k]
- if len(x.shape) == 3:
- x = x[..., None]
- x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
- return x.float()
-
- def training_step(self, batch, batch_idx, optimizer_idx):
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
-
- if optimizer_idx == 0:
- # autoencode
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
- last_layer=self.get_last_layer(), split="train")
-
- self.log("train/aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
- self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- return aeloss
-
- if optimizer_idx == 1:
- # discriminator
- discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
- last_layer=self.get_last_layer(), split="train")
- self.log("train/discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
- self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- return discloss
-
- def validation_step(self, batch, batch_idx):
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0, self.global_step,
- last_layer=self.get_last_layer(), split="val")
-
- discloss, log_dict_disc = self.loss(qloss, x, xrec, 1, self.global_step,
- last_layer=self.get_last_layer(), split="val")
- rec_loss = log_dict_ae["val/rec_loss"]
- self.log("val/rec_loss", rec_loss,
- prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
- self.log("val/aeloss", aeloss,
- prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
- self.log_dict(log_dict_ae)
- self.log_dict(log_dict_disc)
- return log_dict_ae
-
- def configure_optimizers(self):
- lr = self.learning_rate
- opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
- list(self.decoder.parameters())+
- list(self.quantize.parameters())+
- list(self.quant_conv.parameters())+
- list(self.post_quant_conv.parameters()),
- lr=lr, betas=(0.5, 0.9))
- opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
- lr=lr, betas=(0.5, 0.9))
- return [opt_ae, opt_disc], []
-
- def get_last_layer(self):
- return self.decoder.conv_out.weight
-
- def log_images(self, batch, **kwargs):
- log = dict()
- x = self.get_input(batch, self.image_key)
- x = x.to(self.device)
- xrec, _ = self(x)
- if x.shape[1] > 3:
- # colorize with random projection
- assert xrec.shape[1] > 3
- x = self.to_rgb(x)
- xrec = self.to_rgb(xrec)
- log["inputs"] = x
- log["reconstructions"] = xrec
- return log
-
- def to_rgb(self, x):
- assert self.image_key == "segmentation"
- if not hasattr(self, "colorize"):
- self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
- x = F.conv2d(x, weight=self.colorize)
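- # Rescale the randomly projected output to [-1, 1] for logging.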
- x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
- return x
-
-
-class VQSegmentationModel(VQModel):
- def __init__(self, n_labels, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.register_buffer("colorize", torch.randn(3, n_labels, 1, 1))
-
- def configure_optimizers(self):
- lr = self.learning_rate
- opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
- list(self.decoder.parameters())+
- list(self.quantize.parameters())+
- list(self.quant_conv.parameters())+
- list(self.post_quant_conv.parameters()),
- lr=lr, betas=(0.5, 0.9))
- return opt_ae
-
- def training_step(self, batch, batch_idx):
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="train")
- self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- return aeloss
-
- def validation_step(self, batch, batch_idx):
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="val")
- self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- total_loss = log_dict_ae["val/total_loss"]
- self.log("val/total_loss", total_loss,
- prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
- return aeloss
-
- @torch.no_grad()
- def log_images(self, batch, **kwargs):
- log = dict()
- x = self.get_input(batch, self.image_key)
- x = x.to(self.device)
- xrec, _ = self(x)
- if x.shape[1] > 3:
- # colorize with random projection
- assert xrec.shape[1] > 3
- # convert logits to indices
- xrec = torch.argmax(xrec, dim=1, keepdim=True)
- xrec = F.one_hot(xrec, num_classes=x.shape[1])
- xrec = xrec.squeeze(1).permute(0, 3, 1, 2).float()
- x = self.to_rgb(x)
- xrec = self.to_rgb(xrec)
- log["inputs"] = x
- log["reconstructions"] = xrec
- return log
-
-
-class VQNoDiscModel(VQModel):
- def __init__(self,
- ddconfig,
- lossconfig,
- n_embed,
- embed_dim,
- ckpt_path=None,
- ignore_keys=[],
- image_key="image",
- colorize_nlabels=None
- ):
- super().__init__(ddconfig=ddconfig, lossconfig=lossconfig, n_embed=n_embed, embed_dim=embed_dim,
- ckpt_path=ckpt_path, ignore_keys=ignore_keys, image_key=image_key,
- colorize_nlabels=colorize_nlabels)
-
- def training_step(self, batch, batch_idx):
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
- # autoencode
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="train")
- # pl.TrainResult was removed in PyTorch Lightning >= 1.0; log directly instead.
- self.log("train/aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
- self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- return aeloss
-
- def validation_step(self, batch, batch_idx):
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="val")
- rec_loss = log_dict_ae["val/rec_loss"]
- # pl.EvalResult was removed in PyTorch Lightning >= 1.0; log directly and let a
- # ModelCheckpoint callback monitor "val/rec_loss" for checkpointing.
- self.log("val/rec_loss", rec_loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
- self.log("val/aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
- self.log_dict(log_dict_ae)
- return log_dict_ae
-
- def configure_optimizers(self):
- optimizer = torch.optim.Adam(list(self.encoder.parameters())+
- list(self.decoder.parameters())+
- list(self.quantize.parameters())+
- list(self.quant_conv.parameters())+
- list(self.post_quant_conv.parameters()),
- lr=self.learning_rate, betas=(0.5, 0.9))
- return optimizer
-
-
-class GumbelVQ(VQModel):
- def __init__(self,
- ddconfig,
- lossconfig,
- n_embed,
- embed_dim,
- temperature_scheduler_config,
- ckpt_path=None,
- ignore_keys=[],
- image_key="image",
- colorize_nlabels=None,
- monitor=None,
- kl_weight=1e-8,
- remap=None,
- ):
-
- z_channels = ddconfig["z_channels"]
- super().__init__(ddconfig,
- lossconfig,
- n_embed,
- embed_dim,
- ckpt_path=None,
- ignore_keys=ignore_keys,
- image_key=image_key,
- colorize_nlabels=colorize_nlabels,
- monitor=monitor,
- )
-
- self.loss.n_classes = n_embed
- self.vocab_size = n_embed
-
- self.quantize = GumbelQuantize(z_channels, embed_dim,
- n_embed=n_embed,
- kl_weight=kl_weight, temp_init=1.0,
- remap=remap)
-
- self.temperature_scheduler = instantiate_from_config(temperature_scheduler_config) # annealing of temp
-
- if ckpt_path is not None:
- self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-
- def temperature_scheduling(self):
- self.quantize.temperature = self.temperature_scheduler(self.global_step)
-
- def encode_to_prequant(self, x):
- h = self.encoder(x)
- h = self.quant_conv(h)
- return h
-
- def decode_code(self, code_b):
- raise NotImplementedError
-
- def training_step(self, batch, batch_idx, optimizer_idx):
- self.temperature_scheduling()
- x = self.get_input(batch, self.image_key)
- xrec, qloss = self(x)
-
- if optimizer_idx == 0:
- # autoencode
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
- last_layer=self.get_last_layer(), split="train")
-
- self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- self.log("temperature", self.quantize.temperature, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- return aeloss
-
- if optimizer_idx == 1:
- # discriminator
- discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
- last_layer=self.get_last_layer(), split="train")
- self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
- return discloss
-
- def validation_step(self, batch, batch_idx):
- x = self.get_input(batch, self.image_key)
- # VQModel.forward does not accept return_pred_indices; call it plainly.
- xrec, qloss = self(x)
- aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0, self.global_step,
- last_layer=self.get_last_layer(), split="val")
-
- discloss, log_dict_disc = self.loss(qloss, x, xrec, 1, self.global_step,
- last_layer=self.get_last_layer(), split="val")
- rec_loss = log_dict_ae["val/rec_loss"]
- self.log("val/rec_loss", rec_loss,
- prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
- self.log("val/aeloss", aeloss,
- prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
- self.log_dict(log_dict_ae)
- self.log_dict(log_dict_disc)
- return log_dict_ae
-
- def log_images(self, batch, **kwargs):
- log = dict()
- x = self.get_input(batch, self.image_key)
- x = x.to(self.device)
- # encode
- h = self.encoder(x)
- h = self.quant_conv(h)
- quant, _, _ = self.quantize(h)
- # decode
- x_rec = self.decode(quant)
- log["inputs"] = x
- log["reconstructions"] = x_rec
- return log
-
-
-class EMAVQ(VQModel):
- def __init__(self,
- ddconfig,
- lossconfig,
- n_embed,
- embed_dim,
- ckpt_path=None,
- ignore_keys=[],
- image_key="image",
- colorize_nlabels=None,
- monitor=None,
- remap=None,
- sane_index_shape=False, # tell vector quantizer to return indices as bhw
- ):
- super().__init__(ddconfig,
- lossconfig,
- n_embed,
- embed_dim,
- ckpt_path=None,
- ignore_keys=ignore_keys,
- image_key=image_key,
- colorize_nlabels=colorize_nlabels,
- monitor=monitor,
- )
- self.quantize = EMAVectorQuantizer(n_embed=n_embed,
- embedding_dim=embed_dim,
- beta=0.25,
- remap=remap)
- def configure_optimizers(self):
- lr = self.learning_rate
- #Remove self.quantize from parameter list since it is updated via EMA
- opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
- list(self.decoder.parameters())+
- list(self.quant_conv.parameters())+
- list(self.post_quant_conv.parameters()),
- lr=lr, betas=(0.5, 0.9))
- opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
- lr=lr, betas=(0.5, 0.9))
- return [opt_ae, opt_disc], []
\ No newline at end of file
diff --git a/opensora/models/ae/imagebase/vqvae/vqvae.py b/opensora/models/ae/imagebase/vqvae/vqvae.py
deleted file mode 100644
index c43f8c42b..000000000
--- a/opensora/models/ae/imagebase/vqvae/vqvae.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from torch import nn
-import yaml
-import torch
-from omegaconf import OmegaConf
-from .vqgan import VQModel, GumbelVQ
-
-def load_config(config_path, display=False):
- config = OmegaConf.load(config_path)
- if display:
- print(yaml.dump(OmegaConf.to_container(config)))
- return config
-
-
-def load_vqgan(config, ckpt_path=None, is_gumbel=False):
- if is_gumbel:
- model = GumbelVQ(**config.model.params)
- else:
- model = VQModel(**config.model.params)
- if ckpt_path is not None:
- sd = torch.load(ckpt_path, map_location="cpu")["state_dict"]
- missing, unexpected = model.load_state_dict(sd, strict=False)
- return model.eval()
-
-
-class SDVQVAEWrapper(nn.Module):
- def __init__(self, name):
- super(SDVQVAEWrapper, self).__init__()
- raise NotImplementedError
-
- def encode(self, x): # b c h w
- raise NotImplementedError
-
- def decode(self, x):
- raise NotImplementedError
diff --git a/opensora/models/ae/videobase/__init__.py b/opensora/models/ae/videobase/__init__.py
deleted file mode 100644
index 8c70a5651..000000000
--- a/opensora/models/ae/videobase/__init__.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from .vqvae import (
- VQVAEConfiguration,
- VQVAEModel,
- VQVAETrainer,
- VQVAEModelWrapper
-)
-from .causal_vqvae import (
- CausalVQVAEConfiguration,
- CausalVQVAETrainer,
- CausalVQVAEModel, CausalVQVAEModelWrapper
-)
-from .causal_vae import (
- CausalVAEModel, CausalVAEModelWrapper
-)
-
-
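-# Maps model name to its (temporal, height, width) compression stride.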
-videobase_ae_stride = {
- 'CausalVAEModel_4x8x8': [4, 8, 8],
- 'CausalVQVAEModel_4x4x4': [4, 4, 4],
- 'CausalVQVAEModel_4x8x8': [4, 8, 8],
- 'VQVAEModel_4x4x4': [4, 4, 4],
- 'OpenVQVAEModel_4x4x4': [4, 4, 4],
- 'VQVAEModel_4x8x8': [4, 8, 8],
- 'bair_stride4x2x2': [4, 2, 2],
- 'ucf101_stride4x4x4': [4, 4, 4],
- 'kinetics_stride4x4x4': [4, 4, 4],
- 'kinetics_stride2x4x4': [2, 4, 4],
-}
-
-videobase_ae_channel = {
- 'CausalVAEModel_4x8x8': 4,
- 'CausalVQVAEModel_4x4x4': 4,
- 'CausalVQVAEModel_4x8x8': 4,
- 'VQVAEModel_4x4x4': 4,
- 'OpenVQVAEModel_4x4x4': 4,
- 'VQVAEModel_4x8x8': 4,
- 'bair_stride4x2x2': 256,
- 'ucf101_stride4x4x4': 256,
- 'kinetics_stride4x4x4': 256,
- 'kinetics_stride2x4x4': 256,
-}
-
-videobase_ae = {
- 'CausalVAEModel_4x8x8': CausalVAEModelWrapper,
- 'CausalVQVAEModel_4x4x4': CausalVQVAEModelWrapper,
- 'CausalVQVAEModel_4x8x8': CausalVQVAEModelWrapper,
- 'VQVAEModel_4x4x4': VQVAEModelWrapper,
- 'VQVAEModel_4x8x8': VQVAEModelWrapper,
- "bair_stride4x2x2": VQVAEModelWrapper,
- "ucf101_stride4x4x4": VQVAEModelWrapper,
- "kinetics_stride4x4x4": VQVAEModelWrapper,
- "kinetics_stride2x4x4": VQVAEModelWrapper,
-}
diff --git a/opensora/models/ae/videobase/causal_vae/__init__.py b/opensora/models/ae/videobase/causal_vae/__init__.py
deleted file mode 100644
index d2c2d8958..000000000
--- a/opensora/models/ae/videobase/causal_vae/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from .modeling_causalvae import CausalVAEModel
-
-from einops import rearrange
-from torch import nn
-
-class CausalVAEModelWrapper(nn.Module):
- def __init__(self, model_path, subfolder=None, cache_dir=None, **kwargs):
- super(CausalVAEModelWrapper, self).__init__()
- # if os.path.exists(ckpt):
- # self.vae = CausalVAEModel.load_from_checkpoint(ckpt)
- self.vae = CausalVAEModel.from_pretrained(model_path, subfolder=subfolder, cache_dir=cache_dir, **kwargs)
- def encode(self, x): # b c t h w
- # x = self.vae.encode(x).sample()
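- # 0.18215 is the latent scaling factor inherited from Stable Diffusion;
- # decode() divides it back out below.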
- x = self.vae.encode(x).sample().mul_(0.18215)
- return x
- def decode(self, x):
- # x = self.vae.decode(x)
- x = self.vae.decode(x / 0.18215)
- x = rearrange(x, 'b c t h w -> b t c h w').contiguous()
- return x
-
- def dtype(self):
- return self.vae.dtype
- #
- # def device(self):
- # return self.vae.device
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/causal_vqvae/__init__.py b/opensora/models/ae/videobase/causal_vqvae/__init__.py
deleted file mode 100644
index 116d89564..000000000
--- a/opensora/models/ae/videobase/causal_vqvae/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from .configuration_causalvqvae import CausalVQVAEConfiguration
-from .modeling_causalvqvae import CausalVQVAEModel
-from .trainer_causalvqvae import CausalVQVAETrainer
-
-
-from einops import rearrange
-from torch import nn
-
-class CausalVQVAEModelWrapper(nn.Module):
- def __init__(self, ckpt):
- super(CausalVQVAEModelWrapper, self).__init__()
- self.vqvae = CausalVQVAEModel.load_from_checkpoint(ckpt)
- def encode(self, x): # b c t h w
- x = self.vqvae.pre_vq_conv(self.vqvae.encoder(x))
- return x
- def decode(self, x):
- vq_output = self.vqvae.codebook(x)
- x = self.vqvae.decoder(self.vqvae.post_vq_conv(vq_output['embeddings']))
- x = rearrange(x, 'b c t h w -> b t c h w').contiguous()
- return x
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/causal_vqvae/configuration_causalvqvae.py b/opensora/models/ae/videobase/causal_vqvae/configuration_causalvqvae.py
deleted file mode 100644
index 3f18be533..000000000
--- a/opensora/models/ae/videobase/causal_vqvae/configuration_causalvqvae.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from ..configuration_videobase import VideoBaseConfiguration
-from typing import Union, Tuple
-
-class CausalVQVAEConfiguration(VideoBaseConfiguration):
- def __init__(
- self,
- embedding_dim: int = 256,
- n_codes: int = 2048,
- n_hiddens: int = 240,
- n_res_layers: int = 4,
- resolution: int = 128,
- sequence_length: int = 16,
- time_downsample: int = 4,
- spatial_downsample: int = 8,
- no_pos_embd: bool = True,
- **kwargs,
- ):
- super().__init__(**kwargs)
-
- self.embedding_dim = embedding_dim
- self.n_codes = n_codes
- self.n_hiddens = n_hiddens
- self.n_res_layers = n_res_layers
- self.resolution = resolution
- self.sequence_length = sequence_length
- self.time_downsample = time_downsample
- self.spatial_downsample = spatial_downsample
- self.no_pos_embd = no_pos_embd
-
- self.hidden_size = n_hiddens
diff --git a/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py b/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py
deleted file mode 100644
index 65f3f98a9..000000000
--- a/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py
+++ /dev/null
@@ -1,848 +0,0 @@
-from ..modeling_videobase import VideoBaseAE
-import torch
-from torch import nn, Tensor
-import numpy as np
-import torch.distributed as dist
-import torch.nn.functional as F
-import math
-import os
-import json
-from typing import Tuple, Dict, Union
-from .configuration_causalvqvae import CausalVQVAEConfiguration
-from einops import rearrange, pack, unpack
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-def view_range(x, i, j, shape):
- shape = tuple(shape)
-
- n_dims = len(x.shape)
- if i < 0:
- i = n_dims + i
-
- if j is None:
- j = n_dims
- elif j < 0:
- j = n_dims + j
-
- assert 0 <= i < j <= n_dims
-
- x_shape = x.shape
- target_shape = x_shape[:i] + shape + x_shape[j:]
- return x.view(target_shape)
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
- n_dims = len(x.shape)
- if src_dim < 0:
- src_dim = n_dims + src_dim
- if dest_dim < 0:
- dest_dim = n_dims + dest_dim
- assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims
- dims = list(range(n_dims))
- del dims[src_dim]
- permutation = []
- ctr = 0
- for i in range(n_dims):
- if i == dest_dim:
- permutation.append(src_dim)
- else:
- permutation.append(dims[ctr])
- ctr += 1
- x = x.permute(permutation)
- if make_contiguous:
- x = x.contiguous()
- return x
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-def scaled_dot_product_attention(q, k, v, mask=None, attn_dropout=0.0, training=True):
- # Performs scaled dot-product attention over the second to last dimension dn
-
- # (b, n_head, d1, ..., dn, d)
- attn = torch.matmul(q, k.transpose(-1, -2))
- attn = attn / np.sqrt(q.shape[-1])
- if mask is not None:
- attn = attn.masked_fill(mask == 0, float("-inf"))
- attn_float = F.softmax(attn, dim=-1)
- attn = attn_float.type_as(attn) # b x n_head x d1 x ... x dn x d
- attn = F.dropout(attn, p=attn_dropout, training=training)
-
- a = torch.matmul(attn, v) # b x n_head x d1 x ... x dn x d
-
- return a
-
-def is_odd(n):
- return n % 2 != 0
-
-def maybe_del_attr_(o, attr):
- if hasattr(o, attr):
- delattr(o, attr)
-
-def cast_tuple(t, length = 1):
- return t if isinstance(t, tuple) else ((t,) * length)
-
-class SpatialDownsample2x(torch.nn.Module):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: Union[int, Tuple[int]] = (4,4),
- stride: Union[int, Tuple[int]] = (2,2)
- ):
- super().__init__()
- kernel_size = cast_tuple(kernel_size, 2)
- stride = cast_tuple(stride, 2)
-
- self.chan_in = chan_in
- self.chan_out = chan_out
- self.kernel_size = kernel_size
-
- total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
- pad_input = []
- for p in total_pad[::-1]:
- pad_input.append((p // 2 + p % 2, p // 2))
- pad_input = sum(pad_input, tuple())
- self.pad_input = pad_input
-
- self.conv = torch.nn.Conv2d(self.chan_in, self.chan_out, self.kernel_size, stride=stride)
-
- def forward(self, x):
- x = F.pad(x, self.pad_input)
- x = rearrange(x, "b c f h w -> b f c h w")
- x, ps = pack([x], "* c h w")
- x = self.conv(x)
- x = unpack(x, ps, "* c h w")[0]
- x = rearrange(x, "b f c h w -> b c f h w")
- return x
-
-class SpatialUpsample2x(torch.nn.Module):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: Union[int, Tuple[int]] = (3,3),
- stride: Union[int, Tuple[int]] = (1,1)
- ):
- super().__init__()
- kernel_size = cast_tuple(kernel_size, 2)
- self.chan_in = chan_in
- self.chan_out = chan_out
- self.kernel_size = kernel_size
- self.conv = torch.nn.Conv2d(self.chan_in, self.chan_out, self.kernel_size, stride=stride, padding=tuple([(k - 1) // 2 for k in kernel_size]))
-
- def forward(self, x):
- x = rearrange(x, "b c f h w -> b f c h w")
- x, ps = pack([x], "* c h w")
- x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
- x = self.conv(x)
- x = unpack(x, ps, "* c h w")[0]
- x = rearrange(x, "b f c h w -> b c f h w")
- return x
-
-class TimeDownsample2x(nn.Module):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: int = 4,
- ):
- super().__init__()
- self.chan_in = chan_in
- self.chan_out = chan_out
- self.kernel_size = kernel_size
- self.conv = CausalConv3d(chan_in, chan_out, kernel_size, stride=2)
-
- def forward(self, x):
- return self.conv(x)
-
-class TimeUpsample2x(nn.Module):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: int = 3,
- ):
- super().__init__()
- self.chan_in = chan_in
- self.chan_out = chan_out
- self.kernel_size = kernel_size
- self.conv = CausalConv3d(chan_in, chan_out, kernel_size, stride=1)
-
- def forward(self, x):
- x = rearrange(x, "b c f h w -> b c h w f")
- x, ps = pack([x], "b * f")
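- # Causal temporal upsample: keep frame 0 fixed and linearly interpolate the rest.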
- if x.size(-1) > 1:
- x = torch.concat((x[:,:,:1], F.interpolate(x[:,:,1:], scale_factor=2.0, mode="linear")), dim=-1)
- x = unpack(x, ps, "b * f")[0]
- x = rearrange(x, "b c h w f -> b c f h w")
- x = self.conv(x)
- return x
-
-class CausalConv3d(nn.Module):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: Union[int, Tuple[int, int, int]],
- **kwargs
- ):
- super().__init__()
- kernel_size = cast_tuple(kernel_size, 3)
- self.time_kernel_size = kernel_size[0]
- stride = kwargs.pop('stride', 1)
- stride = (stride, 1, 1)
- total_pad = tuple([k - s for k, s in zip(kernel_size[1:], stride[1:])])
- pad_input = []
- for p in total_pad[::-1]:
- pad_input.append((p // 2 + p % 2, p // 2))
- pad_input = sum(pad_input, tuple())
- pad_input += (0, 0)
- self.padding = pad_input
- self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride = stride, **kwargs)
-
- def forward(self, x):
- x = F.pad(x, self.padding)
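- # Causal temporal padding: replicate the first frame (time_kernel_size - 1) times
- # so the 3D conv never reads future frames.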
- first_frame_pad = x[:, :, :1, : ,:].repeat((1,1,self.time_kernel_size - 1,1,1))
- x = torch.concatenate((first_frame_pad, x), dim=2)
- return self.conv(x)
-
-# Modified from https://github.com/wilson1yan/VideoGPT
-class AxialBlock(nn.Module):
- def __init__(self, n_hiddens, n_head):
- super().__init__()
- kwargs = dict(
- shape=(0,) * 3,
- dim_q=n_hiddens,
- dim_kv=n_hiddens,
- n_head=n_head,
- n_layer=1,
- causal=False,
- attn_type="axial",
- )
- self.attn_w = MultiHeadAttention(attn_kwargs=dict(axial_dim=-2), **kwargs)
- self.attn_h = MultiHeadAttention(attn_kwargs=dict(axial_dim=-3), **kwargs)
- kwargs['causal'] = True
- self.attn_t = MultiHeadAttention(attn_kwargs=dict(axial_dim=-4), **kwargs)
-
- def forward(self, x):
- x = shift_dim(x, 1, -1)
- x = self.attn_w(x, x, x) + self.attn_h(x, x, x) + self.attn_t(x, x, x)
- x = shift_dim(x, -1, 1)
- return x
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class AttentionResidualBlock(nn.Module):
- def __init__(self, n_hiddens, n_heads: int = 2):
- super().__init__()
- self.block = nn.Sequential(
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- CausalConv3d(n_hiddens, n_hiddens // 2, 3, bias=False),
- nn.BatchNorm3d(n_hiddens // 2),
- nn.ReLU(),
- CausalConv3d(n_hiddens // 2, n_hiddens, 1, bias=False),
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- AxialBlock(n_hiddens, n_heads),
- )
-
- def forward(self, x):
- return x + self.block(x)
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class Codebook(nn.Module):
- def __init__(self, n_codes, embedding_dim):
- super().__init__()
- self.register_buffer("embeddings", torch.randn(n_codes, embedding_dim))
- self.register_buffer("N", torch.zeros(n_codes))
- self.register_buffer("z_avg", self.embeddings.data.clone())
-
- self.n_codes = n_codes
- self.embedding_dim = embedding_dim
- self._need_init = True
-
- def _tile(self, x):
- d, ew = x.shape
- if d < self.n_codes:
- n_repeats = (self.n_codes + d - 1) // d
- std = 0.01 / np.sqrt(ew)
- x = x.repeat(n_repeats, 1)
- x = x + torch.randn_like(x) * std
- return x
-
- def _init_embeddings(self, z):
- # z: [b, c, t, h, w]
- self._need_init = False
- flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
- y = self._tile(flat_inputs)
-
- d = y.shape[0]
- _k_rand = y[torch.randperm(y.shape[0])][: self.n_codes]
- if dist.is_initialized():
- dist.broadcast(_k_rand, 0)
- self.embeddings.data.copy_(_k_rand)
- self.z_avg.data.copy_(_k_rand)
- self.N.data.copy_(torch.ones(self.n_codes))
-
- def forward(self, z):
- # z: [b, c, t, h, w]
- if self._need_init and self.training:
- self._init_embeddings(z)
- flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
- distances = (
- (flat_inputs**2).sum(dim=1, keepdim=True)
- - 2 * flat_inputs @ self.embeddings.t()
- + (self.embeddings.t() ** 2).sum(dim=0, keepdim=True)
- )
-
- encoding_indices = torch.argmin(distances, dim=1)
- encode_onehot = F.one_hot(encoding_indices, self.n_codes).type_as(flat_inputs)
- encoding_indices = encoding_indices.view(z.shape[0], *z.shape[2:])
-
- embeddings = F.embedding(encoding_indices, self.embeddings)
- embeddings = shift_dim(embeddings, -1, 1)
-
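- # Commitment loss (beta = 0.25, as in the VQ-VAE paper) pulls encoder outputs
- # toward their assigned codes; .detach() stops gradients into the codebook.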
- commitment_loss = 0.25 * F.mse_loss(z, embeddings.detach())
-
- # EMA codebook update
- if self.training:
- n_total = encode_onehot.sum(dim=0)
- encode_sum = flat_inputs.t() @ encode_onehot
- if dist.is_initialized():
- dist.all_reduce(n_total)
- dist.all_reduce(encode_sum)
-
- self.N.data.mul_(0.99).add_(n_total, alpha=0.01)
- self.z_avg.data.mul_(0.99).add_(encode_sum.t(), alpha=0.01)
-
- n = self.N.sum()
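- # Laplace-smooth the EMA cluster counts so unused codes don't divide by zero.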
- weights = (self.N + 1e-7) / (n + self.n_codes * 1e-7) * n
- encode_normalized = self.z_avg / weights.unsqueeze(1)
- self.embeddings.data.copy_(encode_normalized)
-
- y = self._tile(flat_inputs)
- _k_rand = y[torch.randperm(y.shape[0])][: self.n_codes]
- if dist.is_initialized():
- dist.broadcast(_k_rand, 0)
-
- usage = (self.N.view(self.n_codes, 1) >= 1).float()
- self.embeddings.data.mul_(usage).add_(_k_rand * (1 - usage))
-
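- # Straight-through estimator: quantized values in the forward pass,
- # identity gradient to the encoder output z in the backward pass.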
- embeddings_st = (embeddings - z).detach() + z
-
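- # Codebook-usage perplexity: exp(entropy of mean code assignments);
- # values near n_codes indicate the whole codebook is in use.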
- avg_probs = torch.mean(encode_onehot, dim=0)
- perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
-
- return dict(
- embeddings=embeddings_st,
- encodings=encoding_indices,
- commitment_loss=commitment_loss,
- perplexity=perplexity,
- )
-
- def dictionary_lookup(self, encodings):
- embeddings = F.embedding(encodings, self.embeddings)
- return embeddings
-
-# Modified from https://github.com/wilson1yan/VideoGPT
-class Encoder(nn.Module):
- def __init__(self, n_hiddens, n_res_layers, time_downsample, spatial_downsample):
- super().__init__()
- spatial_downsample = int(math.log2(spatial_downsample))
- self.spatial_conv = nn.ModuleList()
- for i in range(spatial_downsample):
- in_channels = 3 if i == 0 else n_hiddens
- conv = SpatialDownsample2x(in_channels, n_hiddens)
- self.spatial_conv.append(conv)
- self.spatial_res_stack = nn.Sequential(
- *[AttentionResidualBlock(n_hiddens) for _ in range(n_res_layers)],
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- )
- time_downsample = int(math.log2(time_downsample))
- self.time_conv = nn.ModuleList()
- for i in range(time_downsample):
- conv = TimeDownsample2x(n_hiddens, n_hiddens)
- self.time_conv.append(conv)
- self.time_res_stack = nn.Sequential(
- *[AttentionResidualBlock(n_hiddens) for _ in range(n_res_layers)],
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- )
-
- def forward(self, x):
- h = x
- for conv in self.spatial_conv:
- h = F.relu(conv(h))
- h = self.spatial_res_stack(h)
- for conv in self.time_conv:
- h = F.relu(conv(h))
- h = self.time_res_stack(h)
- return h
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class MultiHeadAttention(nn.Module):
- def __init__(
- self, shape, dim_q, dim_kv, n_head, n_layer, causal, attn_type, attn_kwargs
- ):
- super().__init__()
- self.causal = causal
- self.shape = shape
-
- self.d_k = dim_q // n_head
- self.d_v = dim_kv // n_head
- self.n_head = n_head
-
- self.w_qs = nn.Linear(dim_q, n_head * self.d_k, bias=False) # q
- self.w_qs.weight.data.normal_(std=1.0 / np.sqrt(dim_q))
-
- self.w_ks = nn.Linear(dim_kv, n_head * self.d_k, bias=False) # k
- self.w_ks.weight.data.normal_(std=1.0 / np.sqrt(dim_kv))
-
- self.w_vs = nn.Linear(dim_kv, n_head * self.d_v, bias=False) # v
- self.w_vs.weight.data.normal_(std=1.0 / np.sqrt(dim_kv))
-
- self.fc = nn.Linear(n_head * self.d_v, dim_q, bias=True) # c
- self.fc.weight.data.normal_(std=1.0 / np.sqrt(dim_q * n_layer))
-
- if attn_type == "full":
- self.attn = FullAttention(shape, causal, **attn_kwargs)
- elif attn_type == "axial":
- self.attn = AxialAttention(len(shape), causal=causal, **attn_kwargs)
- elif attn_type == "sparse":
- self.attn = SparseAttention(shape, n_head, causal, **attn_kwargs)
-
- self.cache = None
-
- def forward(self, q, k, v, decode_step=None, decode_idx=None):
- """Compute multi-head attention
- Args
- q, k, v: a [b, d1, ..., dn, c] tensor or
- a [b, 1, ..., 1, c] tensor if decode_step is not None
-
- Returns
- The output after performing attention
- """
-
- # compute k, q, v
- d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
- q = view_range(self.w_qs(q), -1, None, (n_head, d_k))
- k = view_range(self.w_ks(k), -1, None, (n_head, d_k))
- v = view_range(self.w_vs(v), -1, None, (n_head, d_v))
-
- # b x n_head x seq_len x d
- # (b, *d_shape, n_head, d) -> (b, n_head, *d_shape, d)
- q = shift_dim(q, -2, 1)
- k = shift_dim(k, -2, 1)
- v = shift_dim(v, -2, 1)
-
- # fast decoding
- if decode_step is not None:
- if decode_step == 0:
- if self.causal:
- k_shape = (q.shape[0], n_head, *self.shape, self.d_k)
- v_shape = (q.shape[0], n_head, *self.shape, self.d_v)
- self.cache = dict(
- k=torch.zeros(k_shape, dtype=k.dtype, device=q.device),
- v=torch.zeros(v_shape, dtype=v.dtype, device=q.device),
- )
- else:
- # cache only once in the non-causal case
- self.cache = dict(k=k.clone(), v=v.clone())
- if self.causal:
- idx = (
- slice(None, None),
- slice(None, None),
- *[slice(i, i + 1) for i in decode_idx],
- )
- self.cache["k"][idx] = k
- self.cache["v"][idx] = v
- k, v = self.cache["k"], self.cache["v"]
-
- a = self.attn(q, k, v, decode_step, decode_idx)
-
- # (b, *d_shape, n_head, d) -> (b, *d_shape, n_head * d)
- a = shift_dim(a, 1, -2).flatten(start_dim=-2)
- a = self.fc(a) # (b x seq_len x embd_dim)
-
- return a
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class Decoder(nn.Module):
- def __init__(self, n_hiddens, n_res_layers, time_downsample, spatial_downsample):
- super().__init__()
- self.time_res_stack = nn.Sequential(
- *[AttentionResidualBlock(n_hiddens) for _ in range(n_res_layers)],
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- )
- time_downsample = int(math.log2(time_downsample))
- self.time_conv = nn.ModuleList()
- for i in range(time_downsample):
- convt = TimeUpsample2x(n_hiddens, n_hiddens)
- self.time_conv.append(convt)
- self.spatial_res_stack = nn.Sequential(
- *[AttentionResidualBlock(n_hiddens) for _ in range(n_res_layers)],
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- )
- spatial_downsample = int(math.log2(spatial_downsample))
- self.spatial_conv = nn.ModuleList()
- for i in range(spatial_downsample):
- out_channels = 3 if i == spatial_downsample - 1 else n_hiddens
- convt = SpatialUpsample2x(n_hiddens, out_channels)
- self.spatial_conv.append(convt)
-
- def forward(self, x):
- h = self.time_res_stack(x)
- for conv in self.time_conv:
- h = F.relu(conv(h))
- h = self.spatial_res_stack(h)
- for i, conv in enumerate(self.spatial_conv):
- h = conv(h)
- if i < len(self.spatial_conv) - 1:
- h = F.relu(h)
- return h
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class FullAttention(nn.Module):
- def __init__(self, shape, causal, attn_dropout):
- super().__init__()
- self.causal = causal
- self.attn_dropout = attn_dropout
-
- seq_len = np.prod(shape)
- if self.causal:
- self.register_buffer("mask", torch.tril(torch.ones(seq_len, seq_len)))
-
- def forward(self, q, k, v, decode_step, decode_idx):
- mask = self.mask if self.causal else None
- if decode_step is not None and mask is not None:
- mask = mask[[decode_step]]
-
- old_shape = q.shape[2:-1]
- q = q.flatten(start_dim=2, end_dim=-2)
- k = k.flatten(start_dim=2, end_dim=-2)
- v = v.flatten(start_dim=2, end_dim=-2)
-
- out = scaled_dot_product_attention(
- q, k, v, mask=mask, attn_dropout=self.attn_dropout, training=self.training
- )
-
- return view_range(out, 2, 3, old_shape)
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class AxialAttention(nn.Module):
- def __init__(self, n_dim, axial_dim, causal=False):
- super().__init__()
- if axial_dim < 0:
- axial_dim = 2 + n_dim + 1 + axial_dim
- else:
- axial_dim += 2 # account for batch, head, dim
- self.causal = causal
- self.axial_dim = axial_dim
-
- def forward(self, q, k, v, decode_step, decode_idx):
- # batch, head, frame, height, width, dim
- q = shift_dim(q, self.axial_dim, -2).flatten(end_dim=-3)
- k = shift_dim(k, self.axial_dim, -2).flatten(end_dim=-3)
- v = shift_dim(v, self.axial_dim, -2)
-
- old_shape = list(v.shape)
- v = v.flatten(end_dim=-3)
-
- if self.causal:
- mask = torch.tril(torch.ones(q.shape[-2], q.shape[-2]))
- if decode_step is not None and mask is not None:
- mask = mask[[decode_step]]
- mask = mask.to(q.device)
- else:
- mask = None
-
- out = scaled_dot_product_attention(q, k, v, mask=mask, training=self.training)
- out = out.view(*old_shape)
- out = shift_dim(out, -2, self.axial_dim)
- return out
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class StridedSparsityConfig(object):
- """
- Strided Sparse configuration specified in https://arxiv.org/abs/1904.10509 that
- generalizes to arbitrary dimensions
- """
-
- def __init__(self, shape, n_head, causal, block, num_local_blocks):
- self.n_head = n_head
- self.shape = shape
- self.causal = causal
- self.block = block
- self.num_local_blocks = num_local_blocks
-
- assert self.num_local_blocks >= 1, "Must have at least 1 local block"
- assert self.seq_len % self.block == 0, "seq len must be divisible by block size"
-
- self._block_shape = self._compute_block_shape()
- self._block_shape_cum = self._block_shape_cum_sizes()
-
- @property
- def seq_len(self):
- return np.prod(self.shape)
-
- @property
- def num_blocks(self):
- return self.seq_len // self.block
-
- def set_local_layout(self, layout):
- num_blocks = self.num_blocks
- for row in range(0, num_blocks):
- end = min(row + self.num_local_blocks, num_blocks)
- for col in range(
- max(0, row - self.num_local_blocks), (row + 1 if self.causal else end)
- ):
- layout[:, row, col] = 1
- return layout
-
- def set_global_layout(self, layout):
- num_blocks = self.num_blocks
- n_dim = len(self._block_shape)
- for row in range(num_blocks):
- assert self._to_flattened_idx(self._to_unflattened_idx(row)) == row
- cur_idx = self._to_unflattened_idx(row)
- # no strided attention over last dim
- for d in range(n_dim - 1):
- end = self._block_shape[d]
- for i in range(0, (cur_idx[d] + 1 if self.causal else end)):
- new_idx = list(cur_idx)
- new_idx[d] = i
- new_idx = tuple(new_idx)
-
- col = self._to_flattened_idx(new_idx)
- layout[:, row, col] = 1
-
- return layout
-
- def make_layout(self):
- layout = torch.zeros(
- (self.n_head, self.num_blocks, self.num_blocks), dtype=torch.int64
- )
- layout = self.set_local_layout(layout)
- layout = self.set_global_layout(layout)
- return layout
-
- def make_sparse_attn_mask(self):
- block_layout = self.make_layout()
- assert block_layout.shape[1] == block_layout.shape[2] == self.num_blocks
-
- num_dense_blocks = block_layout.sum().item()
- attn_mask = torch.ones(num_dense_blocks, self.block, self.block)
- counter = 0
- for h in range(self.n_head):
- for i in range(self.num_blocks):
- for j in range(self.num_blocks):
- elem = block_layout[h, i, j].item()
- if elem == 1:
- assert i >= j
- if i == j: # need to mask within block on diagonals
- attn_mask[counter] = torch.tril(attn_mask[counter])
- counter += 1
- assert counter == num_dense_blocks
-
- return attn_mask.unsqueeze(0)
-
- def get_non_block_layout_row(self, block_layout, row):
- block_row = row // self.block
- block_row = block_layout[:, [block_row]] # n_head x 1 x n_blocks
- block_row = block_row.repeat_interleave(self.block, dim=-1)
- block_row[:, :, row + 1 :] = 0.0
- return block_row
-
- ############# Helper functions ##########################
-
- def _compute_block_shape(self):
- n_dim = len(self.shape)
- cum_prod = 1
- for i in range(n_dim - 1, -1, -1):
- cum_prod *= self.shape[i]
- if cum_prod > self.block:
- break
- assert cum_prod % self.block == 0
- new_shape = (*self.shape[:i], cum_prod // self.block)
-
- assert np.prod(new_shape) == np.prod(self.shape) // self.block
-
- return new_shape
-
- def _block_shape_cum_sizes(self):
- bs = np.flip(np.array(self._block_shape))
- return tuple(np.flip(np.cumprod(bs)[:-1])) + (1,)
-
- def _to_flattened_idx(self, idx):
- assert len(idx) == len(
- self._block_shape
- ), f"{len(idx)} != {len(self._block_shape)}"
- flat_idx = 0
- for i in range(len(self._block_shape)):
- flat_idx += idx[i] * self._block_shape_cum[i]
- return flat_idx
-
- def _to_unflattened_idx(self, flat_idx):
- assert flat_idx < np.prod(self._block_shape)
- idx = []
- for i in range(len(self._block_shape)):
- idx.append(flat_idx // self._block_shape_cum[i])
- flat_idx %= self._block_shape_cum[i]
- return tuple(idx)
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class SparseAttention(nn.Module):
- ops = dict()
- attn_mask = dict()
- block_layout = dict()
-
- def __init__(
- self, shape, n_head, causal, num_local_blocks=4, block=32, attn_dropout=0.0
- ): # does not use attn_dropout
- super().__init__()
- self.causal = causal
- self.shape = shape
-
- self.sparsity_config = StridedSparsityConfig(
- shape=shape,
- n_head=n_head,
- causal=causal,
- block=block,
- num_local_blocks=num_local_blocks,
- )
-
- if self.shape not in SparseAttention.block_layout:
- SparseAttention.block_layout[self.shape] = (
- self.sparsity_config.make_layout()
- )
- if causal and self.shape not in SparseAttention.attn_mask:
- SparseAttention.attn_mask[self.shape] = (
- self.sparsity_config.make_sparse_attn_mask()
- )
-
- def get_ops(self):
- try:
- from deepspeed.ops.sparse_attention import MatMul, Softmax
- except ImportError:
- raise Exception(
- "Error importing deepspeed. Please install using `DS_BUILD_SPARSE_ATTN=1 pip install deepspeed`"
- )
- if self.shape not in SparseAttention.ops:
- sparsity_layout = self.sparsity_config.make_layout()
- sparse_dot_sdd_nt = MatMul(
- sparsity_layout,
- self.sparsity_config.block,
- "sdd",
- trans_a=False,
- trans_b=True,
- )
-
- sparse_dot_dsd_nn = MatMul(
- sparsity_layout,
- self.sparsity_config.block,
- "dsd",
- trans_a=False,
- trans_b=False,
- )
-
- sparse_softmax = Softmax(sparsity_layout, self.sparsity_config.block)
-
- SparseAttention.ops[self.shape] = (
- sparse_dot_sdd_nt,
- sparse_dot_dsd_nn,
- sparse_softmax,
- )
- return SparseAttention.ops[self.shape]
-
- def forward(self, q, k, v, decode_step, decode_idx):
- if self.training and self.shape not in SparseAttention.ops:
- self.get_ops()
-
- SparseAttention.block_layout[self.shape] = SparseAttention.block_layout[
- self.shape
- ].to(q)
- if self.causal:
- SparseAttention.attn_mask[self.shape] = (
- SparseAttention.attn_mask[self.shape].to(q).type_as(q)
- )
- attn_mask = SparseAttention.attn_mask[self.shape] if self.causal else None
-
- old_shape = q.shape[2:-1]
- q = q.flatten(start_dim=2, end_dim=-2)
- k = k.flatten(start_dim=2, end_dim=-2)
- v = v.flatten(start_dim=2, end_dim=-2)
-
- if decode_step is not None:
- mask = self.sparsity_config.get_non_block_layout_row(
- SparseAttention.block_layout[self.shape], decode_step
- )
- out = scaled_dot_product_attention(
- q, k, v, mask=mask, training=self.training
- )
- else:
- if q.shape != k.shape or k.shape != v.shape:
- raise Exception("SparseAttention only support self-attention")
- sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax = self.get_ops()
- scaling = float(q.shape[-1]) ** -0.5
-
- attn_output_weights = sparse_dot_sdd_nt(q, k)
- if attn_mask is not None:
- attn_output_weights = attn_output_weights.masked_fill(
- attn_mask == 0, float("-inf")
- )
- attn_output_weights = sparse_softmax(attn_output_weights, scale=scaling)
-
- out = sparse_dot_dsd_nn(attn_output_weights, v)
-
- return view_range(out, 2, 3, old_shape)
-
-class CausalVQVAEModel(VideoBaseAE):
-
- def __init__(self, config: CausalVQVAEConfiguration):
- super().__init__()
- self.config = config
- self.embedding_dim = config.embedding_dim
- self.n_codes = config.n_codes
- self.encoder = Encoder(config.n_hiddens, config.n_res_layers, config.time_downsample, config.spatial_downsample)
- self.decoder = Decoder(config.n_hiddens, config.n_res_layers, config.time_downsample, config.spatial_downsample)
- self.pre_vq_conv = CausalConv3d(config.n_hiddens, config.embedding_dim, 1)
- self.post_vq_conv = CausalConv3d(config.embedding_dim, config.n_hiddens, 1)
- self.codebook = Codebook(config.n_codes, config.embedding_dim)
-
- def forward(self, x):
- z = self.pre_vq_conv(self.encoder(x))
- vq_output = self.codebook(z)
- x_recon = self.decoder(self.post_vq_conv(vq_output["embeddings"]))
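- # Dividing the MSE by ~0.06 normalizes by the per-pixel data variance,
- # following the VideoGPT convention this model is adapted from.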
- recon_loss = F.mse_loss(x_recon, x) / 0.06
- return recon_loss, x_recon, vq_output
-
- def encode(self, x: Tensor, include_embeddings: bool = False) -> Union[Tuple[Tensor, Tensor], Tensor]:
- h = self.pre_vq_conv(self.encoder(x))
- vq_output: Dict[str, Tensor] = self.codebook(h)
- if include_embeddings:
- return vq_output["encodings"], vq_output["embeddings"]
- else:
- return vq_output["encodings"]
-
- def decode(self, encodings: Tensor) -> Tensor:
- h = F.embedding(encodings, self.codebook.embeddings)
- h = self.post_vq_conv(shift_dim(h, -1, 1))
- return self.decoder(h)
-
- @classmethod
- def load_from_checkpoint(cls, model_path):
- with open(os.path.join(model_path, "config.json"), "r") as file:
- config = json.load(file)
- state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location="cpu")
- model = cls(config=CausalVQVAEConfiguration(**config))
- model.load_state_dict(state_dict)
- return model
-
- @classmethod
- def download_and_load_model(cls, model_name, cache_dir=None):
- raise NotImplementedError()
diff --git a/opensora/models/ae/videobase/causal_vqvae/trainer_causalvqvae.py b/opensora/models/ae/videobase/causal_vqvae/trainer_causalvqvae.py
deleted file mode 100644
index f819bce36..000000000
--- a/opensora/models/ae/videobase/causal_vqvae/trainer_causalvqvae.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from ..trainer_videobase import VideoBaseTrainer
-import torch.nn.functional as F
-from typing import Optional
-import os
-import torch
-from transformers.utils import WEIGHTS_NAME
-import json
-
-class CausalVQVAETrainer(VideoBaseTrainer):
-
- def compute_loss(self, model, inputs, return_outputs=False):
- model = model.module
- x = inputs.get("video")
- x = x / 2
- z = model.pre_vq_conv(model.encoder(x))
- vq_output = model.codebook(z)
- x_recon = model.decoder(model.post_vq_conv(vq_output["embeddings"]))
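- # Same VideoGPT-style variance normalization (~0.06) as in the model's forward.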
- recon_loss = F.mse_loss(x_recon, x) / 0.06
- commitment_loss = vq_output['commitment_loss']
- loss = recon_loss + commitment_loss
- return loss
diff --git a/opensora/models/ae/videobase/modules/__init__.py b/opensora/models/ae/videobase/modules/__init__.py
deleted file mode 100644
index 61ca11dec..000000000
--- a/opensora/models/ae/videobase/modules/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from .block import Block
-from .attention import (
- AttnBlock3D,
- AttnBlock3DFix,
- AttnBlock,
- LinAttnBlock,
- LinearAttention,
- TemporalAttnBlock
-)
-from .conv import CausalConv3d, Conv2d
-from .normalize import GroupNorm, Normalize
-from .resnet_block import ResnetBlock2D, ResnetBlock3D
-from .updownsample import (
- SpatialDownsample2x,
- SpatialUpsample2x,
- TimeDownsample2x,
- TimeUpsample2x,
- Upsample,
- Downsample,
- TimeDownsampleRes2x,
- TimeUpsampleRes2x,
- TimeDownsampleResAdv2x,
- TimeUpsampleResAdv2x
-)
diff --git a/opensora/models/ae/videobase/modules/attention.py b/opensora/models/ae/videobase/modules/attention.py
deleted file mode 100644
index c3b9bb19b..000000000
--- a/opensora/models/ae/videobase/modules/attention.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import torch.nn as nn
-from .normalize import Normalize
-from .conv import CausalConv3d
-import torch
-import numpy as np
-from einops import rearrange
-from .block import Block
-from .ops import video_to_image
-
-class LinearAttention(Block):
- def __init__(self, dim, heads=4, dim_head=32):
- super().__init__()
- self.heads = heads
- hidden_dim = dim_head * heads
- self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
- self.to_out = nn.Conv2d(hidden_dim, dim, 1)
-
- def forward(self, x):
- b, c, h, w = x.shape
- qkv = self.to_qkv(x)
- q, k, v = rearrange(
- qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
- )
- k = k.softmax(dim=-1)
- context = torch.einsum("bhdn,bhen->bhde", k, v)
- out = torch.einsum("bhde,bhdn->bhen", context, q)
- out = rearrange(
- out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
- )
- return self.to_out(out)
-
-
-class LinAttnBlock(LinearAttention):
- """to match AttnBlock usage"""
-
- def __init__(self, in_channels):
- super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
-
-
-class AttnBlock3D(Block):
- """Compatible with old versions, there are issues, use with caution."""
- def __init__(self, in_channels):
- super().__init__()
- self.in_channels = in_channels
-
- self.norm = Normalize(in_channels)
- self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
- self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
- self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
- self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
-
- def forward(self, x):
- h_ = x
- h_ = self.norm(h_)
- q = self.q(h_)
- k = self.k(h_)
- v = self.v(h_)
-
- # compute attention
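- # Note the bug this class is kept for: reshaping (b, c, t, h, w) straight to
- # (b*t, c, h*w) mixes the channel and time axes; AttnBlock3DFix permutes first.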
- b, c, t, h, w = q.shape
- q = q.reshape(b * t, c, h * w)
- q = q.permute(0, 2, 1) # b,hw,c
- k = k.reshape(b * t, c, h * w) # b,c,hw
- w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
- w_ = w_ * (int(c) ** (-0.5))
- w_ = torch.nn.functional.softmax(w_, dim=2)
-
- # attend to values
- v = v.reshape(b * t, c, h * w)
- w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
- h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
- h_ = h_.reshape(b, c, t, h, w)
-
- h_ = self.proj_out(h_)
-
- return x + h_
-
-class AttnBlock3DFix(nn.Module):
- """
- Thanks to https://github.com/PKU-YuanGroup/Open-Sora-Plan/pull/172.
- """
- def __init__(self, in_channels):
- super().__init__()
- self.in_channels = in_channels
-
- self.norm = Normalize(in_channels)
- self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
- self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
- self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
- self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
-
- def forward(self, x):
- h_ = x
- h_ = self.norm(h_)
- q = self.q(h_)
- k = self.k(h_)
- v = self.v(h_)
-
- # compute attention
- # q: (b c t h w) -> (b t c h w) -> (b*t c h*w) -> (b*t h*w c)
- b, c, t, h, w = q.shape
- q = q.permute(0, 2, 1, 3, 4)
- q = q.reshape(b * t, c, h * w)
- q = q.permute(0, 2, 1)
-
- # k: (b c t h w) -> (b t c h w) -> (b*t c h*w)
- k = k.permute(0, 2, 1, 3, 4)
- k = k.reshape(b * t, c, h * w)
-
- # w: (b*t hw hw)
- w_ = torch.bmm(q, k)
- w_ = w_ * (int(c) ** (-0.5))
- w_ = torch.nn.functional.softmax(w_, dim=2)
-
- # attend to values
- # v: (b c t h w) -> (b t c h w) -> (bt c hw)
- # w_: (bt hw hw) -> (bt hw hw)
- v = v.permute(0, 2, 1, 3, 4)
- v = v.reshape(b * t, c, h * w)
- w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
- h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
-
- # h_: (b*t c hw) -> (b t c h w) -> (b c t h w)
- h_ = h_.reshape(b, t, c, h, w)
- h_ = h_.permute(0, 2, 1, 3 ,4)
-
- h_ = self.proj_out(h_)
-
- return x + h_
-
-
-class AttnBlock(Block):
- def __init__(self, in_channels):
- super().__init__()
- self.in_channels = in_channels
-
- self.norm = Normalize(in_channels)
- self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
- self.k = torch.nn.Conv2d(
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
- )
- self.v = torch.nn.Conv2d(
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
- )
- self.proj_out = torch.nn.Conv2d(
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
- )
-
- @video_to_image
- def forward(self, x):
- h_ = x
- h_ = self.norm(h_)
- q = self.q(h_)
- k = self.k(h_)
- v = self.v(h_)
-
- # compute attention
- b, c, h, w = q.shape
- q = q.reshape(b, c, h * w)
- q = q.permute(0, 2, 1) # b,hw,c
- k = k.reshape(b, c, h * w) # b,c,hw
- w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
- w_ = w_ * (int(c) ** (-0.5))
- w_ = torch.nn.functional.softmax(w_, dim=2)
-
- # attend to values
- v = v.reshape(b, c, h * w)
- w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
- h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
- h_ = h_.reshape(b, c, h, w)
-
- h_ = self.proj_out(h_)
-
- return x + h_
-
-
-class TemporalAttnBlock(Block):
- def __init__(self, in_channels):
- super().__init__()
- self.in_channels = in_channels
-
- self.norm = Normalize(in_channels)
- self.q = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
- self.k = torch.nn.Conv3d(
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
- )
- self.v = torch.nn.Conv3d(
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
- )
- self.proj_out = torch.nn.Conv3d(
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
- )
-
- def forward(self, x):
- h_ = x
- h_ = self.norm(h_)
- q = self.q(h_)
- k = self.k(h_)
- v = self.v(h_)
-
- # compute attention
- b, c, t, h, w = q.shape
- q = rearrange(q, "b c t h w -> (b h w) t c")
- k = rearrange(k, "b c t h w -> (b h w) c t")
- v = rearrange(v, "b c t h w -> (b h w) c t")
- w_ = torch.bmm(q, k)
- w_ = w_ * (int(c) ** (-0.5))
- w_ = torch.nn.functional.softmax(w_, dim=2)
-
- # attend to values
- w_ = w_.permute(0, 2, 1)
- h_ = torch.bmm(v, w_)
- h_ = rearrange(h_, "(b h w) c t -> b c t h w", h=h, w=w)
- h_ = self.proj_out(h_)
-
- return x + h_
-
-
-def make_attn(in_channels, attn_type="vanilla"):
- assert attn_type in ["vanilla", "linear", "none", "vanilla3D"], f"attn_type {attn_type} unknown"
- print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
- if attn_type == "vanilla":
- return AttnBlock(in_channels)
- elif attn_type == "vanilla3D":
- return AttnBlock3D(in_channels)
- elif attn_type == "none":
- return nn.Identity(in_channels)
- else:
- return LinAttnBlock(in_channels)
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/modules/conv.py b/opensora/models/ae/videobase/modules/conv.py
deleted file mode 100644
index 5a4c8ae27..000000000
--- a/opensora/models/ae/videobase/modules/conv.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import torch.nn as nn
-from typing import Union, Tuple
-import torch.nn.functional as F
-import torch
-from .block import Block
-from .ops import cast_tuple
-from einops import rearrange
-from .ops import video_to_image
-
-class Conv2d(nn.Conv2d):
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- kernel_size: Union[int, Tuple[int]] = 3,
- stride: Union[int, Tuple[int]] = 1,
- padding: Union[str, int, Tuple[int]] = 0,
- dilation: Union[int, Tuple[int]] = 1,
- groups: int = 1,
- bias: bool = True,
- padding_mode: str = "zeros",
- device=None,
- dtype=None,
- ) -> None:
- super().__init__(
- in_channels,
- out_channels,
- kernel_size,
- stride,
- padding,
- dilation,
- groups,
- bias,
- padding_mode,
- device,
- dtype,
- )
-
- @video_to_image
- def forward(self, x):
- return super().forward(x)
-
-
-class CausalConv3d(nn.Module):
- def __init__(
- self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], init_method="random", **kwargs
- ):
- super().__init__()
- self.kernel_size = cast_tuple(kernel_size, 3)
- self.time_kernel_size = self.kernel_size[0]
- self.chan_in = chan_in
- self.chan_out = chan_out
- stride = kwargs.pop("stride", 1)
- padding = kwargs.pop("padding", 0)
- padding = list(cast_tuple(padding, 3))
- padding[0] = 0
- stride = cast_tuple(stride, 3)
- self.conv = nn.Conv3d(chan_in, chan_out, self.kernel_size, stride=stride, padding=padding)
- self._init_weights(init_method)
-
- def _init_weights(self, init_method):
- ks = torch.tensor(self.kernel_size)
- if init_method == "avg":
- assert (
- self.kernel_size[1] == 1 and self.kernel_size[2] == 1
- ), "only support temporal up/down sample"
- assert self.chan_in == self.chan_out, "chan_in must be equal to chan_out"
- weight = torch.zeros((self.chan_out, self.chan_in, *self.kernel_size))
-
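- # "avg" init: spread an identity mapping evenly over the 3 temporal taps,
- # so the conv starts out as a causal temporal average.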
- eyes = torch.concat(
- [
- torch.eye(self.chan_in).unsqueeze(-1) * 1/3,
- torch.eye(self.chan_in).unsqueeze(-1) * 1/3,
- torch.eye(self.chan_in).unsqueeze(-1) * 1/3,
- ],
- dim=-1,
- )
- weight[:, :, :, 0, 0] = eyes
-
- self.conv.weight = nn.Parameter(
- weight,
- requires_grad=True,
- )
- elif init_method == "zero":
- self.conv.weight = nn.Parameter(
- torch.zeros((self.chan_out, self.chan_in, *self.kernel_size)),
- requires_grad=True,
- )
- if self.conv.bias is not None:
- nn.init.constant_(self.conv.bias, 0)
-
- def forward(self, x):
- # Causal padding: e.g. 17 input frames = 1 leading image frame + 16 video frames;
- # the first frame is replicated so the temporal conv never sees the future.
- first_frame_pad = x[:, :, :1, :, :].repeat(
- (1, 1, self.time_kernel_size - 1, 1, 1)
- ) # b c t h w
- x = torch.concatenate((first_frame_pad, x), dim=2) # 3 + 16
- return self.conv(x)
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/modules/normalize.py b/opensora/models/ae/videobase/modules/normalize.py
deleted file mode 100644
index 7c8c05f0f..000000000
--- a/opensora/models/ae/videobase/modules/normalize.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import torch
-import torch.nn as nn
-from .block import Block
-
-class GroupNorm(Block):
- def __init__(self, num_channels, num_groups=32, eps=1e-6, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
- self.norm = torch.nn.GroupNorm(
- num_groups=num_groups, num_channels=num_channels, eps=eps, affine=True
- )
- def forward(self, x):
- return self.norm(x)
-
-def Normalize(in_channels, num_groups=32):
- return torch.nn.GroupNorm(
- num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
- )
-
-class ActNorm(nn.Module):
- def __init__(self, num_features, logdet=False, affine=True,
- allow_reverse_init=False):
- assert affine
- super().__init__()
- self.logdet = logdet
- self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
- self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
- self.allow_reverse_init = allow_reverse_init
-
- self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))
-
- def initialize(self, input):
- with torch.no_grad():
- flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
- mean = (
- flatten.mean(1)
- .unsqueeze(1)
- .unsqueeze(2)
- .unsqueeze(3)
- .permute(1, 0, 2, 3)
- )
- std = (
- flatten.std(1)
- .unsqueeze(1)
- .unsqueeze(2)
- .unsqueeze(3)
- .permute(1, 0, 2, 3)
- )
-
- self.loc.data.copy_(-mean)
- self.scale.data.copy_(1 / (std + 1e-6))
-
- def forward(self, input, reverse=False):
- if reverse:
- return self.reverse(input)
- if len(input.shape) == 2:
- input = input[:,:,None,None]
- squeeze = True
- else:
- squeeze = False
-
- _, _, height, width = input.shape
-
- if self.training and self.initialized.item() == 0:
- self.initialize(input)
- self.initialized.fill_(1)
-
- h = self.scale * (input + self.loc)
-
- if squeeze:
- h = h.squeeze(-1).squeeze(-1)
-
- if self.logdet:
- log_abs = torch.log(torch.abs(self.scale))
- logdet = height*width*torch.sum(log_abs)
- logdet = logdet * torch.ones(input.shape[0]).to(input)
- return h, logdet
-
- return h
-
- def reverse(self, output):
- if self.training and self.initialized.item() == 0:
- if not self.allow_reverse_init:
- raise RuntimeError(
- "Initializing ActNorm in reverse direction is "
- "disabled by default. Use allow_reverse_init=True to enable."
- )
- else:
- self.initialize(output)
- self.initialized.fill_(1)
-
- if len(output.shape) == 2:
- output = output[:,:,None,None]
- squeeze = True
- else:
- squeeze = False
-
- h = output / self.scale - self.loc
-
- if squeeze:
- h = h.squeeze(-1).squeeze(-1)
- return h
diff --git a/opensora/models/ae/videobase/modules/resnet_block.py b/opensora/models/ae/videobase/modules/resnet_block.py
deleted file mode 100644
index 189766a5b..000000000
--- a/opensora/models/ae/videobase/modules/resnet_block.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import torch
-import torch.nn as nn
-from einops import rearrange, pack, unpack
-from .normalize import Normalize
-from .ops import nonlinearity, video_to_image
-from .conv import CausalConv3d
-from .block import Block
-
-class ResnetBlock2D(Block):
- def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
- dropout):
- super().__init__()
- self.in_channels = in_channels
- self.out_channels = in_channels if out_channels is None else out_channels
- self.use_conv_shortcut = conv_shortcut
-
- self.norm1 = Normalize(in_channels)
-        self.conv1 = torch.nn.Conv2d(
-            in_channels, self.out_channels, kernel_size=3, stride=1, padding=1
-        )
-        self.norm2 = Normalize(self.out_channels)
-        self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = torch.nn.Conv2d(
-            self.out_channels, self.out_channels, kernel_size=3, stride=1, padding=1
-        )
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                self.conv_shortcut = torch.nn.Conv2d(
-                    in_channels, self.out_channels, kernel_size=3, stride=1, padding=1
-                )
-            else:
-                self.nin_shortcut = torch.nn.Conv2d(
-                    in_channels, self.out_channels, kernel_size=1, stride=1, padding=0
-                )
-
- @video_to_image
- def forward(self, x):
- h = x
- h = self.norm1(h)
- h = nonlinearity(h)
- h = self.conv1(h)
- h = self.norm2(h)
- h = nonlinearity(h)
- h = self.dropout(h)
- h = self.conv2(h)
- if self.in_channels != self.out_channels:
- if self.use_conv_shortcut:
- x = self.conv_shortcut(x)
- else:
- x = self.nin_shortcut(x)
- x = x + h
- return x
-
-class ResnetBlock3D(Block):
- def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, dropout):
- super().__init__()
- self.in_channels = in_channels
- self.out_channels = in_channels if out_channels is None else out_channels
- self.use_conv_shortcut = conv_shortcut
-
- self.norm1 = Normalize(in_channels)
-        self.conv1 = CausalConv3d(in_channels, self.out_channels, 3, padding=1)
-        self.norm2 = Normalize(self.out_channels)
-        self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = CausalConv3d(self.out_channels, self.out_channels, 3, padding=1)
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                self.conv_shortcut = CausalConv3d(in_channels, self.out_channels, 3, padding=1)
-            else:
-                self.nin_shortcut = CausalConv3d(in_channels, self.out_channels, 1, padding=0)
-
- def forward(self, x):
- h = x
- h = self.norm1(h)
- h = nonlinearity(h)
- h = self.conv1(h)
- h = self.norm2(h)
- h = nonlinearity(h)
- h = self.dropout(h)
- h = self.conv2(h)
- if self.in_channels != self.out_channels:
- if self.use_conv_shortcut:
- x = self.conv_shortcut(x)
- else:
- x = self.nin_shortcut(x)
- return x + h
\ No newline at end of file
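Both blocks above follow the usual pre-activation residual pattern (norm, nonlinearity, conv): when `in_channels != out_channels` the identity skip cannot be added to the body output, so the skip path is projected, by default with a 1x1 "network-in-network" shortcut. A sketch of that channel-matching trick (GroupNorm/SiLU stand in for the repo's `Normalize`/`nonlinearity`):

```python
import torch
import torch.nn as nn

# Channel-matching skip path, as in ResnetBlock2D/3D: project the identity
# with a 1x1 conv when the body changes the channel count.
in_ch, out_ch = 64, 128
body = nn.Sequential(
    nn.GroupNorm(num_groups=32, num_channels=in_ch),
    nn.SiLU(),
    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
)
nin_shortcut = nn.Conv2d(in_ch, out_ch, kernel_size=1)

x = torch.randn(2, in_ch, 32, 32)
out = body(x) + nin_shortcut(x)  # shapes now agree
assert out.shape == (2, out_ch, 32, 32)
```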
diff --git a/opensora/models/ae/videobase/modules/updownsample.py b/opensora/models/ae/videobase/modules/updownsample.py
deleted file mode 100644
index 9e3d489ae..000000000
--- a/opensora/models/ae/videobase/modules/updownsample.py
+++ /dev/null
@@ -1,236 +0,0 @@
-from typing import Union, Tuple
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from .resnet_block import ResnetBlock3D
-from .attention import TemporalAttnBlock
-from .normalize import Normalize
-from .ops import cast_tuple, video_to_image
-from .conv import CausalConv3d
-from einops import rearrange
-from .block import Block
-
-class Upsample(Block):
- def __init__(self, in_channels, out_channels):
- super().__init__()
- self.with_conv = True
- if self.with_conv:
- self.conv = torch.nn.Conv2d(in_channels,
- out_channels,
- kernel_size=3,
- stride=1,
- padding=1)
-
- @video_to_image
- def forward(self, x):
- x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
- if self.with_conv:
- x = self.conv(x)
- return x
-
-class Downsample(Block):
- def __init__(self, in_channels, out_channels):
- super().__init__()
- self.with_conv = True
- if self.with_conv:
- # no asymmetric padding in torch conv, must do it ourselves
- self.conv = torch.nn.Conv2d(in_channels,
- out_channels,
- kernel_size=3,
- stride=2,
- padding=0)
- @video_to_image
- def forward(self, x):
- if self.with_conv:
- pad = (0,1,0,1)
- x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
- x = self.conv(x)
- else:
- x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
- return x
-
-class SpatialDownsample2x(Block):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: Union[int, Tuple[int]] = (3, 3),
- stride: Union[int, Tuple[int]] = (2, 2),
- ):
- super().__init__()
- kernel_size = cast_tuple(kernel_size, 2)
- stride = cast_tuple(stride, 2)
- self.chan_in = chan_in
- self.chan_out = chan_out
- self.kernel_size = kernel_size
- self.conv = CausalConv3d(
- self.chan_in,
- self.chan_out,
- (1,) + self.kernel_size,
- stride=(1, ) + stride,
- padding=0
- )
-
- def forward(self, x):
- pad = (0,1,0,1,0,0)
- x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
- x = self.conv(x)
- return x
-
-
-class SpatialUpsample2x(Block):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: Union[int, Tuple[int]] = (3, 3),
- stride: Union[int, Tuple[int]] = (1, 1),
- ):
-        super().__init__()
-        # cast as in SpatialDownsample2x so plain int arguments also work
-        kernel_size = cast_tuple(kernel_size, 2)
-        stride = cast_tuple(stride, 2)
-        self.chan_in = chan_in
-        self.chan_out = chan_out
-        self.kernel_size = kernel_size
- self.conv = CausalConv3d(
- self.chan_in,
- self.chan_out,
- (1,) + self.kernel_size,
- stride=(1, ) + stride,
- padding=1
- )
-
- def forward(self, x):
- t = x.shape[2]
- x = rearrange(x, "b c t h w -> b (c t) h w")
- x = F.interpolate(x, scale_factor=(2,2), mode="nearest")
- x = rearrange(x, "b (c t) h w -> b c t h w", t=t)
- x = self.conv(x)
- return x
-
-class TimeDownsample2x(Block):
- def __init__(
- self,
- chan_in,
- chan_out,
- kernel_size: int = 3
- ):
- super().__init__()
- self.kernel_size = kernel_size
- self.conv = nn.AvgPool3d((kernel_size,1,1), stride=(2,1,1))
-
- def forward(self, x):
- first_frame_pad = x[:, :, :1, :, :].repeat(
- (1, 1, self.kernel_size - 1, 1, 1)
- )
- x = torch.concatenate((first_frame_pad, x), dim=2)
- return self.conv(x)
-
-class TimeUpsample2x(Block):
- def __init__(
- self,
- chan_in,
- chan_out
- ):
- super().__init__()
- def forward(self, x):
- if x.size(2) > 1:
- x,x_= x[:,:,:1],x[:,:,1:]
- x_= F.interpolate(x_, scale_factor=(2,1,1), mode='trilinear')
- x = torch.concat([x, x_], dim=2)
- return x
-
-class TimeDownsampleRes2x(nn.Module):
- def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size: int = 3,
- mix_factor: float = 2.0,
- ):
- super().__init__()
- self.kernel_size = cast_tuple(kernel_size, 3)
- self.avg_pool = nn.AvgPool3d((kernel_size,1,1), stride=(2,1,1))
- self.conv = nn.Conv3d(
- in_channels, out_channels, self.kernel_size, stride=(2,1,1), padding=(0,1,1)
- )
- self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
-
- def forward(self, x):
- alpha = torch.sigmoid(self.mix_factor)
- first_frame_pad = x[:, :, :1, :, :].repeat(
- (1, 1, self.kernel_size[0] - 1, 1, 1)
- )
- x = torch.concatenate((first_frame_pad, x), dim=2)
- return alpha * self.avg_pool(x) + (1 - alpha) * self.conv(x)
-
-class TimeUpsampleRes2x(nn.Module):
- def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size: int = 3,
- mix_factor: float = 2.0,
- ):
- super().__init__()
- self.conv = CausalConv3d(
- in_channels, out_channels, kernel_size, padding=1
- )
- self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
-
- def forward(self, x):
- alpha = torch.sigmoid(self.mix_factor)
- if x.size(2) > 1:
- x,x_= x[:,:,:1],x[:,:,1:]
- x_= F.interpolate(x_, scale_factor=(2,1,1), mode='trilinear')
- x = torch.concat([x, x_], dim=2)
- return alpha * x + (1-alpha) * self.conv(x)
-
-class TimeDownsampleResAdv2x(nn.Module):
- def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size: int = 3,
- mix_factor: float = 1.5,
- ):
- super().__init__()
- self.kernel_size = cast_tuple(kernel_size, 3)
- self.avg_pool = nn.AvgPool3d((kernel_size,1,1), stride=(2,1,1))
- self.attn = TemporalAttnBlock(in_channels)
- self.res = ResnetBlock3D(in_channels=in_channels, out_channels=in_channels, dropout=0.0)
- self.conv = nn.Conv3d(
- in_channels, out_channels, self.kernel_size, stride=(2,1,1), padding=(0,1,1)
- )
- self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
-
- def forward(self, x):
- first_frame_pad = x[:, :, :1, :, :].repeat(
- (1, 1, self.kernel_size[0] - 1, 1, 1)
- )
- x = torch.concatenate((first_frame_pad, x), dim=2)
- alpha = torch.sigmoid(self.mix_factor)
- return alpha * self.avg_pool(x) + (1 - alpha) * self.conv(self.attn((self.res(x))))
-
-class TimeUpsampleResAdv2x(nn.Module):
- def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size: int = 3,
- mix_factor: float = 1.5,
- ):
- super().__init__()
- self.res = ResnetBlock3D(in_channels=in_channels, out_channels=in_channels, dropout=0.0)
- self.attn = TemporalAttnBlock(in_channels)
- self.norm = Normalize(in_channels=in_channels)
- self.conv = CausalConv3d(
- in_channels, out_channels, kernel_size, padding=1
- )
- self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
-
- def forward(self, x):
- if x.size(2) > 1:
- x,x_= x[:,:,:1],x[:,:,1:]
- x_= F.interpolate(x_, scale_factor=(2,1,1), mode='trilinear')
- x = torch.concat([x, x_], dim=2)
- alpha = torch.sigmoid(self.mix_factor)
- return alpha * x + (1 - alpha) * self.conv(self.attn(self.res(x)))
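All the temporal upsamplers above share one convention: the first frame is kept as-is and only frames `1..t-1` are interpolated 2x, so a clip of length `t` becomes `1 + 2*(t - 1)` frames and odd clip lengths stay odd. A standalone sketch:

```python
import torch
import torch.nn.functional as F

# Temporal upsampling as in TimeUpsample2x: keep frame 0, trilinearly
# upsample the rest, so t -> 1 + 2 * (t - 1).
x = torch.randn(1, 4, 9, 8, 8)                 # (b, c, t, h, w)
head, tail = x[:, :, :1], x[:, :, 1:]
tail = F.interpolate(tail, scale_factor=(2, 1, 1), mode="trilinear")
y = torch.cat([head, tail], dim=2)
assert y.shape[2] == 1 + 2 * (x.shape[2] - 1)  # 9 frames -> 17
```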
diff --git a/opensora/models/ae/videobase/vqvae/__init__.py b/opensora/models/ae/videobase/vqvae/__init__.py
deleted file mode 100644
index ec138ecc7..000000000
--- a/opensora/models/ae/videobase/vqvae/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from einops import rearrange
-from torch import nn
-
-from .configuration_vqvae import VQVAEConfiguration
-from .modeling_vqvae import VQVAEModel
-from .trainer_vqvae import VQVAETrainer
-
-videovqvae = [
- "bair_stride4x2x2",
- "ucf101_stride4x4x4",
- "kinetics_stride4x4x4",
- "kinetics_stride2x4x4",
-]
-videovae = []
-
-class VQVAEModelWrapper(nn.Module):
- def __init__(self, ckpt='kinetics_stride4x4x4'):
- super(VQVAEModelWrapper, self).__init__()
- if ckpt in videovqvae:
- self.vqvae = VQVAEModel.download_and_load_model(ckpt)
- else:
- self.vqvae = VQVAEModel.load_from_checkpoint(ckpt)
- def encode(self, x): # b c t h w
- x = self.vqvae.pre_vq_conv(self.vqvae.encoder(x))
- return x
- def decode(self, x):
- vq_output = self.vqvae.codebook(x)
- x = self.vqvae.decoder(self.vqvae.post_vq_conv(vq_output['embeddings']))
- x = rearrange(x, 'b c t h w -> b t c h w').contiguous()
- return x
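For orientation, a shape-only sketch of what `VQVAEModelWrapper.encode` produces (no checkpoint is loaded; the 4x4x4 stride is read off the `kinetics_stride4x4x4` name and `embedding_dim=256` is the default in the configuration file below):

```python
# Latent-shape bookkeeping for VQVAEModelWrapper.encode: encoder plus
# pre_vq_conv maps (b, 3, t, h, w) -> (b, embedding_dim, t/4, h/4, w/4)
# for the kinetics_stride4x4x4 checkpoint.
b, t, h, w = 1, 16, 128, 128
embedding_dim, stride = 256, (4, 4, 4)
latent = (b, embedding_dim, t // stride[0], h // stride[1], w // stride[2])
print(latent)  # (1, 256, 4, 32, 32)
```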
diff --git a/opensora/models/ae/videobase/vqvae/configuration_vqvae.py b/opensora/models/ae/videobase/vqvae/configuration_vqvae.py
deleted file mode 100644
index 90ac29cfa..000000000
--- a/opensora/models/ae/videobase/vqvae/configuration_vqvae.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from ..configuration_videobase import VideoBaseConfiguration
-from typing import Union, Tuple
-
-class VQVAEConfiguration(VideoBaseConfiguration):
- def __init__(
- self,
- embedding_dim: int = 256,
- n_codes: int = 2048,
- n_hiddens: int = 240,
- n_res_layers: int = 4,
- resolution: int = 128,
- sequence_length: int = 16,
- downsample: Union[Tuple[int, int, int], str] = (4, 4, 4),
- no_pos_embd: bool = True,
- **kwargs,
- ):
- super().__init__(**kwargs)
-
- self.embedding_dim = embedding_dim
- self.n_codes = n_codes
- self.n_hiddens = n_hiddens
- self.n_res_layers = n_res_layers
- self.resolution = resolution
- self.sequence_length = sequence_length
-
- if isinstance(downsample, str):
- self.downsample = tuple(map(int, downsample.split(",")))
- else:
- self.downsample = downsample
-
- self.no_pos_embd = no_pos_embd
-
- self.hidden_size = n_hiddens
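Note that `downsample` doubles as a comma-separated string for CLI convenience; a quick check of the parsing rule used above:

```python
# "4,4,4" and (4, 4, 4) are accepted interchangeably by VQVAEConfiguration.
downsample = "4,4,4"
if isinstance(downsample, str):
    downsample = tuple(map(int, downsample.split(",")))
assert downsample == (4, 4, 4)
```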
diff --git a/opensora/models/ae/videobase/vqvae/modeling_vqvae.py b/opensora/models/ae/videobase/vqvae/modeling_vqvae.py
deleted file mode 100644
index 6a51677e6..000000000
--- a/opensora/models/ae/videobase/vqvae/modeling_vqvae.py
+++ /dev/null
@@ -1,775 +0,0 @@
-from ..modeling_videobase import VideoBaseAE
-import torch
-from torch import nn, Tensor
-import numpy as np
-import torch.distributed as dist
-import torch.nn.functional as F
-import math
-import os
-import json
-from typing import Tuple, Dict, Union
-from .configuration_vqvae import VQVAEConfiguration
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-def view_range(x, i, j, shape):
- shape = tuple(shape)
-
- n_dims = len(x.shape)
- if i < 0:
- i = n_dims + i
-
- if j is None:
- j = n_dims
- elif j < 0:
- j = n_dims + j
-
- assert 0 <= i < j <= n_dims
-
- x_shape = x.shape
- target_shape = x_shape[:i] + shape + x_shape[j:]
- return x.view(target_shape)
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
- n_dims = len(x.shape)
- if src_dim < 0:
- src_dim = n_dims + src_dim
- if dest_dim < 0:
- dest_dim = n_dims + dest_dim
- assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims
- dims = list(range(n_dims))
- del dims[src_dim]
- permutation = []
- ctr = 0
- for i in range(n_dims):
- if i == dest_dim:
- permutation.append(src_dim)
- else:
- permutation.append(dims[ctr])
- ctr += 1
- x = x.permute(permutation)
- if make_contiguous:
- x = x.contiguous()
- return x
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-def scaled_dot_product_attention(q, k, v, mask=None, attn_dropout=0.0, training=True):
- # Performs scaled dot-product attention over the second to last dimension dn
-
- # (b, n_head, d1, ..., dn, d)
- attn = torch.matmul(q, k.transpose(-1, -2))
- attn = attn / np.sqrt(q.shape[-1])
- if mask is not None:
- attn = attn.masked_fill(mask == 0, float("-inf"))
- attn_float = F.softmax(attn, dim=-1)
- attn = attn_float.type_as(attn) # b x n_head x d1 x ... x dn x d
- attn = F.dropout(attn, p=attn_dropout, training=training)
-
- a = torch.matmul(attn, v) # b x n_head x d1 x ... x dn x d
-
- return a
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class AxialBlock(nn.Module):
- def __init__(self, n_hiddens, n_head):
- super().__init__()
- kwargs = dict(
- shape=(0,) * 3,
- dim_q=n_hiddens,
- dim_kv=n_hiddens,
- n_head=n_head,
- n_layer=1,
- causal=False,
- attn_type="axial",
- )
- self.attn_w = MultiHeadAttention(attn_kwargs=dict(axial_dim=-2), **kwargs)
- self.attn_h = MultiHeadAttention(attn_kwargs=dict(axial_dim=-3), **kwargs)
- self.attn_t = MultiHeadAttention(attn_kwargs=dict(axial_dim=-4), **kwargs)
-
- def forward(self, x):
- x = shift_dim(x, 1, -1)
- x = self.attn_w(x, x, x) + self.attn_h(x, x, x) + self.attn_t(x, x, x)
- x = shift_dim(x, -1, 1)
- return x
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class AttentionResidualBlock(nn.Module):
- def __init__(self, n_hiddens):
- super().__init__()
- self.block = nn.Sequential(
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- SamePadConv3d(n_hiddens, n_hiddens // 2, 3, bias=False),
- nn.BatchNorm3d(n_hiddens // 2),
- nn.ReLU(),
- SamePadConv3d(n_hiddens // 2, n_hiddens, 1, bias=False),
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- AxialBlock(n_hiddens, 2),
- )
-
- def forward(self, x):
- return x + self.block(x)
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class Codebook(nn.Module):
- def __init__(self, n_codes, embedding_dim):
- super().__init__()
- self.register_buffer("embeddings", torch.randn(n_codes, embedding_dim))
- self.register_buffer("N", torch.zeros(n_codes))
- self.register_buffer("z_avg", self.embeddings.data.clone())
-
- self.n_codes = n_codes
- self.embedding_dim = embedding_dim
- self._need_init = True
-
- def _tile(self, x):
- d, ew = x.shape
- if d < self.n_codes:
- n_repeats = (self.n_codes + d - 1) // d
- std = 0.01 / np.sqrt(ew)
- x = x.repeat(n_repeats, 1)
- x = x + torch.randn_like(x) * std
- return x
-
- def _init_embeddings(self, z):
- # z: [b, c, t, h, w]
- self._need_init = False
- flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
- y = self._tile(flat_inputs)
-
- d = y.shape[0]
- _k_rand = y[torch.randperm(y.shape[0])][: self.n_codes]
- if dist.is_initialized():
- dist.broadcast(_k_rand, 0)
- self.embeddings.data.copy_(_k_rand)
- self.z_avg.data.copy_(_k_rand)
- self.N.data.copy_(torch.ones(self.n_codes))
-
- def forward(self, z):
- # z: [b, c, t, h, w]
- if self._need_init and self.training:
- self._init_embeddings(z)
- flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
- distances = (
- (flat_inputs**2).sum(dim=1, keepdim=True)
- - 2 * flat_inputs @ self.embeddings.t()
- + (self.embeddings.t() ** 2).sum(dim=0, keepdim=True)
- )
-
- encoding_indices = torch.argmin(distances, dim=1)
- encode_onehot = F.one_hot(encoding_indices, self.n_codes).type_as(flat_inputs)
- encoding_indices = encoding_indices.view(z.shape[0], *z.shape[2:])
-
- embeddings = F.embedding(encoding_indices, self.embeddings)
- embeddings = shift_dim(embeddings, -1, 1)
-
- commitment_loss = 0.25 * F.mse_loss(z, embeddings.detach())
-
- # EMA codebook update
- if self.training:
- n_total = encode_onehot.sum(dim=0)
- encode_sum = flat_inputs.t() @ encode_onehot
- if dist.is_initialized():
- dist.all_reduce(n_total)
- dist.all_reduce(encode_sum)
-
- self.N.data.mul_(0.99).add_(n_total, alpha=0.01)
- self.z_avg.data.mul_(0.99).add_(encode_sum.t(), alpha=0.01)
-
- n = self.N.sum()
- weights = (self.N + 1e-7) / (n + self.n_codes * 1e-7) * n
- encode_normalized = self.z_avg / weights.unsqueeze(1)
- self.embeddings.data.copy_(encode_normalized)
-
- y = self._tile(flat_inputs)
- _k_rand = y[torch.randperm(y.shape[0])][: self.n_codes]
- if dist.is_initialized():
- dist.broadcast(_k_rand, 0)
-
- usage = (self.N.view(self.n_codes, 1) >= 1).float()
- self.embeddings.data.mul_(usage).add_(_k_rand * (1 - usage))
-
- embeddings_st = (embeddings - z).detach() + z
-
- avg_probs = torch.mean(encode_onehot, dim=0)
- perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
-
- return dict(
- embeddings=embeddings_st,
- encodings=encoding_indices,
- commitment_loss=commitment_loss,
- perplexity=perplexity,
- )
-
- def dictionary_lookup(self, encodings):
- embeddings = F.embedding(encodings, self.embeddings)
- return embeddings
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class Encoder(nn.Module):
- def __init__(self, n_hiddens, n_res_layers, downsample):
- super().__init__()
- n_times_downsample = np.array([int(math.log2(d)) for d in downsample])
- self.convs = nn.ModuleList()
- max_ds = n_times_downsample.max()
- for i in range(max_ds):
- in_channels = 3 if i == 0 else n_hiddens
- stride = tuple([2 if d > 0 else 1 for d in n_times_downsample])
- conv = SamePadConv3d(in_channels, n_hiddens, 4, stride=stride)
- self.convs.append(conv)
- n_times_downsample -= 1
- self.conv_last = SamePadConv3d(in_channels, n_hiddens, kernel_size=3)
-
- self.res_stack = nn.Sequential(
- *[AttentionResidualBlock(n_hiddens) for _ in range(n_res_layers)],
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- )
-
- def forward(self, x):
- h = x
- for conv in self.convs:
- h = F.relu(conv(h))
- h = self.conv_last(h)
- h = self.res_stack(h)
- return h
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class MultiHeadAttention(nn.Module):
- def __init__(
- self, shape, dim_q, dim_kv, n_head, n_layer, causal, attn_type, attn_kwargs
- ):
- super().__init__()
- self.causal = causal
- self.shape = shape
-
- self.d_k = dim_q // n_head
- self.d_v = dim_kv // n_head
- self.n_head = n_head
-
- self.w_qs = nn.Linear(dim_q, n_head * self.d_k, bias=False) # q
- self.w_qs.weight.data.normal_(std=1.0 / np.sqrt(dim_q))
-
- self.w_ks = nn.Linear(dim_kv, n_head * self.d_k, bias=False) # k
- self.w_ks.weight.data.normal_(std=1.0 / np.sqrt(dim_kv))
-
- self.w_vs = nn.Linear(dim_kv, n_head * self.d_v, bias=False) # v
- self.w_vs.weight.data.normal_(std=1.0 / np.sqrt(dim_kv))
-
- self.fc = nn.Linear(n_head * self.d_v, dim_q, bias=True) # c
- self.fc.weight.data.normal_(std=1.0 / np.sqrt(dim_q * n_layer))
-
- if attn_type == "full":
- self.attn = FullAttention(shape, causal, **attn_kwargs)
- elif attn_type == "axial":
- assert not causal, "causal axial attention is not supported"
- self.attn = AxialAttention(len(shape), **attn_kwargs)
- elif attn_type == "sparse":
- self.attn = SparseAttention(shape, n_head, causal, **attn_kwargs)
-
- self.cache = None
-
- def forward(self, q, k, v, decode_step=None, decode_idx=None):
- """Compute multi-head attention
- Args
- q, k, v: a [b, d1, ..., dn, c] tensor or
- a [b, 1, ..., 1, c] tensor if decode_step is not None
-
- Returns
- The output after performing attention
- """
-
- # compute k, q, v
- d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
- q = view_range(self.w_qs(q), -1, None, (n_head, d_k))
- k = view_range(self.w_ks(k), -1, None, (n_head, d_k))
- v = view_range(self.w_vs(v), -1, None, (n_head, d_v))
-
- # b x n_head x seq_len x d
- # (b, *d_shape, n_head, d) -> (b, n_head, *d_shape, d)
- q = shift_dim(q, -2, 1)
- k = shift_dim(k, -2, 1)
- v = shift_dim(v, -2, 1)
-
- # fast decoding
- if decode_step is not None:
- if decode_step == 0:
- if self.causal:
- k_shape = (q.shape[0], n_head, *self.shape, self.d_k)
- v_shape = (q.shape[0], n_head, *self.shape, self.d_v)
- self.cache = dict(
- k=torch.zeros(k_shape, dtype=k.dtype, device=q.device),
- v=torch.zeros(v_shape, dtype=v.dtype, device=q.device),
- )
- else:
- # cache only once in the non-causal case
- self.cache = dict(k=k.clone(), v=v.clone())
- if self.causal:
- idx = (
- slice(None, None),
- slice(None, None),
- *[slice(i, i + 1) for i in decode_idx],
- )
- self.cache["k"][idx] = k
- self.cache["v"][idx] = v
- k, v = self.cache["k"], self.cache["v"]
-
- a = self.attn(q, k, v, decode_step, decode_idx)
-
- # (b, *d_shape, n_head, d) -> (b, *d_shape, n_head * d)
- a = shift_dim(a, 1, -2).flatten(start_dim=-2)
- a = self.fc(a) # (b x seq_len x embd_dim)
-
- return a
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class Decoder(nn.Module):
- def __init__(self, n_hiddens, n_res_layers, upsample):
- super().__init__()
- self.res_stack = nn.Sequential(
- *[AttentionResidualBlock(n_hiddens) for _ in range(n_res_layers)],
- nn.BatchNorm3d(n_hiddens),
- nn.ReLU(),
- )
-
- n_times_upsample = np.array([int(math.log2(d)) for d in upsample])
- max_us = n_times_upsample.max()
- self.convts = nn.ModuleList()
- for i in range(max_us):
- out_channels = 3 if i == max_us - 1 else n_hiddens
- us = tuple([2 if d > 0 else 1 for d in n_times_upsample])
- convt = SamePadConvTranspose3d(n_hiddens, out_channels, 4, stride=us)
- self.convts.append(convt)
- n_times_upsample -= 1
-
- def forward(self, x):
- h = self.res_stack(x)
- for i, convt in enumerate(self.convts):
- h = convt(h)
- if i < len(self.convts) - 1:
- h = F.relu(h)
- return h
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class SamePadConv3d(nn.Module):
- def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True):
- super().__init__()
- if isinstance(kernel_size, int):
- kernel_size = (kernel_size,) * 3
- if isinstance(stride, int):
- stride = (stride,) * 3
-
- # assumes that the input shape is divisible by stride
- total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
- pad_input = []
- for p in total_pad[::-1]: # reverse since F.pad starts from last dim
- pad_input.append((p // 2 + p % 2, p // 2))
- pad_input = sum(pad_input, tuple())
- self.pad_input = pad_input
-
- self.conv = nn.Conv3d(
- in_channels, out_channels, kernel_size, stride=stride, padding=0, bias=bias
- )
-
- def forward(self, x):
- return self.conv(F.pad(x, self.pad_input))
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class SamePadConvTranspose3d(nn.Module):
- def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True):
- super().__init__()
- if isinstance(kernel_size, int):
- kernel_size = (kernel_size,) * 3
- if isinstance(stride, int):
- stride = (stride,) * 3
-
- total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
- pad_input = []
- for p in total_pad[::-1]: # reverse since F.pad starts from last dim
- pad_input.append((p // 2 + p % 2, p // 2))
- pad_input = sum(pad_input, tuple())
- self.pad_input = pad_input
-
- self.convt = nn.ConvTranspose3d(
- in_channels,
- out_channels,
- kernel_size,
- stride=stride,
- bias=bias,
- padding=tuple([k - 1 for k in kernel_size]),
- )
-
- def forward(self, x):
- return self.convt(F.pad(x, self.pad_input))
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class FullAttention(nn.Module):
- def __init__(self, shape, causal, attn_dropout):
- super().__init__()
- self.causal = causal
- self.attn_dropout = attn_dropout
-
- seq_len = np.prod(shape)
- if self.causal:
- self.register_buffer("mask", torch.tril(torch.ones(seq_len, seq_len)))
-
- def forward(self, q, k, v, decode_step, decode_idx):
- mask = self.mask if self.causal else None
- if decode_step is not None and mask is not None:
- mask = mask[[decode_step]]
-
- old_shape = q.shape[2:-1]
- q = q.flatten(start_dim=2, end_dim=-2)
- k = k.flatten(start_dim=2, end_dim=-2)
- v = v.flatten(start_dim=2, end_dim=-2)
-
- out = scaled_dot_product_attention(
- q, k, v, mask=mask, attn_dropout=self.attn_dropout, training=self.training
- )
-
- return view_range(out, 2, 3, old_shape)
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class AxialAttention(nn.Module):
- def __init__(self, n_dim, axial_dim):
- super().__init__()
- if axial_dim < 0:
- axial_dim = 2 + n_dim + 1 + axial_dim
- else:
- axial_dim += 2 # account for batch, head, dim
- self.axial_dim = axial_dim
-
- def forward(self, q, k, v, decode_step, decode_idx):
- q = shift_dim(q, self.axial_dim, -2).flatten(end_dim=-3)
- k = shift_dim(k, self.axial_dim, -2).flatten(end_dim=-3)
- v = shift_dim(v, self.axial_dim, -2)
- old_shape = list(v.shape)
- v = v.flatten(end_dim=-3)
-
- out = scaled_dot_product_attention(q, k, v, training=self.training)
- out = out.view(*old_shape)
- out = shift_dim(out, -2, self.axial_dim)
- return out
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class StridedSparsityConfig(object):
- """
- Strided Sparse configuration specified in https://arxiv.org/abs/1904.10509 that
- generalizes to arbitrary dimensions
- """
-
- def __init__(self, shape, n_head, causal, block, num_local_blocks):
- self.n_head = n_head
- self.shape = shape
- self.causal = causal
- self.block = block
- self.num_local_blocks = num_local_blocks
-
- assert self.num_local_blocks >= 1, "Must have at least 1 local block"
- assert self.seq_len % self.block == 0, "seq len must be divisible by block size"
-
- self._block_shape = self._compute_block_shape()
- self._block_shape_cum = self._block_shape_cum_sizes()
-
- @property
- def seq_len(self):
- return np.prod(self.shape)
-
- @property
- def num_blocks(self):
- return self.seq_len // self.block
-
- def set_local_layout(self, layout):
- num_blocks = self.num_blocks
- for row in range(0, num_blocks):
- end = min(row + self.num_local_blocks, num_blocks)
- for col in range(
- max(0, row - self.num_local_blocks), (row + 1 if self.causal else end)
- ):
- layout[:, row, col] = 1
- return layout
-
- def set_global_layout(self, layout):
- num_blocks = self.num_blocks
- n_dim = len(self._block_shape)
- for row in range(num_blocks):
- assert self._to_flattened_idx(self._to_unflattened_idx(row)) == row
- cur_idx = self._to_unflattened_idx(row)
- # no strided attention over last dim
- for d in range(n_dim - 1):
- end = self._block_shape[d]
- for i in range(0, (cur_idx[d] + 1 if self.causal else end)):
- new_idx = list(cur_idx)
- new_idx[d] = i
- new_idx = tuple(new_idx)
-
- col = self._to_flattened_idx(new_idx)
- layout[:, row, col] = 1
-
- return layout
-
- def make_layout(self):
- layout = torch.zeros(
- (self.n_head, self.num_blocks, self.num_blocks), dtype=torch.int64
- )
- layout = self.set_local_layout(layout)
- layout = self.set_global_layout(layout)
- return layout
-
- def make_sparse_attn_mask(self):
- block_layout = self.make_layout()
- assert block_layout.shape[1] == block_layout.shape[2] == self.num_blocks
-
- num_dense_blocks = block_layout.sum().item()
- attn_mask = torch.ones(num_dense_blocks, self.block, self.block)
- counter = 0
- for h in range(self.n_head):
- for i in range(self.num_blocks):
- for j in range(self.num_blocks):
- elem = block_layout[h, i, j].item()
- if elem == 1:
- assert i >= j
- if i == j: # need to mask within block on diagonals
- attn_mask[counter] = torch.tril(attn_mask[counter])
- counter += 1
- assert counter == num_dense_blocks
-
- return attn_mask.unsqueeze(0)
-
- def get_non_block_layout_row(self, block_layout, row):
- block_row = row // self.block
- block_row = block_layout[:, [block_row]] # n_head x 1 x n_blocks
- block_row = block_row.repeat_interleave(self.block, dim=-1)
- block_row[:, :, row + 1 :] = 0.0
- return block_row
-
- ############# Helper functions ##########################
-
- def _compute_block_shape(self):
- n_dim = len(self.shape)
- cum_prod = 1
- for i in range(n_dim - 1, -1, -1):
- cum_prod *= self.shape[i]
- if cum_prod > self.block:
- break
- assert cum_prod % self.block == 0
- new_shape = (*self.shape[:i], cum_prod // self.block)
-
- assert np.prod(new_shape) == np.prod(self.shape) // self.block
-
- return new_shape
-
- def _block_shape_cum_sizes(self):
- bs = np.flip(np.array(self._block_shape))
- return tuple(np.flip(np.cumprod(bs)[:-1])) + (1,)
-
- def _to_flattened_idx(self, idx):
- assert len(idx) == len(
- self._block_shape
- ), f"{len(idx)} != {len(self._block_shape)}"
- flat_idx = 0
- for i in range(len(self._block_shape)):
- flat_idx += idx[i] * self._block_shape_cum[i]
- return flat_idx
-
- def _to_unflattened_idx(self, flat_idx):
- assert flat_idx < np.prod(self._block_shape)
- idx = []
- for i in range(len(self._block_shape)):
- idx.append(flat_idx // self._block_shape_cum[i])
- flat_idx %= self._block_shape_cum[i]
- return tuple(idx)
-
-
-# Copied from https://github.com/wilson1yan/VideoGPT
-class SparseAttention(nn.Module):
- ops = dict()
- attn_mask = dict()
- block_layout = dict()
-
- def __init__(
- self, shape, n_head, causal, num_local_blocks=4, block=32, attn_dropout=0.0
- ): # does not use attn_dropout
- super().__init__()
- self.causal = causal
- self.shape = shape
-
- self.sparsity_config = StridedSparsityConfig(
- shape=shape,
- n_head=n_head,
- causal=causal,
- block=block,
- num_local_blocks=num_local_blocks,
- )
-
- if self.shape not in SparseAttention.block_layout:
- SparseAttention.block_layout[self.shape] = (
- self.sparsity_config.make_layout()
- )
- if causal and self.shape not in SparseAttention.attn_mask:
- SparseAttention.attn_mask[self.shape] = (
- self.sparsity_config.make_sparse_attn_mask()
- )
-
- def get_ops(self):
- try:
- from deepspeed.ops.sparse_attention import MatMul, Softmax
-        except ImportError:
- raise Exception(
- "Error importing deepspeed. Please install using `DS_BUILD_SPARSE_ATTN=1 pip install deepspeed`"
- )
- if self.shape not in SparseAttention.ops:
- sparsity_layout = self.sparsity_config.make_layout()
- sparse_dot_sdd_nt = MatMul(
- sparsity_layout,
- self.sparsity_config.block,
- "sdd",
- trans_a=False,
- trans_b=True,
- )
-
- sparse_dot_dsd_nn = MatMul(
- sparsity_layout,
- self.sparsity_config.block,
- "dsd",
- trans_a=False,
- trans_b=False,
- )
-
- sparse_softmax = Softmax(sparsity_layout, self.sparsity_config.block)
-
- SparseAttention.ops[self.shape] = (
- sparse_dot_sdd_nt,
- sparse_dot_dsd_nn,
- sparse_softmax,
- )
- return SparseAttention.ops[self.shape]
-
- def forward(self, q, k, v, decode_step, decode_idx):
- if self.training and self.shape not in SparseAttention.ops:
- self.get_ops()
-
- SparseAttention.block_layout[self.shape] = SparseAttention.block_layout[
- self.shape
- ].to(q)
- if self.causal:
- SparseAttention.attn_mask[self.shape] = (
- SparseAttention.attn_mask[self.shape].to(q).type_as(q)
- )
- attn_mask = SparseAttention.attn_mask[self.shape] if self.causal else None
-
- old_shape = q.shape[2:-1]
- q = q.flatten(start_dim=2, end_dim=-2)
- k = k.flatten(start_dim=2, end_dim=-2)
- v = v.flatten(start_dim=2, end_dim=-2)
-
- if decode_step is not None:
- mask = self.sparsity_config.get_non_block_layout_row(
- SparseAttention.block_layout[self.shape], decode_step
- )
- out = scaled_dot_product_attention(
- q, k, v, mask=mask, training=self.training
- )
- else:
- if q.shape != k.shape or k.shape != v.shape:
- raise Exception("SparseAttention only support self-attention")
- sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax = self.get_ops()
- scaling = float(q.shape[-1]) ** -0.5
-
- attn_output_weights = sparse_dot_sdd_nt(q, k)
- if attn_mask is not None:
- attn_output_weights = attn_output_weights.masked_fill(
- attn_mask == 0, float("-inf")
- )
- attn_output_weights = sparse_softmax(attn_output_weights, scale=scaling)
-
- out = sparse_dot_dsd_nn(attn_output_weights, v)
-
- return view_range(out, 2, 3, old_shape)
-
-
-# Modified from https://github.com/wilson1yan/VideoGPT
-class VQVAEModel(VideoBaseAE):
-
- DOWNLOADED_VQVAE = {
- "bair_stride4x2x2": "1iIAYJ2Qqrx5Q94s5eIXQYJgAydzvT_8L",
- "ucf101_stride4x4x4": "1uuB_8WzHP_bbBmfuaIV7PK_Itl3DyHY5",
- "kinetics_stride4x4x4": "1DOvOZnFAIQmux6hG7pN_HkyJZy3lXbCB",
- "kinetics_stride2x4x4": "1jvtjjtrtE4cy6pl7DK_zWFEPY3RZt2pB",
- }
-
- def __init__(self, config: VQVAEConfiguration):
- super().__init__()
- self.config = config
- self.embedding_dim = config.embedding_dim
- self.n_codes = config.n_codes
- self.encoder = Encoder(config.n_hiddens, config.n_res_layers, config.downsample)
- self.decoder = Decoder(config.n_hiddens, config.n_res_layers, config.downsample)
- self.pre_vq_conv = SamePadConv3d(config.n_hiddens, config.embedding_dim, 1)
- self.post_vq_conv = SamePadConv3d(config.embedding_dim, config.n_hiddens, 1)
- self.codebook = Codebook(config.n_codes, config.embedding_dim)
-
- def forward(self, x):
- z = self.pre_vq_conv(self.encoder(x))
- vq_output = self.codebook(z)
- x_recon = self.decoder(self.post_vq_conv(vq_output["embeddings"]))
- recon_loss = F.mse_loss(x_recon, x) / 0.06
- return recon_loss, x_recon, vq_output
-
- def encode(self, x: Tensor, include_embeddings: bool = False) -> Union[Tuple[Tensor, Tensor], Tensor]:
- h = self.pre_vq_conv(self.encoder(x))
- vq_output: Dict[str, Tensor] = self.codebook(h)
- if include_embeddings:
- return vq_output["encodings"], vq_output["embeddings"]
- else:
- return vq_output["encodings"]
-
- def decode(self, encodings: Tensor) -> Tensor:
- h = F.embedding(encodings, self.codebook.embeddings)
- h = self.post_vq_conv(shift_dim(h, -1, 1))
- return self.decoder(h)
-
- @classmethod
- def load_from_checkpoint(cls, model_path):
-        if not os.path.isdir(model_path):
-            # model checkpoint downloaded from the internet (a single file)
- model_cpkt = torch.load(model_path)
- # Compatible with old videogpt model formats.
- if "hyper_parameters" in model_cpkt:
- hyper_parameters = vars(model_cpkt.get("hyper_parameters").get("args"))
- state_dict = model_cpkt.get("state_dict")
- model = cls(config=VQVAEConfiguration(**hyper_parameters))
- model.load_state_dict(state_dict)
- return model
- else:
- raise RuntimeError("Model checkpoint has a wrong format.")
- else:
- with open(os.path.join(model_path, "config.json"), "r") as file:
- config = json.load(file)
- state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location="cpu")
- model = cls(config=VQVAEConfiguration(**config))
- model.load_state_dict(state_dict)
- return model
-
- @classmethod
- def download_and_load_model(cls, model_name, cache_dir=None):
- from .....utils.downloader import gdown_download
- path = gdown_download(
- cls.DOWNLOADED_VQVAE[model_name], model_name, cache_dir=cache_dir
- )
- return cls.load_from_checkpoint(path)
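Two details in `Codebook.forward` above are easy to miss: the nearest-code search expands ||z - e||^2 into z^2 - 2 z.e + e^2 so no pairwise difference tensor is materialized, and `(embeddings - z).detach() + z` is the straight-through estimator that lets decoder gradients flow back to the encoder across the discrete lookup. A standalone sketch of both (random tensors, not a trained codebook):

```python
import torch
import torch.nn.functional as F

# Nearest-codebook lookup via ||z - e||^2 = z^2 - 2 z.e + e^2, plus the
# straight-through estimator, mirroring Codebook.forward.
flat_inputs = torch.randn(1024, 256, requires_grad=True)  # flattened z
embeddings = torch.randn(2048, 256)                       # codebook entries
distances = (
    (flat_inputs ** 2).sum(dim=1, keepdim=True)
    - 2 * flat_inputs @ embeddings.t()
    + (embeddings ** 2).sum(dim=1).unsqueeze(0)
)
codes = torch.argmin(distances, dim=1)
quantized = F.embedding(codes, embeddings)

# Forward value is `quantized`; the gradient treats the lookup as identity.
quantized_st = (quantized - flat_inputs).detach() + flat_inputs
quantized_st.sum().backward()
assert torch.allclose(flat_inputs.grad, torch.ones_like(flat_inputs))
```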
diff --git a/opensora/models/ae/videobase/vqvae/trainer_vqvae.py b/opensora/models/ae/videobase/vqvae/trainer_vqvae.py
deleted file mode 100644
index df3f866ee..000000000
--- a/opensora/models/ae/videobase/vqvae/trainer_vqvae.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from ..trainer_videobase import VideoBaseTrainer
-import torch.nn.functional as F
-from typing import Optional
-import os
-import torch
-from transformers.utils import WEIGHTS_NAME
-import json
-
-class VQVAETrainer(VideoBaseTrainer):
-
- def compute_loss(self, model, inputs, return_outputs=False):
- model = model.module
- x = inputs.get("video")
- x = x / 2
- z = model.pre_vq_conv(model.encoder(x))
- vq_output = model.codebook(z)
- x_recon = model.decoder(model.post_vq_conv(vq_output["embeddings"]))
- recon_loss = F.mse_loss(x_recon, x) / 0.06
- commitment_loss = vq_output['commitment_loss']
- loss = recon_loss + commitment_loss
- return loss
-
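So the whole training signal is just two terms; in isolation (all tensors below are arbitrary stand-ins for the real activations):

```python
import torch
import torch.nn.functional as F

# The objective assembled by VQVAETrainer.compute_loss above: scaled
# reconstruction MSE plus the 0.25-weighted commitment loss computed in
# Codebook.forward (the .detach() plays the role of stop-gradient).
x = torch.randn(2, 3, 8, 32, 32)        # input clip
x_recon = torch.randn(2, 3, 8, 32, 32)  # decoder output (stand-in)
z = torch.randn(2, 256, 2, 8, 8)        # encoder latents (stand-in)
e = torch.randn(2, 256, 2, 8, 8)        # quantized embeddings (stand-in)

recon_loss = F.mse_loss(x_recon, x) / 0.06
commitment_loss = 0.25 * F.mse_loss(z, e.detach())
loss = recon_loss + commitment_loss
```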
diff --git a/opensora/models/captioner/caption_refiner/README.md b/opensora/models/captioner/caption_refiner/README.md
deleted file mode 100644
index cf1ae266a..000000000
--- a/opensora/models/captioner/caption_refiner/README.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# Refiner for Video Caption
-
-Transforms short caption annotations from video datasets into long, detailed caption annotations.
-
-* Adds a detailed description of the background scene.
-* Adds detailed descriptions of object attributes, including color, material, and pose.
-* Adds detailed descriptions of object-level spatial relationships.
-
-## 🛠️ Extra Requirements and Installation
-
-* openai == 0.28.0
-* jsonlines == 4.0.0
-* nltk == 3.8.1
-* Install LLaMA2-Accessory.
-
-You also need to download the SPHINX weights to the `./ckpts/` folder.
-
-## 🗝️ Refining
-
-The refining procedure is demonstrated in [demo_for_refiner.py](demo_for_refiner.py).
-
-```bash
-python demo_for_refiner.py --root_path $path_to_repo$ --api_key $openai_api_key$
-```
-
-### Refining Demos
-
-```bash
-[original caption]: A red mustang parked in a showroom with american flags hanging from the ceiling.
-```
-
-```bash
-[refined caption]: This scene depicts a red Mustang parked in a showroom with American flags hanging from the ceiling. The showroom likely serves as a space for showcasing and purchasing cars, and the Mustang is displayed prominently near the flags and ceiling. The scene also features a large window and other objects. Overall, it seems to take place in a car show or dealership.
-```
-
-- [ ] Add GPT-3.5-Turbo for caption summarization. ⌛ [WIP]
-- [ ] Add LLAVA-1.6. ⌛ [WIP]
-- [ ] More descriptions. ⌛ [WIP]
\ No newline at end of file
diff --git a/opensora/models/captioner/caption_refiner/caption_refiner.py b/opensora/models/captioner/caption_refiner/caption_refiner.py
deleted file mode 100644
index 23952f6d9..000000000
--- a/opensora/models/captioner/caption_refiner/caption_refiner.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import itertools
-import numpy as np
-from PIL import Image
-from PIL import ImageSequence
-from nltk import pos_tag, word_tokenize
-
-from LLaMA2_Accessory.SPHINX import SPHINXModel
-from gpt_combinator import caption_summary
-
-class CaptionRefiner():
- def __init__(self, sample_num, add_detect=True, add_pos=True, add_attr=True,
- openai_api_key=None, openai_api_base=None,
- ):
- self.sample_num = sample_num
- self.ADD_DETECTION_OBJ = add_detect
- self.ADD_POS = add_pos
- self.ADD_ATTR = add_attr
- self.openai_api_key = openai_api_key
-        self.openai_api_base = openai_api_base
-
- def video_load_split(self, video_path=None):
- frame_img_list, sampled_img_list = [], []
-
- if ".gif" in video_path:
- img = Image.open(video_path)
-            # convert every frame of the GIF into a PIL image
- for frame in ImageSequence.Iterator(img):
- frame_np = np.array(frame.copy().convert('RGB').getdata(),dtype=np.uint8).reshape(frame.size[1],frame.size[0],3)
- frame_img = Image.fromarray(np.uint8(frame_np))
- frame_img_list.append(frame_img)
- elif ".mp4" in video_path:
- pass
-
- # sample frames from the mp4/gif
- for i in range(0, len(frame_img_list), int(len(frame_img_list)/self.sample_num)):
- sampled_img_list.append(frame_img_list[i])
-
-        return sampled_img_list  # list of sampled PIL.Image frames
-
- def caption_refine(self, video_path, org_caption, model_path):
- sampled_imgs = self.video_load_split(video_path)
-
- model = SPHINXModel.from_pretrained(
- pretrained_path=model_path,
- with_visual=True
- )
-
- existing_objects, scene_description = [], []
- text = word_tokenize(org_caption)
- existing_objects = [word for word,tag in pos_tag(text) if tag in ["NN", "NNS", "NNP"]]
- if self.ADD_DETECTION_OBJ:
- # Detect the objects and scene in the sampled images
-
- qas = [["Where is this scene in the picture most likely to take place?", None]]
- sc_response = model.generate_response(qas, sampled_imgs[0], max_gen_len=1024, temperature=0.9, top_p=0.5, seed=0)
- scene_description.append(sc_response)
-
- # # Lacking accuracy
- # for img in sampled_imgs:
- # qas = [["Please detect the objects in the image.", None]]
- # response = model.generate_response(qas, img, max_gen_len=1024, temperature=0.9, top_p=0.5, seed=0)
- # print(response)
-
- object_attrs = []
- if self.ADD_ATTR:
- # Detailed Description for all the objects in the sampled images
- for obj in existing_objects:
- obj_attr = []
- for img in sampled_imgs:
- qas = [["Please describe the attribute of the {}, including color, position, etc".format(obj), None]]
- response = model.generate_response(qas, img, max_gen_len=1024, temperature=0.9, top_p=0.5, seed=0)
- obj_attr.append(response)
- object_attrs.append({obj : obj_attr})
-
- space_relations = []
- if self.ADD_POS:
- obj_pairs = list(itertools.combinations(existing_objects, 2))
- # Description for the relationship between each object in the sample images
- for obj_pair in obj_pairs:
- qas = [["What is the spatial relationship between the {} and the {}? Please describe in lease than twenty words".format(obj_pair[0], obj_pair[1]), None]]
- response = model.generate_response(qas, img, max_gen_len=1024, temperature=0.9, top_p=0.5, seed=0)
- space_relations.append(response)
-
- return dict(
- org_caption = org_caption,
- scene_description = scene_description,
- existing_objects = existing_objects,
- object_attrs = object_attrs,
- space_relations = space_relations,
- )
-
- def gpt_summary(self, total_captions):
- # combine all captions into a detailed long caption
- detailed_caption = ""
-
- if "org_caption" in total_captions.keys():
- detailed_caption += "In summary, "+ total_captions['org_caption']
-
- if "scene_description" in total_captions.keys():
- detailed_caption += "We first describe the whole scene. "+total_captions['scene_description'][-1]
-
- if "existing_objects" in total_captions.keys():
- tmp_sentence = "There are multiple objects in the video, including "
- for obj in total_captions['existing_objects']:
- tmp_sentence += obj+", "
- detailed_caption += tmp_sentence
-
- # if "object_attrs" in total_captions.keys():
- # caption_summary(
- # caption_list="",
- # api_key=self.openai_api_key,
- # api_base=self.openai_api_base,
- # )
-
- if "space_relations" in total_captions.keys():
- tmp_sentence = "As for the spatial relationship. "
- for sentence in total_captions['space_relations']: tmp_sentence += sentence
- detailed_caption += tmp_sentence
-
-        detailed_caption = caption_summary(detailed_caption, self.openai_api_key, self.openai_api_base)
-
- return detailed_caption
\ No newline at end of file
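The refiner seeds its object list from the original caption with NLTK part-of-speech tags before asking SPHINX anything; a minimal sketch of that step (assumes the `punkt` and tagger NLTK data packages are installed):

```python
from nltk import pos_tag, word_tokenize

# Object-list seeding as in CaptionRefiner.caption_refine: keep the tokens
# tagged as nouns (NN / NNS / NNP).
caption = "A red mustang parked in a showroom with american flags hanging from the ceiling."
tokens = word_tokenize(caption)
objects = [word for word, tag in pos_tag(tokens) if tag in ("NN", "NNS", "NNP")]
print(objects)  # e.g. ['mustang', 'showroom', 'flags', 'ceiling']
```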
diff --git a/opensora/models/captioner/caption_refiner/dataset/test_videos/captions.json b/opensora/models/captioner/caption_refiner/dataset/test_videos/captions.json
deleted file mode 100644
index 098a352f2..000000000
--- a/opensora/models/captioner/caption_refiner/dataset/test_videos/captions.json
+++ /dev/null
@@ -1 +0,0 @@
-{"video1.gif": "A red mustang parked in a showroom with american flags hanging from the ceiling.", "video2.gif": "An aerial view of a city with a river running through it."}
\ No newline at end of file
diff --git a/opensora/models/captioner/caption_refiner/demo_for_refiner.py b/opensora/models/captioner/caption_refiner/demo_for_refiner.py
deleted file mode 100644
index c7c0bfc5e..000000000
--- a/opensora/models/captioner/caption_refiner/demo_for_refiner.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import argparse
-from caption_refiner import CaptionRefiner
-from gpt_combinator import caption_summary, caption_qa
-
-def parse_args():
- parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
- parser.add_argument("--root_path", required=True, help="The path to repo.")
- parser.add_argument("--api_key", required=True, help="OpenAI API key.")
- args = parser.parse_args()
- return args
-
-if __name__ == "__main__":
- args = parse_args()
- myrefiner = CaptionRefiner(
- sample_num=6, add_detect=True, add_pos=True, add_attr=True,
- openai_api_key = args.api_key,
- openai_api_base = "https://one-api.bltcy.top/v1",
- )
-
- results = myrefiner.caption_refine(
- video_path="./dataset/test_videos/video1.gif",
- org_caption="A red mustang parked in a showroom with american flags hanging from the ceiling.",
- model_path = args.root_path + "/ckpts/SPHINX-Tiny",
- )
-
- final_caption = myrefiner.gpt_summary(results)
-
- print(final_caption)
diff --git a/opensora/models/captioner/caption_refiner/gpt_combinator.py b/opensora/models/captioner/caption_refiner/gpt_combinator.py
deleted file mode 100644
index c0a6f0dff..000000000
--- a/opensora/models/captioner/caption_refiner/gpt_combinator.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import openai
-import ast
-
-def caption_qa(caption_list, api_key, api_base):
- openai.api_key = api_key
- openai.api_base = api_base
-
- question = "What is the color of a red apple"
- answer = "red"
- pred = "green"
- try:
- # Compute the correctness score
- completion = openai.ChatCompletion.create(
- model="gpt-3.5-turbo",
- # model="gpt-4",
- # model="gpt-4-vision-compatible",
- messages=[
- {
- "role": "system",
- "content":
- "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
- "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
- "------"
- "##INSTRUCTIONS: "
- "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
- "- Consider synonyms or paraphrases as valid matches.\n"
- "- Evaluate the correctness of the prediction compared to the answer."
- },
- {
- "role": "user",
- "content":
- "Please evaluate the following video-based question-answer pair:\n\n"
- f"Question: {question}\n"
- f"Correct Answer: {answer}\n"
- f"Predicted Answer: {pred}\n\n"
- "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
- "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
- "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
- "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
- }
- ]
- )
- # Convert response to a Python dictionary.
- response_message = completion["choices"][0]["message"]["content"]
- response_dict = ast.literal_eval(response_message)
- print(response_dict)
-
- except Exception as e:
- print(f"Error processing file : {e}")
-
-
-def caption_summary(long_caption, api_key, api_base):
- """
- apply GPT3-Turbo as the combination for original caption and the prompted captions for a video
- """
- openai.api_key = api_key
- openai.api_base = api_base
-
- try:
- # Compute the correctness score
- completion = openai.ChatCompletion.create(
- model="gpt-3.5-turbo",
- messages=[
- {
- "role": "system",
- "content":
- "You are an intelligent chatbot designed for summarizing from a long sentence. "
- },
- {
- "role": "user",
- "content":
- "Please summarize the following sentences. Make it shorter than 70 words."
- f"the long sentence: {long_caption}\n"
- "Provide your summarization with less than 70 words. "
- "DO NOT PROVIDE ANY OTHER TEXT OR EXPLANATION. Only provide the summary sentence. "
- }
- ]
- )
- # "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
- # "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
- # "For example, your response should look like this: {'summary': 'your summary sentence'}."
-
-        # The prompt asks for a plain summary sentence, so return the raw
-        # text rather than parsing it as a Python literal.
-        response_message = completion["choices"][0]["message"]["content"]
-        return response_message
-
-    except Exception as e:
-        print(f"Error during caption summarization: {e}")
-        return long_caption  # fall back to the unsummarized caption
-
-if __name__ == "__main__":
-    # minimal smoke test; fill in real credentials before running
-    caption_summary(
-        "A long caption to summarize.",
-        api_key="YOUR_OPENAI_API_KEY",
-        api_base="https://api.openai.com/v1",
-    )
\ No newline at end of file
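`caption_qa` above leans on the model returning a bare Python dict literal, which `ast.literal_eval` can parse without executing arbitrary code; a sketch of that contract:

```python
import ast

# Response parsing as in caption_qa: the prompt forces a dict literal,
# which literal_eval reads safely (no code execution, unlike eval).
response_message = "{'pred': 'yes', 'score': 4}"
response_dict = ast.literal_eval(response_message)
assert response_dict["pred"] == "yes" and response_dict["score"] == 4
```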
diff --git a/opensora/models/causalvideovae/__init__.py b/opensora/models/causalvideovae/__init__.py
new file mode 100644
index 000000000..2263e4826
--- /dev/null
+++ b/opensora/models/causalvideovae/__init__.py
@@ -0,0 +1,77 @@
+from torchvision.transforms import Lambda
+from .model.vae import CausalVAEModel, WFVAEModel
+from einops import rearrange
+import torch.nn as nn
+
+class CausalVAEModelWrapper(nn.Module):
+ def __init__(self, model_path, subfolder=None, cache_dir=None, use_ema=False, **kwargs):
+ super(CausalVAEModelWrapper, self).__init__()
+ self.vae = CausalVAEModel.from_pretrained(model_path, subfolder=subfolder, cache_dir=cache_dir, **kwargs)
+
+ def encode(self, x):
+ x = self.vae.encode(x).sample().mul_(0.18215)
+ return x
+ def decode(self, x):
+ x = self.vae.decode(x / 0.18215)
+ x = rearrange(x, 'b c t h w -> b t c h w').contiguous()
+ return x
+
+ def dtype(self):
+ return self.vae.dtype
+
+class WFVAEModelWrapper(nn.Module):
+ def __init__(self, model_path, subfolder=None, cache_dir=None, use_ema=False, **kwargs):
+ super(WFVAEModelWrapper, self).__init__()
+ self.vae = WFVAEModel.from_pretrained(model_path, subfolder=subfolder, cache_dir=cache_dir, **kwargs)
+
+ def encode(self, x):
+ x = self.vae.encode(x).sample().mul_(0.18215)
+ return x
+
+ def decode(self, x):
+ x = self.vae.decode(x / 0.18215)
+ x = rearrange(x, 'b c t h w -> b t c h w').contiguous()
+ return x
+
+ def dtype(self):
+ return self.vae.dtype
+
+ae_wrapper = {
+ 'CausalVAEModel_D4_2x8x8': CausalVAEModelWrapper,
+ 'CausalVAEModel_D8_2x8x8': CausalVAEModelWrapper,
+ 'CausalVAEModel_D4_4x8x8': CausalVAEModelWrapper,
+ 'CausalVAEModel_D8_4x8x8': CausalVAEModelWrapper,
+ 'WFVAEModel_D8_4x8x8': WFVAEModelWrapper,
+}
+
+ae_stride_config = {
+ 'CausalVAEModel_D4_2x8x8': [2, 8, 8],
+ 'CausalVAEModel_D8_2x8x8': [2, 8, 8],
+ 'CausalVAEModel_D4_4x8x8': [4, 8, 8],
+ 'CausalVAEModel_D8_4x8x8': [4, 8, 8],
+ 'WFVAEModel_D8_4x8x8': [4, 8, 8],
+}
+
+ae_channel_config = {
+ 'CausalVAEModel_D4_2x8x8': 4,
+ 'CausalVAEModel_D8_2x8x8': 8,
+ 'CausalVAEModel_D4_4x8x8': 4,
+ 'CausalVAEModel_D8_4x8x8': 8,
+ 'WFVAEModel_D8_4x8x8': 8,
+}
+
+ae_denorm = {
+ 'CausalVAEModel_D4_2x8x8': lambda x: (x + 1.) / 2.,
+ 'CausalVAEModel_D8_2x8x8': lambda x: (x + 1.) / 2.,
+ 'CausalVAEModel_D4_4x8x8': lambda x: (x + 1.) / 2.,
+ 'CausalVAEModel_D8_4x8x8': lambda x: (x + 1.) / 2.,
+ 'WFVAEModel_D8_4x8x8': lambda x: (x + 1.) / 2.,
+}
+
+ae_norm = {
+ 'CausalVAEModel_D4_2x8x8': Lambda(lambda x: 2. * x - 1.),
+ 'CausalVAEModel_D8_2x8x8': Lambda(lambda x: 2. * x - 1.),
+ 'CausalVAEModel_D4_4x8x8': Lambda(lambda x: 2. * x - 1.),
+ 'CausalVAEModel_D8_4x8x8': Lambda(lambda x: 2. * x - 1.),
+ 'WFVAEModel_D8_4x8x8': Lambda(lambda x: 2. * x - 1.),
+}
\ No newline at end of file
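The tables above are what the training and sampling scripts consult to size latents. For the causal VAEs the first frame is not temporally compressed, so (under that assumption) a `T`-frame clip maps to `(T - 1) // st + 1` latent frames while the spatial axes divide exactly:

```python
# Latent-shape bookkeeping from ae_stride_config / ae_channel_config above.
# The (T - 1) // st + 1 rule is the causal-VAE convention: frame 0 passes
# through without temporal compression.
ae_stride_config = {'WFVAEModel_D8_4x8x8': [4, 8, 8]}
ae_channel_config = {'WFVAEModel_D8_4x8x8': 8}

T, H, W = 93, 480, 640
st, sh, sw = ae_stride_config['WFVAEModel_D8_4x8x8']
c = ae_channel_config['WFVAEModel_D8_4x8x8']
latent_shape = (c, (T - 1) // st + 1, H // sh, W // sw)
print(latent_shape)  # (8, 24, 60, 80)
```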
diff --git a/opensora/models/causalvideovae/dataset/__init__.py b/opensora/models/causalvideovae/dataset/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/opensora/models/causalvideovae/dataset/transform.py b/opensora/models/causalvideovae/dataset/transform.py
new file mode 100644
index 000000000..bb89c2c85
--- /dev/null
+++ b/opensora/models/causalvideovae/dataset/transform.py
@@ -0,0 +1,573 @@
+import torch
+import random
+import numbers
+import numpy as np
+from PIL import Image
+from torchvision.transforms import RandomCrop, RandomResizedCrop
+
+
+def _is_tensor_video_clip(clip):
+ if not torch.is_tensor(clip):
+ raise TypeError("clip should be Tensor. Got %s" % type(clip))
+
+ if not clip.ndimension() == 4:
+ raise ValueError("clip should be 4D. Got %dD" % clip.dim())
+
+ return True
+
+
+def center_crop_arr(pil_image, image_size):
+ """
+ Center cropping implementation from ADM.
+ https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
+ """
+ while min(*pil_image.size) >= 2 * image_size:
+ pil_image = pil_image.resize(
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
+ )
+
+ scale = image_size / min(*pil_image.size)
+ pil_image = pil_image.resize(
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
+ )
+
+ arr = np.array(pil_image)
+ crop_y = (arr.shape[0] - image_size) // 2
+ crop_x = (arr.shape[1] - image_size) // 2
+ return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size])
+
+
+def crop(clip, i, j, h, w):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ """
+ if len(clip.size()) != 4:
+ raise ValueError("clip should be a 4D tensor")
+ return clip[..., i: i + h, j: j + w]
+
+
+def resize(clip, target_size, interpolation_mode):
+ if len(target_size) != 2:
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
+ return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=True, antialias=True)
+
+
+def resize_scale(clip, target_size, interpolation_mode):
+ if len(target_size) != 2:
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
+ H, W = clip.size(-2), clip.size(-1)
+ scale_ = target_size[0] / min(H, W)
+ return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=True, antialias=True)
+
+
+def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
+ """
+ Do spatial cropping and resizing to the video clip
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ i (int): i in (i,j) i.e coordinates of the upper left corner.
+ j (int): j in (i,j) i.e coordinates of the upper left corner.
+ h (int): Height of the cropped region.
+ w (int): Width of the cropped region.
+ size (tuple(int, int)): height and width of resized clip
+ Returns:
+ clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
+ """
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+ clip = crop(clip, i, j, h, w)
+ clip = resize(clip, size, interpolation_mode)
+ return clip
+
+
+def center_crop(clip, crop_size):
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+ h, w = clip.size(-2), clip.size(-1)
+ th, tw = crop_size
+ if h < th or w < tw:
+ raise ValueError("height and width must be no smaller than crop_size")
+
+ i = int(round((h - th) / 2.0))
+ j = int(round((w - tw) / 2.0))
+ return crop(clip, i, j, th, tw)
+
+
+def center_crop_using_short_edge(clip):
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+ h, w = clip.size(-2), clip.size(-1)
+ if h < w:
+ th, tw = h, h
+ i = 0
+ j = int(round((w - tw) / 2.0))
+ else:
+ th, tw = w, w
+ i = int(round((h - th) / 2.0))
+ j = 0
+ return crop(clip, i, j, th, tw)
+
+
+def random_shift_crop(clip):
+ '''
+ Slide along the long edge, with the short edge as crop size
+ '''
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+ h, w = clip.size(-2), clip.size(-1)
+
+ if h <= w:
+ long_edge = w
+ short_edge = h
+ else:
+ long_edge = h
+ short_edge = w
+
+ th, tw = short_edge, short_edge
+
+ i = torch.randint(0, h - th + 1, size=(1,)).item()
+ j = torch.randint(0, w - tw + 1, size=(1,)).item()
+ return crop(clip, i, j, th, tw)
+
+
+def to_tensor(clip):
+ """
+    Convert tensor data type from uint8 to float and divide the values by
+    255.0; the (T, C, H, W) layout is left unchanged (no permutation)
+ Args:
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
+ Return:
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+ """
+ _is_tensor_video_clip(clip)
+ if not clip.dtype == torch.uint8:
+ raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
+ # return clip.float().permute(3, 0, 1, 2) / 255.0
+ return clip.float() / 255.0
+
+
+def normalize(clip, mean, std, inplace=False):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
+ mean (tuple): pixel RGB mean. Size is (3)
+ std (tuple): pixel standard deviation. Size is (3)
+ Returns:
+ normalized clip (torch.tensor): Size is (T, C, H, W)
+ """
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+ if not inplace:
+ clip = clip.clone()
+ mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
+ # print(mean)
+ std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
+ clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
+ return clip
+
+
+def hflip(clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
+ Returns:
+ flipped clip (torch.tensor): Size is (T, C, H, W)
+ """
+ if not _is_tensor_video_clip(clip):
+ raise ValueError("clip should be a 4D torch.tensor")
+ return clip.flip(-1)
+
+
+class RandomCropVideo:
+ def __init__(self, size):
+ if isinstance(size, numbers.Number):
+ self.size = (int(size), int(size))
+ else:
+ self.size = size
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ Returns:
+ torch.tensor: randomly cropped video clip.
+ size is (T, C, OH, OW)
+ """
+ i, j, h, w = self.get_params(clip)
+ return crop(clip, i, j, h, w)
+
+ def get_params(self, clip):
+ h, w = clip.shape[-2:]
+ th, tw = self.size
+
+ if h < th or w < tw:
+ raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
+
+ if w == tw and h == th:
+ return 0, 0, h, w
+
+ i = torch.randint(0, h - th + 1, size=(1,)).item()
+ j = torch.randint(0, w - tw + 1, size=(1,)).item()
+
+ return i, j, th, tw
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(size={self.size})"
+
+
+class SpatialStrideCropVideo:
+ def __init__(self, stride):
+ self.stride = stride
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ Returns:
+ torch.tensor: cropped video clip by stride.
+ size is (T, C, OH, OW)
+ """
+ i, j, h, w = self.get_params(clip)
+ return crop(clip, i, j, h, w)
+
+ def get_params(self, clip):
+ h, w = clip.shape[-2:]
+
+ th, tw = h // self.stride * self.stride, w // self.stride * self.stride
+
+ return 0, 0, th, tw # from top-left
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(size={self.size})"
+
+class LongSideResizeVideo:
+ '''
+ Resize the clip so that its long side matches the specified size,
+ preserving the aspect ratio.
+ '''
+
+ def __init__(
+ self,
+ size,
+ skip_low_resolution=False,
+ interpolation_mode="bilinear",
+ ):
+ self.size = size
+ self.skip_low_resolution = skip_low_resolution
+ self.interpolation_mode = interpolation_mode
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ Returns:
+ torch.tensor: scale-resized video clip.
+ size is (T, C, self.size, *) or (T, C, *, self.size)
+ """
+ _, _, h, w = clip.shape
+ if self.skip_low_resolution and max(h, w) <= self.size:
+ return clip
+ if h > w:
+ w = int(w * self.size / h)
+ h = self.size
+ else:
+ h = int(h * self.size / w)
+ w = self.size
+ resize_clip = resize(clip, target_size=(h, w),
+ interpolation_mode=self.interpolation_mode)
+ return resize_clip
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+
+class CenterCropResizeVideo:
+ '''
+ First center-crop the video to a square using the short side as the
+ crop size, then resize to the specified size.
+ '''
+
+ def __init__(
+ self,
+ size,
+ interpolation_mode="bilinear",
+ ):
+ if isinstance(size, tuple):
+ if len(size) != 2:
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
+ self.size = size
+ else:
+ self.size = (size, size)
+
+ self.interpolation_mode = interpolation_mode
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ Returns:
+ torch.tensor: scale resized / center cropped video clip.
+ size is (T, C, crop_size, crop_size)
+ """
+ clip_center_crop = center_crop_using_short_edge(clip)
+ clip_center_crop_resize = resize(clip_center_crop, target_size=self.size,
+ interpolation_mode=self.interpolation_mode)
+ return clip_center_crop_resize
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+
+
+class UCFCenterCropVideo:
+ '''
+ First scale the short edge to the specified size while keeping the
+ aspect ratio, then center-crop.
+ '''
+
+ def __init__(
+ self,
+ size,
+ interpolation_mode="bilinear",
+ ):
+ if isinstance(size, tuple):
+ if len(size) != 2:
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
+ self.size = size
+ else:
+ self.size = (size, size)
+
+ self.interpolation_mode = interpolation_mode
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ Returns:
+ torch.tensor: scale resized / center cropped video clip.
+ size is (T, C, crop_size, crop_size)
+ """
+ clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
+ clip_center_crop = center_crop(clip_resize, self.size)
+ return clip_center_crop
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+
+
+class KineticsRandomCropResizeVideo:
+ '''
+ Slide along the long edge, with the short edge as the crop size, then resize to the desired size.
+ '''
+
+ def __init__(
+ self,
+ size,
+ interpolation_mode="bilinear",
+ ):
+ if isinstance(size, tuple):
+ if len(size) != 2:
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
+ self.size = size
+ else:
+ self.size = (size, size)
+
+ self.interpolation_mode = interpolation_mode
+
+ def __call__(self, clip):
+ clip_random_crop = random_shift_crop(clip)
+ clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
+ return clip_resize
+
+
+class CenterCropVideo:
+ def __init__(
+ self,
+ size,
+ interpolation_mode="bilinear",
+ ):
+ if isinstance(size, tuple):
+ if len(size) != 2:
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
+ self.size = size
+ else:
+ self.size = (size, size)
+
+ self.interpolation_mode = interpolation_mode
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+ Returns:
+ torch.tensor: center cropped video clip.
+ size is (T, C, crop_size, crop_size)
+ """
+ clip_center_crop = center_crop(clip, self.size)
+ return clip_center_crop
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+
+
+class NormalizeVideo:
+ """
+ Normalize the video clip by mean subtraction and division by standard deviation
+ Args:
+ mean (3-tuple): pixel RGB mean
+ std (3-tuple): pixel RGB standard deviation
+ inplace (boolean): whether do in-place normalization
+ """
+
+ def __init__(self, mean, std, inplace=False):
+ self.mean = mean
+ self.std = std
+ self.inplace = inplace
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)
+ """
+ return normalize(clip, self.mean, self.std, self.inplace)
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
+
+
+class ToTensorVideo:
+ """
+ Convert tensor data type from uint8 to float and divide values by 255.0.
+ The (T, C, H, W) layout is left unchanged.
+ """
+
+ def __init__(self):
+ pass
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
+ Return:
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+ """
+ return to_tensor(clip)
+
+ def __repr__(self) -> str:
+ return self.__class__.__name__
+
+
+class RandomHorizontalFlipVideo:
+ """
+ Flip the video clip along the horizontal direction with a given probability
+ Args:
+ p (float): probability of the clip being flipped. Default value is 0.5
+ """
+
+ def __init__(self, p=0.5):
+ self.p = p
+
+ def __call__(self, clip):
+ """
+ Args:
+ clip (torch.tensor): Size is (T, C, H, W)
+ Return:
+ clip (torch.tensor): Size is (T, C, H, W)
+ """
+ if random.random() < self.p:
+ clip = hflip(clip)
+ return clip
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}(p={self.p})"
+
+
+# ------------------------------------------------------------
+# --------------------- Sampling ---------------------------
+# ------------------------------------------------------------
+class TemporalRandomCrop(object):
+ """Temporally crop the given frame indices at a random location.
+
+ Args:
+ size (int): Desired number of frames to be seen by the model.
+ """
+
+ def __init__(self, size):
+ self.size = size
+
+ def __call__(self, total_frames):
+ rand_end = max(0, total_frames - self.size - 1)
+ begin_index = random.randint(0, rand_end)
+ end_index = min(begin_index + self.size, total_frames)
+ return begin_index, end_index
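+ # Illustrative example: TemporalRandomCrop(32) on a 100-frame video returns
+ # a random (begin, end) window with end - begin == 32.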
+
+class DynamicSampleDuration(object):
+ """Temporally crop the given frame indices at a random location.
+
+ Args:
+ size (int): Desired length of frames will be seen in the model.
+ """
+
+ def __init__(self, t_stride, extra_1):
+ self.t_stride = t_stride
+ self.extra_1 = extra_1
+
+ def __call__(self, t, h, w):
+ if self.extra_1:
+ t = t - 1
+ truncate_t_list = list(range(t+1))[t//2:][::self.t_stride] # need half at least
+ truncate_t = random.choice(truncate_t_list)
+ if self.extra_1:
+ truncate_t = truncate_t + 1
+ return 0, truncate_t
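+ # Illustrative example: with t_stride=4, extra_1=True and t=17, candidates
+ # are 1 + [8, 12, 16], so truncate_t is drawn from {9, 13, 17} and the clip
+ # keeps frames [0, truncate_t).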
+
+if __name__ == '__main__':
+ from torchvision import transforms
+ import torchvision.io as io
+ import numpy as np
+ from torchvision.utils import save_image
+ import os
+
+ vframes, aframes, info = io.read_video(
+ filename='./v_Archery_g01_c03.avi',
+ pts_unit='sec',
+ output_format='TCHW'
+ )
+
+ trans = transforms.Compose([
+ ToTensorVideo(),
+ RandomHorizontalFlipVideo(),
+ UCFCenterCropVideo(512),
+ # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+ ])
+
+ target_video_len = 32
+ frame_interval = 1
+ total_frames = len(vframes)
+ print(total_frames)
+
+ temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)
+
+ # Sampling video frames
+ start_frame_ind, end_frame_ind = temporal_sample(total_frames)
+ # print(start_frame_ind)
+ # print(end_frame_ind)
+ assert end_frame_ind - start_frame_ind >= target_video_len
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
+ print(frame_indice)
+
+ select_vframes = vframes[frame_indice]
+ print(select_vframes.shape)
+ print(select_vframes.dtype)
+
+ select_vframes_trans = trans(select_vframes)
+ print(select_vframes_trans.shape)
+ print(select_vframes_trans.dtype)
+
+ select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
+ print(select_vframes_trans_int.dtype)
+ print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)
+
+ io.write_video('./test.avi', select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)
+
+ os.makedirs('./test000', exist_ok=True)
+ for i in range(target_video_len):
+ save_image(select_vframes_trans[i], os.path.join('./test000', '%04d.png' % i), normalize=True,
+ value_range=(-1, 1))
diff --git a/opensora/models/causalvideovae/dataset/video_dataset.py b/opensora/models/causalvideovae/dataset/video_dataset.py
new file mode 100644
index 000000000..9745e2140
--- /dev/null
+++ b/opensora/models/causalvideovae/dataset/video_dataset.py
@@ -0,0 +1,245 @@
+import os
+import os.path as osp
+import random
+import pickle
+from glob import glob
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.data as data
+import decord
+from torchvision import transforms
+from torchvision.transforms import Lambda, Compose
+from torchvision.transforms._transforms_video import CenterCropVideo as TVCenterCropVideo
+
+from .transform import ToTensorVideo, CenterCropVideo
+
+
+class DecordInit(object):
+ def __init__(self, num_threads=1):
+ self.num_threads = num_threads
+ self.ctx = decord.cpu(0)
+
+ def __call__(self, filename):
+ reader = decord.VideoReader(
+ filename, ctx=self.ctx, num_threads=self.num_threads
+ )
+ return reader
+
+ def __repr__(self):
+ repr_str = (
+ f"{self.__class__.__name__}("
+ f"num_threads={self.num_threads})"
+ )
+ return repr_str
+
+def TemporalRandomCrop(total_frames, size):
+ rand_end = max(0, total_frames - size - 1)
+ begin_index = random.randint(0, rand_end)
+ end_index = min(begin_index + size, total_frames)
+ return begin_index, end_index
+
+def _format_video_shape(video, time_compress=4, spatial_compress=8):
+ """Prepare video for VAE"""
+ time = video.shape[1]
+ height = video.shape[2]
+ width = video.shape[3]
+ new_time = (
+ (time - (time - 1) % time_compress) if (time - 1) % time_compress != 0 else time
+ )
+ new_height = (
+ (height - (height) % spatial_compress)
+ if height % spatial_compress != 0
+ else height
+ )
+ new_width = (
+ (width - (width) % spatial_compress) if width % spatial_compress != 0 else width
+ )
+ return video[:, :new_time, :new_height, :new_width]
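+# Illustrative example: with the default 4x temporal / 8x spatial compression,
+# a (C, 34, 260, 516) video is trimmed to (C, 33, 256, 512), so that
+# (T - 1) % 4 == 0 and H, W are multiples of 8.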
+
+
+class TrainVideoDataset(data.Dataset):
+ video_exts = ["avi", "mp4", "webm"]
+
+ def __init__(
+ self,
+ video_folder,
+ sequence_length,
+ train=True,
+ resolution=64,
+ sample_rate=1,
+ dynamic_sample=True,
+ cache_file=None,
+ is_main_process=False,
+ ):
+
+ self.train = train
+ self.sequence_length = sequence_length
+ self.sample_rate = sample_rate
+ self.resolution = resolution
+ self.v_decoder = DecordInit()
+ self.video_folder = video_folder
+ self.dynamic_sample = dynamic_sample
+ self.cache_file = cache_file
+ self.transform = transforms.Compose(
+ [
+ ToTensorVideo(),
+ CenterCropVideo(self.resolution),
+ Lambda(lambda x: 2.0 * x - 1.0),
+ ]
+ )
+ print("Building datasets...")
+ self.is_main_process = is_main_process
+ self.samples = self._make_dataset()
+
+ def _make_dataset(self):
+ cache_file = osp.join(self.video_folder, self.cache_file)
+ if osp.exists(cache_file):
+ with open(cache_file, "rb") as f:
+ samples = pickle.load(f)
+ else:
+ samples = []
+ samples += sum(
+ [
+ glob(osp.join(self.video_folder, "**", f"*.{ext}"), recursive=True)
+ for ext in self.video_exts
+ ],
+ [],
+ )
+ if self.is_main_process:
+ with open(cache_file, "wb") as f:
+ pickle.dump(samples, f)
+ return samples
+
+ def __len__(self):
+ return len(self.samples)
+
+ def __getitem__(self, idx):
+ video_path = self.samples[idx]
+ try:
+ video = self.decord_read(video_path)
+ video = self.transform(video) # T C H W -> T C H W
+ video = video.transpose(0, 1) # T C H W -> C T H W
+ return dict(video=video, label="")
+ except Exception as e:
+ print(f"Error with {e}, {video_path}")
+ return self.__getitem__(random.randint(0, self.__len__() - 1))
+
+ def decord_read(self, path):
+ decord_vr = self.v_decoder(path)
+ total_frames = len(decord_vr)
+ # Sampling video frames
+ if self.dynamic_sample:
+ sample_rate = random.randint(1, self.sample_rate)
+ else:
+ sample_rate = self.sample_rate
+ size = self.sequence_length * sample_rate
+ start_frame_ind, end_frame_ind = TemporalRandomCrop(total_frames, size)
+ frame_indice = np.linspace(
+ start_frame_ind, end_frame_ind - 1, self.sequence_length, dtype=int
+ )
+
+ video_data = decord_vr.get_batch(frame_indice).asnumpy()
+ video_data = torch.from_numpy(video_data)
+ video_data = video_data.permute(0, 3, 1, 2)
+ return video_data
+
+def resize(x, resolution):
+ height, width = x.shape[-2:]
+ aspect_ratio = width / height
+ if width <= height:
+ new_width = resolution
+ new_height = int(resolution / aspect_ratio)
+ else:
+ new_height = resolution
+ new_width = int(resolution * aspect_ratio)
+ resized_x = F.interpolate(x, size=(new_height, new_width), mode='bilinear', align_corners=True, antialias=True)
+ return resized_x
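+# Illustrative example: resize(torch.rand(9, 3, 480, 640), 128) scales the
+# short side to 128 and returns a (9, 3, 128, 170) clip (aspect preserved).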
+
+class ValidVideoDataset(data.Dataset):
+ def __init__(
+ self,
+ real_video_dir,
+ num_frames,
+ sample_rate=1,
+ crop_size=None,
+ resolution=128,
+ ) -> None:
+ super().__init__()
+ self.real_video_files = self._combine_without_prefix(real_video_dir)
+ self.num_frames = num_frames
+ self.sample_rate = sample_rate
+ self.crop_size = crop_size
+ self.short_size = resolution
+ self.v_decoder = DecordInit()
+ self.transform = Compose(
+ [
+ Lambda(lambda x: (x / 255.0)), # [0, 1] for valid
+ Lambda(lambda x: resize(x, self.short_size)),
+ (
+ TVCenterCropVideo(crop_size=crop_size)
+ if crop_size is not None
+ else Lambda(lambda x: x)
+ ),
+ ]
+ )
+
+ def __len__(self):
+ return len(self.real_video_files)
+
+ def __getitem__(self, index):
+ try:
+ if index >= len(self):
+ raise IndexError
+ real_video_file = self.real_video_files[index]
+ real_video_tensor = self._load_video(real_video_file)
+ video_name = os.path.basename(real_video_file)
+ return {'video': real_video_tensor, 'file_name': video_name }
+ except Exception:
+ print(f"Video error: {self.real_video_files[index]}")
+ return self.__getitem__(0)
+
+ def _load_video(self, video_path, sample_rate=None):
+ num_frames = self.num_frames
+ if not sample_rate:
+ sample_rate = self.sample_rate
+ try:
+ decord_vr = self.v_decoder(video_path)
+ except Exception as e:
+ raise RuntimeError(f"Failed to load {video_path}.") from e
+ total_frames = len(decord_vr)
+ sample_frames_len = sample_rate * num_frames
+
+ if total_frames >= sample_frames_len:
+ s = 0
+ e = s + sample_frames_len
+ else:
+ s = 0
+ e = total_frames
+ num_frames = int(total_frames / sample_frames_len * num_frames)
+ print(
+ f"sample_frames_len {sample_frames_len}, can only sample {num_frames * sample_rate} frames",
+ video_path,
+ total_frames,
+ )
+
+ frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
+ video_data = decord_vr.get_batch(frame_id_list).asnumpy()
+ video_data = torch.from_numpy(video_data)
+ video_data = video_data.permute(3, 0, 1, 2)
+ video_data = self.transform(video_data)
+ return _format_video_shape(video_data)
+
+ def _combine_without_prefix(self, folder_path, prefix="."):
+ folder = []
+ for name in os.listdir(folder_path):
+ if not name.endswith(".mp4"):
+ continue
+ if name[0] == prefix:
+ continue
+ folder.append(os.path.join(folder_path, name))
+ folder.sort()
+ return folder
diff --git a/opensora/eval/cal_flolpips.py b/opensora/models/causalvideovae/eval/cal_flolpips.py
old mode 100644
new mode 100755
similarity index 95%
rename from opensora/eval/cal_flolpips.py
rename to opensora/models/causalvideovae/eval/cal_flolpips.py
index eaf7ca9e1..15cdd771e
--- a/opensora/eval/cal_flolpips.py
+++ b/opensora/models/causalvideovae/eval/cal_flolpips.py
@@ -5,8 +5,8 @@
from einops import rearrange
import sys
sys.path.append(".")
-from opensora.eval.flolpips.pwcnet import Network as PWCNet
-from opensora.eval.flolpips.flolpips import FloLPIPS
+from flolpips.pwcnet import Network as PWCNet
+from flolpips.flolpips import FloLPIPS
loss_fn = FloLPIPS(net='alex', version='0.1').eval().requires_grad_(False)
flownet = PWCNet().eval().requires_grad_(False)
diff --git a/opensora/eval/cal_fvd.py b/opensora/models/causalvideovae/eval/cal_fvd.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/cal_fvd.py
rename to opensora/models/causalvideovae/eval/cal_fvd.py
diff --git a/opensora/eval/cal_lpips.py b/opensora/models/causalvideovae/eval/cal_lpips.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/cal_lpips.py
rename to opensora/models/causalvideovae/eval/cal_lpips.py
diff --git a/opensora/eval/cal_psnr.py b/opensora/models/causalvideovae/eval/cal_psnr.py
old mode 100644
new mode 100755
similarity index 89%
rename from opensora/eval/cal_psnr.py
rename to opensora/models/causalvideovae/eval/cal_psnr.py
index b325106c6..08d6b94db
--- a/opensora/eval/cal_psnr.py
+++ b/opensora/models/causalvideovae/eval/cal_psnr.py
@@ -3,6 +3,18 @@
from tqdm import tqdm
import math
+def img_psnr_cuda(img1, img2):
+ # images are expected in [0, 1]
+ mse = torch.mean((img1 - img2) ** 2)
+ if mse < 1e-10:
+ return 100
+ psnr = 20 * torch.log10(1 / torch.sqrt(mse))
+ return psnr
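+# Illustrative example: identical images give mse < 1e-10 and return 100;
+# mse = 0.01 gives 20 * log10(1 / 0.1) = 20 dB.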
+
+
def img_psnr(img1, img2):
# [0,1]
# compute mse
@@ -14,6 +26,7 @@ def img_psnr(img1, img2):
psnr = 20 * math.log10(1 / math.sqrt(mse))
return psnr
+
def trans(x):
return x
diff --git a/opensora/eval/cal_ssim.py b/opensora/models/causalvideovae/eval/cal_ssim.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/cal_ssim.py
rename to opensora/models/causalvideovae/eval/cal_ssim.py
diff --git a/opensora/eval/eval_clip_score.py b/opensora/models/causalvideovae/eval/eval_clip_score.py
similarity index 100%
rename from opensora/eval/eval_clip_score.py
rename to opensora/models/causalvideovae/eval/eval_clip_score.py
diff --git a/opensora/eval/flolpips/correlation/correlation.py b/opensora/models/causalvideovae/eval/flolpips/correlation/correlation.py
old mode 100644
new mode 100755
similarity index 97%
rename from opensora/eval/flolpips/correlation/correlation.py
rename to opensora/models/causalvideovae/eval/flolpips/correlation/correlation.py
index 7c91055ba..5f88f0107
--- a/opensora/eval/flolpips/correlation/correlation.py
+++ b/opensora/models/causalvideovae/eval/flolpips/correlation/correlation.py
@@ -2,7 +2,7 @@
import torch
-import cupy
+import cupy#add
import re
kernel_Correlation_rearrange = '''
diff --git a/opensora/eval/flolpips/flolpips.py b/opensora/models/causalvideovae/eval/flolpips/flolpips.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/flolpips/flolpips.py
rename to opensora/models/causalvideovae/eval/flolpips/flolpips.py
diff --git a/opensora/eval/flolpips/pretrained_networks.py b/opensora/models/causalvideovae/eval/flolpips/pretrained_networks.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/flolpips/pretrained_networks.py
rename to opensora/models/causalvideovae/eval/flolpips/pretrained_networks.py
diff --git a/opensora/eval/flolpips/pwcnet.py b/opensora/models/causalvideovae/eval/flolpips/pwcnet.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/flolpips/pwcnet.py
rename to opensora/models/causalvideovae/eval/flolpips/pwcnet.py
diff --git a/opensora/eval/flolpips/utils.py b/opensora/models/causalvideovae/eval/flolpips/utils.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/flolpips/utils.py
rename to opensora/models/causalvideovae/eval/flolpips/utils.py
diff --git a/opensora/eval/fvd/styleganv/fvd.py b/opensora/models/causalvideovae/eval/fvd/styleganv/fvd.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/fvd/styleganv/fvd.py
rename to opensora/models/causalvideovae/eval/fvd/styleganv/fvd.py
diff --git a/opensora/eval/fvd/videogpt/fvd.py b/opensora/models/causalvideovae/eval/fvd/videogpt/fvd.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/fvd/videogpt/fvd.py
rename to opensora/models/causalvideovae/eval/fvd/videogpt/fvd.py
diff --git a/opensora/eval/fvd/videogpt/pytorch_i3d.py b/opensora/models/causalvideovae/eval/fvd/videogpt/pytorch_i3d.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/fvd/videogpt/pytorch_i3d.py
rename to opensora/models/causalvideovae/eval/fvd/videogpt/pytorch_i3d.py
diff --git a/opensora/eval/script/cal_clip_score.sh b/opensora/models/causalvideovae/eval/script/cal_clip_score.sh
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/script/cal_clip_score.sh
rename to opensora/models/causalvideovae/eval/script/cal_clip_score.sh
diff --git a/opensora/eval/script/cal_fvd.sh b/opensora/models/causalvideovae/eval/script/cal_fvd.sh
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/script/cal_fvd.sh
rename to opensora/models/causalvideovae/eval/script/cal_fvd.sh
diff --git a/opensora/eval/script/cal_lpips.sh b/opensora/models/causalvideovae/eval/script/cal_lpips.sh
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/script/cal_lpips.sh
rename to opensora/models/causalvideovae/eval/script/cal_lpips.sh
diff --git a/opensora/eval/script/cal_psnr.sh b/opensora/models/causalvideovae/eval/script/cal_psnr.sh
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/script/cal_psnr.sh
rename to opensora/models/causalvideovae/eval/script/cal_psnr.sh
diff --git a/opensora/eval/script/cal_ssim.sh b/opensora/models/causalvideovae/eval/script/cal_ssim.sh
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/eval/script/cal_ssim.sh
rename to opensora/models/causalvideovae/eval/script/cal_ssim.sh
diff --git a/opensora/models/causalvideovae/model/__init__.py b/opensora/models/causalvideovae/model/__init__.py
new file mode 100755
index 000000000..ac357a09d
--- /dev/null
+++ b/opensora/models/causalvideovae/model/__init__.py
@@ -0,0 +1,4 @@
+from .registry import ModelRegistry
+from .vae import (
+ CausalVAEModel, WFVAEModel
+)
diff --git a/opensora/models/ae/videobase/configuration_videobase.py b/opensora/models/causalvideovae/model/configuration_videobase.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/models/ae/videobase/configuration_videobase.py
rename to opensora/models/causalvideovae/model/configuration_videobase.py
diff --git a/opensora/models/ae/videobase/dataset_videobase.py b/opensora/models/causalvideovae/model/dataset_videobase.py
old mode 100644
new mode 100755
similarity index 97%
rename from opensora/models/ae/videobase/dataset_videobase.py
rename to opensora/models/causalvideovae/model/dataset_videobase.py
index 32f842f63..564bc1f72
--- a/opensora/models/ae/videobase/dataset_videobase.py
+++ b/opensora/models/causalvideovae/model/dataset_videobase.py
@@ -9,8 +9,8 @@
import torch.nn.functional as F
from torchvision.transforms import Lambda
-from ....dataset.transform import ToTensorVideo, CenterCropVideo
-from ....utils.dataset_utils import DecordInit
+from ..dataset.transform import ToTensorVideo, CenterCropVideo
+from ..utils.dataset_utils import DecordInit
def TemporalRandomCrop(total_frames, size):
"""
diff --git a/opensora/models/causalvideovae/model/ema_model.py b/opensora/models/causalvideovae/model/ema_model.py
new file mode 100755
index 000000000..c4cf50172
--- /dev/null
+++ b/opensora/models/causalvideovae/model/ema_model.py
@@ -0,0 +1,31 @@
+class EMA:
+ def __init__(self, model, decay):
+ self.model = model
+ self.decay = decay
+ self.shadow = {}
+ self.backup = {}
+
+ def register(self):
+ for name, param in self.model.named_parameters():
+ if param.requires_grad:
+ self.shadow[name] = param.data.clone()
+
+ def update(self):
+ for name, param in self.model.named_parameters():
+ if name in self.shadow:
+ new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
+ self.shadow[name] = new_average.clone()
+
+ def apply_shadow(self):
+ for name, param in self.model.named_parameters():
+ if name in self.shadow:
+ self.backup[name] = param.data
+ param.data = self.shadow[name]
+
+ def restore(self):
+ for name, param in self.model.named_parameters():
+ if name in self.backup:
+ param.data = self.backup[name]
+ self.backup = {}
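+# Typical usage (illustrative sketch; train_step/evaluate are hypothetical):
+# ema = EMA(model, decay=0.999)
+# ema.register()                 # snapshot the initial weights
+# for batch in loader:
+#     train_step(model, batch)
+#     ema.update()               # shadow <- decay * shadow + (1 - decay) * param
+# ema.apply_shadow()             # swap in EMA weights for evaluation
+# evaluate(model)
+# ema.restore()                  # swap the raw training weights back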
+
+
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/losses/__init__.py b/opensora/models/causalvideovae/model/losses/__init__.py
old mode 100644
new mode 100755
similarity index 70%
rename from opensora/models/ae/videobase/losses/__init__.py
rename to opensora/models/causalvideovae/model/losses/__init__.py
index 91e59504e..07e681263
--- a/opensora/models/ae/videobase/losses/__init__.py
+++ b/opensora/models/causalvideovae/model/losses/__init__.py
@@ -1 +1 @@
-from .perceptual_loss import SimpleLPIPS, LPIPSWithDiscriminator, LPIPSWithDiscriminator3D
\ No newline at end of file
+from .perceptual_loss import SimpleLPIPS, LPIPSWithDiscriminator, LPIPSWithDiscriminator3D
diff --git a/opensora/models/ae/videobase/losses/discriminator.py b/opensora/models/causalvideovae/model/losses/discriminator.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/models/ae/videobase/losses/discriminator.py
rename to opensora/models/causalvideovae/model/losses/discriminator.py
diff --git a/opensora/models/ae/videobase/losses/lpips.py b/opensora/models/causalvideovae/model/losses/lpips.py
old mode 100644
new mode 100755
similarity index 97%
rename from opensora/models/ae/videobase/losses/lpips.py
rename to opensora/models/causalvideovae/model/losses/lpips.py
index 8b7062cdd..b51a73881
--- a/opensora/models/ae/videobase/losses/lpips.py
+++ b/opensora/models/causalvideovae/model/losses/lpips.py
@@ -4,7 +4,7 @@
import torch.nn as nn
from torchvision import models
from collections import namedtuple
-from .....utils.taming_download import get_ckpt_path
+from ...utils.taming_download import get_ckpt_path
class LPIPS(nn.Module):
# Learned perceptual metric
@@ -23,7 +23,7 @@ def __init__(self, use_dropout=True):
param.requires_grad = False
def load_from_pretrained(self, name="vgg_lpips"):
- ckpt = get_ckpt_path(name, "taming/modules/autoencoder/lpips")
+ ckpt = get_ckpt_path(name, ".cache/lpips")
self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
print("loaded pretrained LPIPS loss from {}".format(ckpt))
diff --git a/opensora/models/ae/videobase/losses/perceptual_loss.py b/opensora/models/causalvideovae/model/losses/perceptual_loss.py
old mode 100644
new mode 100755
similarity index 71%
rename from opensora/models/ae/videobase/losses/perceptual_loss.py
rename to opensora/models/causalvideovae/model/losses/perceptual_loss.py
index e4042d052..eb99daae2
--- a/opensora/models/ae/videobase/losses/perceptual_loss.py
+++ b/opensora/models/causalvideovae/model/losses/perceptual_loss.py
@@ -71,6 +71,7 @@ def __init__(
use_actnorm=False,
disc_conditional=False,
disc_loss="hinge",
+ loss_type: str = "l1"
):
super().__init__()
@@ -89,19 +90,14 @@ def __init__(
self.disc_factor = disc_factor
self.discriminator_weight = disc_weight
self.disc_conditional = disc_conditional
-
+ self.loss_func = l1 if loss_type == "l1" else l2
+
def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
- if last_layer is not None:
- nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
- g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
- else:
- nll_grads = torch.autograd.grad(
- nll_loss, self.last_layer[0], retain_graph=True
- )[0]
- g_grads = torch.autograd.grad(
- g_loss, self.last_layer[0], retain_graph=True
- )[0]
+ layer = last_layer if last_layer is not None else self.last_layer[0]
+ nll_grads = torch.autograd.grad(nll_loss, layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, layer, retain_graph=True)[0]
+
d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
d_weight = d_weight * self.discriminator_weight
@@ -119,47 +115,38 @@ def forward(
last_layer=None,
cond=None,
):
+ # GAN Part
inputs = rearrange(inputs, "b c t h w -> (b t) c h w").contiguous()
reconstructions = rearrange(
reconstructions, "b c t h w -> (b t) c h w"
).contiguous()
- rec_loss = torch.abs(inputs - reconstructions)
- if self.perceptual_weight > 0:
- p_loss = self.perceptual_loss(inputs, reconstructions)
- rec_loss = rec_loss + self.perceptual_weight * p_loss
- nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
- weighted_nll_loss = nll_loss
- if weights is not None:
- weighted_nll_loss = weights * nll_loss
- weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
- nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
- kl_loss = posteriors.kl()
- kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
-
- # GAN Part
if optimizer_idx == 0:
- # generator update
- if cond is None:
- assert not self.disc_conditional
- logits_fake = self.discriminator(reconstructions.contiguous())
- else:
- assert self.disc_conditional
- logits_fake = self.discriminator(
- torch.cat((reconstructions.contiguous(), cond), dim=1)
- )
+ rec_loss = self.loss_func(inputs, reconstructions)
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs, reconstructions)
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+ nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
+ weighted_nll_loss = nll_loss
+ if weights is not None:
+ weighted_nll_loss = weights * nll_loss
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ kl_loss = posteriors.kl()
+ kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+
+ logits_fake = self.discriminator(reconstructions)
g_loss = -torch.mean(logits_fake)
-
- if self.disc_factor > 0.0:
- try:
+ if global_step >= self.discriminator_iter_start:
+ if self.disc_factor > 0.0:
d_weight = self.calculate_adaptive_weight(
nll_loss, g_loss, last_layer=last_layer
)
- except RuntimeError:
- assert not self.training
- d_weight = torch.tensor(0.0)
+ else:
+ d_weight = torch.tensor(1.0)
else:
d_weight = torch.tensor(0.0)
-
+ g_loss = torch.tensor(0.0, requires_grad=True)
+
disc_factor = adopt_weight(
self.disc_factor, global_step, threshold=self.discriminator_iter_start
)
@@ -221,6 +208,8 @@ def __init__(
use_actnorm=False,
disc_conditional=False,
disc_loss="hinge",
+ learn_logvar: bool = False,
+ loss_type: str = "l1"
):
super().__init__()
@@ -229,8 +218,9 @@ def __init__(
self.pixel_weight = pixelloss_weight
self.perceptual_loss = LPIPS().eval()
self.perceptual_weight = perceptual_weight
- self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
-
+ self.logvar = nn.Parameter(
+ torch.full((), logvar_init), requires_grad=learn_logvar
+ )
self.discriminator = NLayerDiscriminator3D(
input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm
).apply(weights_init)
@@ -239,19 +229,14 @@ def __init__(
self.disc_factor = disc_factor
self.discriminator_weight = disc_weight
self.disc_conditional = disc_conditional
-
+ self.loss_func = l1 if loss_type == "l1" else l2
+
def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
- if last_layer is not None:
- nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
- g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
- else:
- nll_grads = torch.autograd.grad(
- nll_loss, self.last_layer[0], retain_graph=True
- )[0]
- g_grads = torch.autograd.grad(
- g_loss, self.last_layer[0], retain_graph=True
- )[0]
+ layer = last_layer if last_layer is not None else self.last_layer[0]
+ nll_grads = torch.autograd.grad(nll_loss, layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, layer, retain_graph=True)[0]
+
d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
d_weight = d_weight * self.discriminator_weight
@@ -267,53 +252,47 @@ def forward(
split="train",
weights=None,
last_layer=None,
- cond=None,
+ cond=None
):
+
t = inputs.shape[2]
- inputs = rearrange(inputs, "b c t h w -> (b t) c h w").contiguous()
- reconstructions = rearrange(
- reconstructions, "b c t h w -> (b t) c h w"
- ).contiguous()
- rec_loss = torch.abs(inputs - reconstructions)
- if self.perceptual_weight > 0:
- p_loss = self.perceptual_loss(inputs, reconstructions)
- rec_loss = rec_loss + self.perceptual_weight * p_loss
- nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
- weighted_nll_loss = nll_loss
- if weights is not None:
- weighted_nll_loss = weights * nll_loss
- weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
- nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
- kl_loss = posteriors.kl()
- kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
- inputs = rearrange(inputs, "(b t) c h w -> b c t h w", t=t).contiguous()
- reconstructions = rearrange(
- reconstructions, "(b t) c h w -> b c t h w", t=t
- ).contiguous()
# GAN Part
if optimizer_idx == 0:
- # generator update
- if cond is None:
- assert not self.disc_conditional
- logits_fake = self.discriminator(reconstructions)
- else:
- assert self.disc_conditional
- logits_fake = self.discriminator(
- torch.cat((reconstructions, cond), dim=1)
- )
+ inputs = rearrange(inputs, "b c t h w -> (b t) c h w").contiguous()
+ reconstructions = rearrange(
+ reconstructions, "b c t h w -> (b t) c h w"
+ ).contiguous()
+ rec_loss = self.loss_func(inputs, reconstructions)
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs, reconstructions)
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+ nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
+ weighted_nll_loss = nll_loss
+ if weights is not None:
+ weighted_nll_loss = weights * nll_loss
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ kl_loss = posteriors.kl()
+ kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+
+ inputs = rearrange(inputs, "(b t) c h w -> b c t h w", t=t).contiguous()
+ reconstructions = rearrange(
+ reconstructions, "(b t) c h w -> b c t h w", t=t
+ ).contiguous()
+
+ logits_fake = self.discriminator(reconstructions)
g_loss = -torch.mean(logits_fake)
-
- if self.disc_factor > 0.0:
- try:
+ if global_step >= self.discriminator_iter_start:
+ if self.disc_factor > 0.0:
d_weight = self.calculate_adaptive_weight(
nll_loss, g_loss, last_layer=last_layer
)
- except RuntimeError as e:
- assert not self.training, print(e)
- d_weight = torch.tensor(0.0)
+ else:
+ d_weight = torch.tensor(1.0)
else:
d_weight = torch.tensor(0.0)
-
+ g_loss = torch.tensor(0.0, requires_grad=True)
+
disc_factor = adopt_weight(
self.disc_factor, global_step, threshold=self.discriminator_iter_start
)
@@ -333,22 +312,14 @@ def forward(
"{}/g_loss".format(split): g_loss.detach().mean(),
}
return loss, log
-
- if optimizer_idx == 1:
- if cond is None:
- logits_real = self.discriminator(inputs.contiguous().detach())
- logits_fake = self.discriminator(reconstructions.contiguous().detach())
- else:
- logits_real = self.discriminator(
- torch.cat((inputs.contiguous().detach(), cond), dim=1)
- )
- logits_fake = self.discriminator(
- torch.cat((reconstructions.contiguous().detach(), cond), dim=1)
- )
+ elif optimizer_idx == 1:
+ logits_real = self.discriminator(inputs.contiguous().detach())
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
disc_factor = adopt_weight(
self.disc_factor, global_step, threshold=self.discriminator_iter_start
)
+
d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
log = {
@@ -367,6 +338,8 @@ def __init__(
pixelloss_weight=1.0,
perceptual_weight=1.0,
disc_loss="hinge",
+ learn_logvar: bool = False,
+ **kwargs
):
super().__init__()
@@ -375,7 +348,9 @@ def __init__(
self.pixel_weight = pixelloss_weight
self.perceptual_loss = LPIPS().eval()
self.perceptual_weight = perceptual_weight
- self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
+ self.logvar = nn.Parameter(
+ torch.full((), logvar_init), requires_grad=learn_logvar
+ )
def forward(
self,
diff --git a/opensora/models/ae/videobase/modeling_videobase.py b/opensora/models/causalvideovae/model/modeling_videobase.py
old mode 100644
new mode 100755
similarity index 64%
rename from opensora/models/ae/videobase/modeling_videobase.py
rename to opensora/models/causalvideovae/model/modeling_videobase.py
index 0b2274ec5..260575675
--- a/opensora/models/ae/videobase/modeling_videobase.py
+++ b/opensora/models/causalvideovae/model/modeling_videobase.py
@@ -9,34 +9,8 @@
from typing import Optional, Union
import glob
-class VideoBaseAE(nn.Module):
- _supports_gradient_checkpointing = False
-
- def __init__(self, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
-
- @classmethod
- def load_from_checkpoint(cls, model_path):
- with open(os.path.join(model_path, "config.json"), "r") as file:
- config = json.load(file)
- state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location="cpu")
- if 'state_dict' in state_dict:
- state_dict = state_dict['state_dict']
- model = cls(config=cls.CONFIGURATION_CLS(**config))
- model.load_state_dict(state_dict)
- return model
-
- @classmethod
- def download_and_load_model(cls, model_name, cache_dir=None):
- pass
-
- def encode(self, x: torch.Tensor, *args, **kwargs):
- pass
-
- def decode(self, encoding: torch.Tensor, *args, **kwargs):
- pass
-class VideoBaseAE_PL(pl.LightningModule, ModelMixin, ConfigMixin):
+class VideoBaseAE(ModelMixin, ConfigMixin):
config_name = "config.json"
def __init__(self, *args, **kwargs) -> None:
@@ -69,11 +43,10 @@ def num_training_steps(self) -> int:
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
ckpt_files = glob.glob(os.path.join(pretrained_model_name_or_path, '*.ckpt'))
if ckpt_files:
- # Adapt to PyTorch Lightning
+ # Adapt to .ckpt checkpoints
last_ckpt_file = ckpt_files[-1]
config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
model = cls.from_config(config_file)
- print("init from {}".format(last_ckpt_file))
model.init_from_ckpt(last_ckpt_file)
return model
else:
diff --git a/opensora/models/causalvideovae/model/modules/__init__.py b/opensora/models/causalvideovae/model/modules/__init__.py
new file mode 100755
index 000000000..7ef41edd2
--- /dev/null
+++ b/opensora/models/causalvideovae/model/modules/__init__.py
@@ -0,0 +1,6 @@
+from .block import Block
+from .attention import *
+from .conv import *
+from .normalize import *
+from .resnet_block import *
+from .updownsample import *
diff --git a/opensora/models/causalvideovae/model/modules/attention.py b/opensora/models/causalvideovae/model/modules/attention.py
new file mode 100755
index 000000000..39c1659cb
--- /dev/null
+++ b/opensora/models/causalvideovae/model/modules/attention.py
@@ -0,0 +1,101 @@
+import torch.nn as nn
+import torch.nn.functional as F
+from .normalize import Normalize
+from .conv import CausalConv3d
+import torch
+from .block import Block
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config, set_run_dtype
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ from xformers import ops as xops
+
+class AttnBlock3D(Block):
+ """Compatible with old versions, there are issues, use with caution."""
+ def __init__(self, in_channels):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels)
+ self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+ self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+ self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+ self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ # compute attention
+ b, c, t, h, w = q.shape
+ q = q.reshape(b * t, c, h * w)
+ q = q.permute(0, 2, 1) # b,hw,c
+ k = k.reshape(b * t, c, h * w) # b,c,hw
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+ w_ = w_ * (int(c) ** (-0.5))
+ w_ = torch.nn.functional.softmax(w_, dim=2)
+
+ # attend to values
+ v = v.reshape(b * t, c, h * w)
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+ h_ = h_.reshape(b, c, t, h, w)
+
+ h_ = self.proj_out(h_)
+
+ return x + h_
+
+class AttnBlock3DFix(nn.Module):
+ """
+ Thanks to https://github.com/PKU-YuanGroup/Open-Sora-Plan/pull/172.
+ """
+ def __init__(self, in_channels, norm_type="groupnorm"):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels, norm_type=norm_type)
+ self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+ self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+ self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+ self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ b, c, t, h, w = q.shape
+ q = q.permute(0, 2, 3, 4, 1).reshape(b * t, h * w, c).contiguous()
+ k = k.permute(0, 2, 3, 4, 1).reshape(b * t, h * w, c).contiguous()
+ v = v.permute(0, 2, 3, 4, 1).reshape(b * t, h * w, c).contiguous()
+
+ if torch_npu is None:
+ attn_output = xops.memory_efficient_attention(
+ q, k, v,
+ scale=c ** -0.5
+ )
+ else:
+ if npu_config.enable_FA and q.dtype == torch.float32:
+ dtype = torch.bfloat16
+ else:
+ dtype = None
+ with set_run_dtype(q, dtype):
+ query, key, value = npu_config.set_current_run_dtype([q, k, v])
+ hidden_states = npu_config.run_attention(query, key, value, atten_mask=None, input_layout="BSH",
+ head_dim=c, head_num=1)
+
+ attn_output = npu_config.restore_dtype(hidden_states)
+
+ attn_output = attn_output.reshape(b, t, h, w, c).permute(0, 4, 1, 2, 3)
+ h_ = self.proj_out(attn_output)
+
+ return x + h_
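+# Illustrative shape check: AttnBlock3DFix(64)(torch.rand(1, 64, 5, 16, 16))
+# returns a (1, 64, 5, 16, 16) tensor; attention is computed per frame over
+# the H*W spatial positions.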
diff --git a/opensora/models/ae/videobase/modules/block.py b/opensora/models/causalvideovae/model/modules/block.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/models/ae/videobase/modules/block.py
rename to opensora/models/causalvideovae/model/modules/block.py
diff --git a/opensora/models/causalvideovae/model/modules/conv.py b/opensora/models/causalvideovae/model/modules/conv.py
new file mode 100755
index 000000000..776cea57c
--- /dev/null
+++ b/opensora/models/causalvideovae/model/modules/conv.py
@@ -0,0 +1,131 @@
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+ torch_npu = None
+ npu_config = None
+
+import torch.nn as nn
+from typing import Union, Tuple
+import torch
+from .block import Block
+from .ops import cast_tuple
+from .ops import video_to_image
+from torch.utils.checkpoint import checkpoint
+import torch.nn.functional as F
+
+
+class Conv2d(nn.Conv2d):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int]] = 3,
+ stride: Union[int, Tuple[int]] = 1,
+ padding: Union[str, int, Tuple[int]] = 0,
+ dilation: Union[int, Tuple[int]] = 1,
+ groups: int = 1,
+ bias: bool = True,
+ padding_mode: str = "zeros",
+ device=None,
+ dtype=None,
+ ) -> None:
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ groups,
+ bias,
+ padding_mode,
+ device,
+ dtype,
+ )
+
+ @video_to_image
+ def forward(self, x):
+ return super().forward(x)
+
+
+
+class CausalConv3d(Block):
+ def __init__(
+ self,
+ chan_in,
+ chan_out,
+ kernel_size: Union[int, Tuple[int, int, int]],
+ enable_cached=False,
+ bias=True,
+ **kwargs,
+ ):
+ super().__init__()
+ self.kernel_size = cast_tuple(kernel_size, 3)
+ self.time_kernel_size = self.kernel_size[0]
+ self.chan_in = chan_in
+ self.chan_out = chan_out
+ self.stride = kwargs.pop("stride", 1)
+ self.padding = kwargs.pop("padding", 0)
+ self.padding = list(cast_tuple(self.padding, 3))
+ self.padding[0] = 0
+ self.stride = cast_tuple(self.stride, 3)
+ self.conv = nn.Conv3d(
+ chan_in,
+ chan_out,
+ self.kernel_size,
+ stride=self.stride,
+ padding=self.padding,
+ bias=bias
+ )
+ self.enable_cached = enable_cached
+ self.causal_cached = None
+
+ def forward(self, x):
+
+ x_dtype = x.dtype
+
+ if self.causal_cached is None:
+ first_frame_pad = x[:, :, :1, :, :].repeat(
+ (1, 1, self.time_kernel_size - 1, 1, 1)
+ )
+ else:
+ first_frame_pad = self.causal_cached
+
+ x = torch.concatenate((first_frame_pad, x), dim=2)
+
+ if self.enable_cached and self.time_kernel_size != 1:
+ if (self.time_kernel_size - 1) // self.stride[0] != 0:
+ self.causal_cached = x[:, :, -(self.time_kernel_size - 1) // self.stride[0]:, :, :]
+ else:
+ self.causal_cached = x[:, :, 0:0, :, :]
+
+ if npu_config is not None and npu_config.on_npu:
+ return npu_config.run_conv3d(self.conv, x, x_dtype)
+ else:
+ x = self.conv(x)
+ return x
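+ # Illustrative shape check (assuming kernel_size=3, stride=1, padding=1):
+ # conv = CausalConv3d(8, 8, kernel_size=3, padding=1)
+ # conv(torch.rand(1, 8, 17, 32, 32)) -> (1, 8, 17, 32, 32); the first frame
+ # is replicated twice on the left, so frame t never sees frames > t.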
+
+
+class CausalConv3d_GC(CausalConv3d):
+ def __init__(
+ self,
+ chan_in,
+ chan_out,
+ kernel_size: Union[int, Tuple[int]],
+ init_method="random",
+ **kwargs
+ ):
+ super().__init__(chan_in, chan_out, kernel_size, **kwargs)
+
+ def forward(self, x):
+ # e.g. 17 frames = 1 image frame + 16 video frames
+ first_frame_pad = x[:, :, :1, :, :].repeat(
+ (1, 1, self.time_kernel_size - 1, 1, 1)
+ ) # b c t h w
+ x = torch.concatenate((first_frame_pad, x), dim=2) # 3 + 16
+ return checkpoint(self.conv, x)
diff --git a/opensora/models/causalvideovae/model/modules/normalize.py b/opensora/models/causalvideovae/model/modules/normalize.py
new file mode 100755
index 000000000..26e7bee76
--- /dev/null
+++ b/opensora/models/causalvideovae/model/modules/normalize.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+from .block import Block
+from einops import rearrange
+
+class GroupNorm(Block):
+ def __init__(self, num_channels, num_groups=32, eps=1e-6, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+ self.norm = torch.nn.GroupNorm(
+ num_groups=num_groups, num_channels=num_channels, eps=eps, affine=True
+ )
+ def forward(self, x):
+ return self.norm(x)
+
+class LayerNorm(Block):
+ def __init__(self, num_channels, eps=1e-6, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+ self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True)
+ def forward(self, x):
+ if x.dim() == 5:
+ x = rearrange(x, "b c t h w -> b t h w c")
+ x = self.norm(x)
+ x = rearrange(x, "b t h w c -> b c t h w")
+ else:
+ x = rearrange(x, "b c h w -> b h w c")
+ x = self.norm(x)
+ x = rearrange(x, "b h w c -> b c h w")
+ return x
+
+def Normalize(in_channels, num_groups=32, norm_type="groupnorm"):
+ if norm_type == "groupnorm":
+ return torch.nn.GroupNorm(
+ num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
+ )
+ elif norm_type == "layernorm":
+ return LayerNorm(num_channels=in_channels, eps=1e-6)
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/modules/ops.py b/opensora/models/causalvideovae/model/modules/ops.py
old mode 100644
new mode 100755
similarity index 59%
rename from opensora/models/ae/videobase/modules/ops.py
rename to opensora/models/causalvideovae/model/modules/ops.py
index fdd262ad7..b160c2ab3
--- a/opensora/models/ae/videobase/modules/ops.py
+++ b/opensora/models/causalvideovae/model/modules/ops.py
@@ -5,9 +5,18 @@ def video_to_image(func):
def wrapper(self, x, *args, **kwargs):
if x.dim() == 5:
t = x.shape[2]
- x = rearrange(x, "b c t h w -> (b t) c h w")
- x = func(self, x, *args, **kwargs)
- x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
+ x = rearrange(x, "b c t h w -> (b t) c h w")
+ x = func(self, x, *args, **kwargs)
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
+ # Alternative (disabled): per-frame Conv2d slice inference:
+ # result = []
+ # for i in range(t):
+ #     frame = x[:, :, i, :, :]
+ #     frame = func(self, frame, *args, **kwargs)
+ #     result.append(frame.unsqueeze(2))
+ # x = torch.concatenate(result, dim=2)
return x
return wrapper
@@ -15,7 +24,7 @@ def nonlinearity(x):
return x * torch.sigmoid(x)
def cast_tuple(t, length=1):
- return t if isinstance(t, tuple) else ((t,) * length)
+ return t if isinstance(t, (tuple, list)) else ((t,) * length)
def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
n_dims = len(x.shape)
diff --git a/opensora/models/ae/videobase/modules/quant.py b/opensora/models/causalvideovae/model/modules/quant.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/models/ae/videobase/modules/quant.py
rename to opensora/models/causalvideovae/model/modules/quant.py
diff --git a/opensora/models/causalvideovae/model/modules/resnet_block.py b/opensora/models/causalvideovae/model/modules/resnet_block.py
new file mode 100755
index 000000000..9c1920264
--- /dev/null
+++ b/opensora/models/causalvideovae/model/modules/resnet_block.py
@@ -0,0 +1,175 @@
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+ torch_npu = None
+ npu_config = None
+
+import torch
+from .normalize import Normalize
+from .ops import nonlinearity, video_to_image
+from .conv import CausalConv3d
+from .block import Block
+from torch.utils.checkpoint import checkpoint
+
+
+class ResnetBlock2D(Block):
+ def __init__(
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ norm_type,
+ dropout,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.use_conv_shortcut = conv_shortcut
+
+ self.norm1 = Normalize(in_channels, norm_type=norm_type)
+ self.conv1 = torch.nn.Conv2d(
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
+ )
+ self.norm2 = Normalize(out_channels, norm_type=norm_type)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = torch.nn.Conv2d(
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
+ )
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ self.conv_shortcut = torch.nn.Conv2d(
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
+ )
+ else:
+ self.nin_shortcut = torch.nn.Conv2d(
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
+ )
+
+ @video_to_image
+ def forward(self, x):
+ h = x
+ if npu_config is None:
+ h = self.norm1(h)
+ else:
+ h = npu_config.run_group_norm(self.norm1, h)
+ h = nonlinearity(h)
+ h = self.conv1(h)
+ if npu_config is None:
+ h = self.norm2(h)
+ else:
+ h = npu_config.run_group_norm(self.norm2, h)
+ h = nonlinearity(h)
+ h = self.dropout(h)
+ h = self.conv2(h)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ x = self.conv_shortcut(x)
+ else:
+ x = self.nin_shortcut(x)
+ x = x + h
+ return x
+
+
+class ResnetBlock3D(Block):
+ def __init__(
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ dropout,
+ norm_type,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.use_conv_shortcut = conv_shortcut
+
+ self.norm1 = Normalize(in_channels, norm_type=norm_type)
+ self.conv1 = CausalConv3d(in_channels, out_channels, 3, padding=1)
+ self.norm2 = Normalize(out_channels, norm_type=norm_type)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = CausalConv3d(out_channels, out_channels, 3, padding=1)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ self.conv_shortcut = CausalConv3d(
+ in_channels, out_channels, 3, padding=1
+ )
+ else:
+ self.nin_shortcut = CausalConv3d(
+ in_channels, out_channels, 1, padding=0
+ )
+
+ def forward(self, x):
+ h = x
+ if npu_config is None:
+ h = self.norm1(h)
+ else:
+ h = npu_config.run_group_norm(self.norm1, h)
+ h = nonlinearity(h)
+ h = self.conv1(h)
+ if npu_config is None:
+ h = self.norm2(h)
+ else:
+ h = npu_config.run_group_norm(self.norm2, h)
+ h = nonlinearity(h)
+ h = self.dropout(h)
+ h = self.conv2(h)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ x = self.conv_shortcut(x)
+ else:
+ x = self.nin_shortcut(x)
+ return x + h
+
+
+class ResnetBlock3D_GC(Block):
+ def __init__(
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ norm_type,
+ dropout,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.use_conv_shortcut = conv_shortcut
+
+ self.norm1 = Normalize(in_channels, norm_type=norm_type)
+ self.conv1 = CausalConv3d(in_channels, out_channels, 3, padding=1)
+ self.norm2 = Normalize(out_channels, norm_type=norm_type)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = CausalConv3d(out_channels, out_channels, 3, padding=1)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ self.conv_shortcut = CausalConv3d(
+ in_channels, out_channels, 3, padding=1
+ )
+ else:
+ self.nin_shortcut = CausalConv3d(
+ in_channels, out_channels, 1, padding=0
+ )
+
+ def forward(self, x):
+ return checkpoint(self._forward, x, use_reentrant=True)
+
+ def _forward(self, x):
+ h = x
+ h = self.norm1(h)
+ h = nonlinearity(h)
+ h = self.conv1(h)
+ h = self.norm2(h)
+ h = nonlinearity(h)
+ h = self.dropout(h)
+ h = self.conv2(h)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ x = self.conv_shortcut(x)
+ else:
+ x = self.nin_shortcut(x)
+ return x + h
diff --git a/opensora/models/causalvideovae/model/modules/updownsample.py b/opensora/models/causalvideovae/model/modules/updownsample.py
new file mode 100755
index 000000000..fea1230e2
--- /dev/null
+++ b/opensora/models/causalvideovae/model/modules/updownsample.py
@@ -0,0 +1,346 @@
+from typing import Union, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .ops import cast_tuple, video_to_image
+from .conv import CausalConv3d, CausalConv3d_GC
+from einops import rearrange
+from .block import Block
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+ torch_npu = None
+ npu_config = None
+
+
+class Upsample(Block):
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.with_conv = True
+ if self.with_conv:
+ self.conv = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ @video_to_image
+ def forward(self, x):
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+ if self.with_conv:
+ x = self.conv(x)
+ return x
+
+class Downsample(Block):
+ def __init__(self, in_channels, out_channels, undown=False):
+ super().__init__()
+ self.with_conv = True
+ self.undown = undown
+ if self.with_conv:
+ # no asymmetric padding in torch conv, must do it ourselves
+ if self.undown:
+ self.conv = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ else:
+ self.conv = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=2,
+ padding=0)
+ @video_to_image
+ def forward(self, x):
+ if self.with_conv:
+ if self.undown:
+ if npu_config is not None and npu_config.on_npu:
+ x_dtype = x.dtype
+ x = x.to(npu_config.replaced_type)
+ x = npu_config.run_conv3d(self.conv, x, x_dtype)
+ else:
+ x = self.conv(x)
+ else:
+ pad = (0, 1, 0, 1)
+ if npu_config is not None and npu_config.on_npu:
+ x_dtype = x.dtype
+ x = x.to(npu_config.replaced_type)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = npu_config.run_conv3d(self.conv, x, x_dtype)
+ else:
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ else:
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+ return x
+
+class SpatialDownsample2x(Block):
+ def __init__(
+ self,
+ chan_in,
+ chan_out,
+ kernel_size: Union[int, Tuple[int]] = (3, 3),
+ stride: Union[int, Tuple[int]] = (2, 2),
+ **kwargs
+ ):
+ super().__init__()
+ kernel_size = cast_tuple(kernel_size, 2)
+ stride = cast_tuple(stride, 2)
+ self.chan_in = chan_in
+ self.chan_out = chan_out
+ self.kernel_size = kernel_size
+ self.conv = CausalConv3d(
+ self.chan_in,
+ self.chan_out,
+ (1,) + self.kernel_size,
+ stride=(1, ) + stride,
+ padding=0
+ )
+
+ def forward(self, x):
+ pad = (0,1,0,1,0,0)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ return x
+
+class SpatialUpsample2x_GC(Block):
+ def __init__(
+ self,
+ chan_in,
+ chan_out,
+ kernel_size: Union[int, Tuple[int]] = (3, 3),
+ stride: Union[int, Tuple[int]] = (1, 1),
+ unup=False,
+ ):
+ super().__init__()
+ self.chan_in = chan_in
+ self.chan_out = chan_out
+ self.kernel_size = kernel_size
+ self.unup = unup
+ self.conv = CausalConv3d_GC(
+ self.chan_in,
+ self.chan_out,
+ (1,) + self.kernel_size,
+ stride=(1, ) + stride,
+ padding=1
+ )
+
+ def forward(self, x):
+ if not self.unup:
+ t = x.shape[2]
+ x = rearrange(x, "b c t h w -> b (c t) h w")
+ x = F.interpolate(x, scale_factor=(2,2), mode="nearest")
+ x = rearrange(x, "b (c t) h w -> b c t h w", t=t)
+ x = self.conv(x)
+ return x
+
+
+class SpatialUpsample2x(Block):
+ def __init__(
+ self,
+ chan_in,
+ chan_out,
+ kernel_size: Union[int, Tuple[int]] = (3, 3),
+ stride: Union[int, Tuple[int]] = (1, 1),
+ unup=False,
+ ):
+ super().__init__()
+ self.chan_in = chan_in
+ self.chan_out = chan_out
+ self.kernel_size = kernel_size
+ self.unup = unup
+ self.conv = CausalConv3d(
+ self.chan_in,
+ self.chan_out,
+ (1,) + self.kernel_size,
+ stride=(1, ) + stride,
+ padding=1
+ )
+
+ def forward(self, x):
+ if not self.unup:
+ t = x.shape[2]
+ x = rearrange(x, "b c t h w -> b (c t) h w")
+ x = F.interpolate(x, scale_factor=(2,2), mode="nearest")
+ x = rearrange(x, "b (c t) h w -> b c t h w", t=t)
+ x = self.conv(x)
+ return x
+
+class TimeDownsample2x(Block):
+ def __init__(
+ self,
+ chan_in,
+ chan_out,
+ kernel_size: int = 3
+ ):
+ super().__init__()
+ self.kernel_size = kernel_size
+ if npu_config is not None and npu_config.on_npu:
+ self.avg_pool = nn.AvgPool2d((kernel_size, 1), stride=(2, 1))
+ self.pad = nn.ReplicationPad3d((0, 0, 0, 0, self.kernel_size - 1, 0))
+ else:
+ self.conv = nn.AvgPool3d((kernel_size, 1, 1), stride=(2, 1, 1))
+
+ def forward(self, x):
+ if npu_config is not None and npu_config.on_npu:
+ n, c, d, h, w = x.shape
+ x = self.pad(x)
+ x = x.view(n * c, -1, h * w)
+ pooled = self.avg_pool(x)
+ output = pooled.view(n, c, -1, h, w)
+ return output
+ else:
+ first_frame_pad = x[:, :, :1, :, :].repeat(
+ (1, 1, self.kernel_size - 1, 1, 1)
+ )
+ x = torch.concatenate((first_frame_pad, x), dim=2)
+ return self.conv(x)
+
+class TimeUpsample2x(Block):
+ def __init__(
+ self,
+ chan_in,
+ chan_out
+ ):
+        super().__init__()
+
+    def forward(self, x):
+        if x.size(2) > 1:
+            x, x_ = x[:, :, :1], x[:, :, 1:]
+            x_ = F.interpolate(x_, scale_factor=(2, 1, 1), mode='trilinear')
+            x = torch.concat([x, x_], dim=2)
+ return x
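+
+    # Shape note (assumed behavior): the first frame is kept as-is and only
+    # frames 1..T-1 are interpolated 2x in time, so a causal clip of T frames
+    # becomes 1 + 2 * (T - 1) = 2T - 1 frames, e.g. (b, c, 9, h, w) ->
+    # (b, c, 17, h, w); a single frame passes through unchanged.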
+
+class TimeDownsampleRes2x(Block):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size: int = 3,
+ mix_factor: float = 2.0,
+ ):
+ super().__init__()
+ self.kernel_size = cast_tuple(kernel_size, 3)
+ if npu_config is not None and npu_config.on_npu:
+ self.avg_pool = nn.AvgPool2d((kernel_size, 1), stride=(2, 1))
+ self.pad = nn.ReplicationPad3d((0, 0, 0, 0, kernel_size - 1, 0))
+ else:
+ self.avg_pool = nn.AvgPool3d((kernel_size, 1, 1), stride=(2, 1, 1))
+ self.conv = nn.Conv3d(
+ in_channels, out_channels, self.kernel_size, stride=(2,1,1), padding=(0,1,1)
+ )
+ self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
+
+ def forward(self, x):
+ alpha = torch.sigmoid(self.mix_factor)
+ if npu_config is not None and npu_config.on_npu:
+ n, c, d, h, w = x.shape
+ x_dtype = x.dtype
+ x = x.to(npu_config.replaced_type)
+ x = self.pad(x)
+ pad_x = x.view(n, c, -1, h, w)
+ avg_x = self.avg_pool(x.view(n * c, -1, h * w)).view(n, c, -1, h, w).to(x_dtype)
+ conv_x = npu_config.run_conv3d(self.conv, pad_x, x_dtype)
+ return alpha * avg_x + (1 - alpha) * conv_x
+ else:
+ first_frame_pad = x[:, :, :1, :, :].repeat(
+ (1, 1, self.kernel_size[0] - 1, 1, 1)
+ )
+ x = torch.concatenate((first_frame_pad, x), dim=2)
+ return alpha * self.avg_pool(x) + (1 - alpha) * self.conv(x)
+
+class TimeUpsampleRes2x(Block):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size: int = 3,
+ mix_factor: float = 2.0,
+ ):
+ super().__init__()
+ self.conv = CausalConv3d(
+ in_channels, out_channels, kernel_size, padding=1
+ )
+ self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
+
+ def forward(self, x):
+ alpha = torch.sigmoid(self.mix_factor)
+ if x.size(2) > 1:
+            x, x_ = x[:, :, :1], x[:, :, 1:]
+ if npu_config is not None and npu_config.on_npu:
+ x_dtype = x_.dtype
+ x_ = x_.to(npu_config.replaced_type)
+ x_ = F.interpolate(x_, scale_factor=(2, 1, 1), mode='trilinear')
+ x_ = x_.to(x_dtype)
+ else:
+                x_ = F.interpolate(x_, scale_factor=(2, 1, 1), mode='trilinear')
+ x = torch.concat([x, x_], dim=2)
+ return alpha * x + (1-alpha) * self.conv(x)
+
+class Spatial2xTime2x3DDownsample(Block):
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.conv = CausalConv3d(in_channels, out_channels, kernel_size=3, padding=0, stride=2)
+
+ def forward(self, x):
+ pad = (0,1,0,1,0,0)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ return x
+
+class Spatial2x3DDownsample(Block):
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.conv = CausalConv3d(in_channels, out_channels, kernel_size=3, padding=0, stride=(1,2,2))
+
+ def forward(self, x):
+ pad = (0,1,0,1,0,0)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ return x
+
+
+class Spatial2x3DUpsample(Block):
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.conv = CausalConv3d(in_channels, out_channels, kernel_size=3, padding=1)
+
+ def forward(self, x):
+ x = F.interpolate(x, scale_factor=(1,2,2), mode='trilinear')
+ return self.conv(x)
+
+class Spatial2xTime2x3DUpsample(Block):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ t_interpolation="trilinear",
+ enable_cached=False,
+ ):
+ super().__init__()
+ self.t_interpolation = t_interpolation
+ self.conv = CausalConv3d(in_channels, out_channels, kernel_size=3, padding=1)
+ self.enable_cached = enable_cached
+ self.causal_cached = None
+
+ def forward(self, x):
+ if x.size(2) > 1:
+ if self.enable_cached and self.causal_cached:
+ x = F.interpolate(x, scale_factor=(2, 1, 1), mode=self.t_interpolation)
+ x = F.interpolate(x, scale_factor=(1, 2, 2), mode="trilinear")
+ else:
+ x, x_ = x[:, :, :1], x[:, :, 1:]
+ x_ = F.interpolate(
+ x_, scale_factor=(2, 1, 1), mode=self.t_interpolation
+ )
+ x_ = F.interpolate(x_, scale_factor=(1, 2, 2), mode="trilinear")
+ x = F.interpolate(x, scale_factor=(1, 2, 2), mode="trilinear")
+ x = torch.concat([x, x_], dim=2)
+ self.causal_cached = True
+ else:
+ if self.enable_cached and not self.causal_cached:
+ self.causal_cached = True
+ x = F.interpolate(x, scale_factor=(1, 2, 2), mode="trilinear")
+ return self.conv(x)
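+
+    # Cache note (assumed semantics): with enable_cached=True the first chunk
+    # of a tiled decode keeps its leading frame un-doubled (T -> 2T - 1) and
+    # sets causal_cached, so every later chunk is upsampled uniformly
+    # (T -> 2T); stitched chunks then match a single-pass decode frame count.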
+
+
\ No newline at end of file
diff --git a/opensora/models/causalvideovae/model/registry.py b/opensora/models/causalvideovae/model/registry.py
new file mode 100644
index 000000000..36dec42aa
--- /dev/null
+++ b/opensora/models/causalvideovae/model/registry.py
@@ -0,0 +1,13 @@
+class ModelRegistry:
+ _models = {}
+
+ @classmethod
+ def register(cls, model_name):
+ def decorator(model_class):
+ cls._models[model_name] = model_class
+ return model_class
+ return decorator
+
+ @classmethod
+ def get_model(cls, model_name):
+ return cls._models.get(model_name)
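+
+
+# A minimal usage sketch (assumed; ToyModel is a hypothetical stand-in):
+# model classes self-register under a string key and are resolved by name.
+if __name__ == "__main__":
+    @ModelRegistry.register("ToyModel")
+    class ToyModel:
+        pass
+
+    assert ModelRegistry.get_model("ToyModel") is ToyModel
+    assert ModelRegistry.get_model("missing") is None  # dict.get() default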
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/trainer_videobase.py b/opensora/models/causalvideovae/model/trainer_videobase.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/models/ae/videobase/trainer_videobase.py
rename to opensora/models/causalvideovae/model/trainer_videobase.py
diff --git a/opensora/models/causalvideovae/model/utils/__init__.py b/opensora/models/causalvideovae/model/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/models/ae/videobase/utils/distrib_utils.py b/opensora/models/causalvideovae/model/utils/distrib_utils.py
old mode 100644
new mode 100755
similarity index 85%
rename from opensora/models/ae/videobase/utils/distrib_utils.py
rename to opensora/models/causalvideovae/model/utils/distrib_utils.py
index 760c0673f..d0df8979b
--- a/opensora/models/ae/videobase/utils/distrib_utils.py
+++ b/opensora/models/causalvideovae/model/utils/distrib_utils.py
@@ -10,10 +10,10 @@ def __init__(self, parameters, deterministic=False):
self.std = torch.exp(0.5 * self.logvar)
self.var = torch.exp(self.logvar)
if self.deterministic:
- self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
+ self.var = self.std = torch.zeros_like(self.mean, device=self.parameters.device, dtype=self.parameters.dtype)
def sample(self):
- x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+ x = self.mean + self.std * torch.randn(self.mean.shape, device=self.parameters.device, dtype=self.parameters.dtype)
return x
def kl(self, other=None):
diff --git a/opensora/models/ae/videobase/utils/module_utils.py b/opensora/models/causalvideovae/model/utils/module_utils.py
old mode 100644
new mode 100755
similarity index 89%
rename from opensora/models/ae/videobase/utils/module_utils.py
rename to opensora/models/causalvideovae/model/utils/module_utils.py
index 187888aa1..3f843e4f3
--- a/opensora/models/ae/videobase/utils/module_utils.py
+++ b/opensora/models/causalvideovae/model/utils/module_utils.py
@@ -1,7 +1,7 @@
import importlib
Module = str
-MODULES_BASE = "opensora.models.ae.videobase.modules."
+MODULES_BASE = "opensora.models.causalvideovae.model.modules."
def resolve_str_to_obj(str_val, append=True):
if append:
diff --git a/opensora/models/ae/videobase/utils/scheduler_utils.py b/opensora/models/causalvideovae/model/utils/scheduler_utils.py
old mode 100644
new mode 100755
similarity index 100%
rename from opensora/models/ae/videobase/utils/scheduler_utils.py
rename to opensora/models/causalvideovae/model/utils/scheduler_utils.py
diff --git a/opensora/models/ae/videobase/utils/video_utils.py b/opensora/models/causalvideovae/model/utils/video_utils.py
old mode 100644
new mode 100755
similarity index 60%
rename from opensora/models/ae/videobase/utils/video_utils.py
rename to opensora/models/causalvideovae/model/utils/video_utils.py
index a038fcca7..4fbd3f04c
--- a/opensora/models/ae/videobase/utils/video_utils.py
+++ b/opensora/models/causalvideovae/model/utils/video_utils.py
@@ -2,9 +2,9 @@
import numpy as np
def tensor_to_video(x):
- x = x.detach().cpu()
+ x = (x * 2 - 1).detach().cpu()
x = torch.clamp(x, -1, 1)
x = (x + 1) / 2
- x = x.permute(1, 0, 2, 3).float().numpy() # c t h w ->
+ x = x.permute(1, 0, 2, 3).float().numpy() # c t h w -> t c h w
x = (255 * x).astype(np.uint8)
return x
\ No newline at end of file
diff --git a/opensora/models/causalvideovae/model/utils/wavelet_utils.py b/opensora/models/causalvideovae/model/utils/wavelet_utils.py
new file mode 100644
index 000000000..ac3a9e3bc
--- /dev/null
+++ b/opensora/models/causalvideovae/model/utils/wavelet_utils.py
@@ -0,0 +1,224 @@
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from ..modules import CausalConv3d
+from einops import rearrange
+
+class HaarWaveletTransform3D(nn.Module):
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+ h = torch.tensor([[[1, 1], [1, 1]], [[1, 1], [1, 1]]]) * 0.3536
+ g = torch.tensor([[[1, -1], [1, -1]], [[1, -1], [1, -1]]]) * 0.3536
+ hh = torch.tensor([[[1, 1], [-1, -1]], [[1, 1], [-1, -1]]]) * 0.3536
+ gh = torch.tensor([[[1, -1], [-1, 1]], [[1, -1], [-1, 1]]]) * 0.3536
+ h_v = torch.tensor([[[1, 1], [1, 1]], [[-1, -1], [-1, -1]]]) * 0.3536
+ g_v = torch.tensor([[[1, -1], [1, -1]], [[-1, 1], [-1, 1]]]) * 0.3536
+ hh_v = torch.tensor([[[1, 1], [-1, -1]], [[-1, -1], [1, 1]]]) * 0.3536
+ gh_v = torch.tensor([[[1, -1], [-1, 1]], [[-1, 1], [1, -1]]]) * 0.3536
+ h = h.view(1, 1, 2, 2, 2)
+ g = g.view(1, 1, 2, 2, 2)
+ hh = hh.view(1, 1, 2, 2, 2)
+ gh = gh.view(1, 1, 2, 2, 2)
+ h_v = h_v.view(1, 1, 2, 2, 2)
+ g_v = g_v.view(1, 1, 2, 2, 2)
+ hh_v = hh_v.view(1, 1, 2, 2, 2)
+ gh_v = gh_v.view(1, 1, 2, 2, 2)
+
+ self.h_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.g_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.hh_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.gh_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.h_v_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.g_v_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.hh_v_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+ self.gh_v_conv = CausalConv3d(1, 1, 2, padding=0, stride=2, bias=False)
+
+ self.h_conv.conv.weight.data = h
+ self.g_conv.conv.weight.data = g
+ self.hh_conv.conv.weight.data = hh
+ self.gh_conv.conv.weight.data = gh
+ self.h_v_conv.conv.weight.data = h_v
+ self.g_v_conv.conv.weight.data = g_v
+ self.hh_v_conv.conv.weight.data = hh_v
+ self.gh_v_conv.conv.weight.data = gh_v
+ self.h_conv.requires_grad_(False)
+ self.g_conv.requires_grad_(False)
+ self.hh_conv.requires_grad_(False)
+ self.gh_conv.requires_grad_(False)
+ self.h_v_conv.requires_grad_(False)
+ self.g_v_conv.requires_grad_(False)
+ self.hh_v_conv.requires_grad_(False)
+ self.gh_v_conv.requires_grad_(False)
+
+ def forward(self, x):
+ assert x.dim() == 5
+ b = x.shape[0]
+ x = rearrange(x, "b c t h w -> (b c) 1 t h w")
+ low_low_low = self.h_conv(x)
+ low_low_low = rearrange(low_low_low, "(b c) 1 t h w -> b c t h w", b=b)
+ low_low_high = self.g_conv(x)
+ low_low_high = rearrange(low_low_high, "(b c) 1 t h w -> b c t h w", b=b)
+ low_high_low = self.hh_conv(x)
+ low_high_low = rearrange(low_high_low, "(b c) 1 t h w -> b c t h w", b=b)
+ low_high_high = self.gh_conv(x)
+ low_high_high = rearrange(low_high_high, "(b c) 1 t h w -> b c t h w", b=b)
+ high_low_low = self.h_v_conv(x)
+ high_low_low = rearrange(high_low_low, "(b c) 1 t h w -> b c t h w", b=b)
+ high_low_high = self.g_v_conv(x)
+ high_low_high = rearrange(high_low_high, "(b c) 1 t h w -> b c t h w", b=b)
+ high_high_low = self.hh_v_conv(x)
+ high_high_low = rearrange(high_high_low, "(b c) 1 t h w -> b c t h w", b=b)
+ high_high_high = self.gh_v_conv(x)
+ high_high_high = rearrange(high_high_high, "(b c) 1 t h w -> b c t h w", b=b)
+
+ output = torch.cat(
+ [
+ low_low_low,
+ low_low_high,
+ low_high_low,
+ low_high_high,
+ high_low_low,
+ high_low_high,
+ high_high_low,
+ high_high_high,
+ ],
+ dim=1,
+ )
+ return output
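+
+    # Shape note (assumed): each of the 8 stride-2 filters is a 2x2x2 Haar
+    # kernel scaled by 0.3536 ~= 1 / (2 * sqrt(2)), making the bank
+    # orthonormal. CausalConv3d replicates the first frame once in time, so
+    # for odd t: (b, c, t, h, w) -> (b, 8c, (t + 1) / 2, h / 2, w / 2),
+    # e.g. (1, 3, 17, 256, 256) -> (1, 24, 9, 128, 128).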
+
+class InverseHaarWaveletTransform3D(nn.Module):
+ def __init__(self, enable_cached=False, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ self.register_buffer('h',
+ torch.tensor([[[1, 1], [1, 1]], [[1, 1], [1, 1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('g',
+ torch.tensor([[[1, -1], [1, -1]], [[1, -1], [1, -1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('hh',
+ torch.tensor([[[1, 1], [-1, -1]], [[1, 1], [-1, -1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('gh',
+ torch.tensor([[[1, -1], [-1, 1]], [[1, -1], [-1, 1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('h_v',
+ torch.tensor([[[1, 1], [1, 1]], [[-1, -1], [-1, -1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('g_v',
+ torch.tensor([[[1, -1], [1, -1]], [[-1, 1], [-1, 1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('hh_v',
+ torch.tensor([[[1, 1], [-1, -1]], [[-1, -1], [1, 1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.register_buffer('gh_v',
+ torch.tensor([[[1, -1], [-1, 1]], [[-1, 1], [1, -1]]]).view(1, 1, 2, 2, 2) * 0.3536
+ )
+ self.enable_cached = enable_cached
+ self.causal_cached = None
+
+ def forward(self, coeffs):
+ assert coeffs.dim() == 5
+ b = coeffs.shape[0]
+
+ (
+ low_low_low,
+ low_low_high,
+ low_high_low,
+ low_high_high,
+ high_low_low,
+ high_low_high,
+ high_high_low,
+ high_high_high,
+ ) = coeffs.chunk(8, dim=1)
+
+ low_low_low = rearrange(low_low_low, "b c t h w -> (b c) 1 t h w")
+ low_low_high = rearrange(low_low_high, "b c t h w -> (b c) 1 t h w")
+ low_high_low = rearrange(low_high_low, "b c t h w -> (b c) 1 t h w")
+ low_high_high = rearrange(low_high_high, "b c t h w -> (b c) 1 t h w")
+ high_low_low = rearrange(high_low_low, "b c t h w -> (b c) 1 t h w")
+ high_low_high = rearrange(high_low_high, "b c t h w -> (b c) 1 t h w")
+ high_high_low = rearrange(high_high_low, "b c t h w -> (b c) 1 t h w")
+ high_high_high = rearrange(high_high_high, "b c t h w -> (b c) 1 t h w")
+
+ low_low_low = F.conv_transpose3d(low_low_low, self.h, stride=2)
+ low_low_high = F.conv_transpose3d(low_low_high, self.g, stride=2)
+ low_high_low = F.conv_transpose3d(low_high_low, self.hh, stride=2)
+ low_high_high = F.conv_transpose3d(low_high_high, self.gh, stride=2)
+ high_low_low = F.conv_transpose3d(high_low_low, self.h_v, stride=2)
+ high_low_high = F.conv_transpose3d(high_low_high, self.g_v, stride=2)
+ high_high_low = F.conv_transpose3d(high_high_low, self.hh_v, stride=2)
+ high_high_high = F.conv_transpose3d(high_high_high, self.gh_v, stride=2)
+ if self.enable_cached and self.causal_cached:
+ reconstructed = (
+ low_low_low
+ + low_low_high
+ + low_high_low
+ + low_high_high
+ + high_low_low
+ + high_low_high
+ + high_high_low
+ + high_high_high
+ )
+ else:
+ reconstructed = (
+ low_low_low[:, :, 1:]
+ + low_low_high[:, :, 1:]
+ + low_high_low[:, :, 1:]
+ + low_high_high[:, :, 1:]
+ + high_low_low[:, :, 1:]
+ + high_low_high[:, :, 1:]
+ + high_high_low[:, :, 1:]
+ + high_high_high[:, :, 1:]
+ )
+ self.causal_cached = True
+ reconstructed = rearrange(reconstructed, "(b c) 1 t h w -> b c t h w", b=b)
+ return reconstructed
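+
+    # Round-trip sketch (assumed; tolerance illustrative): the inverse sums
+    # the 8 transposed-conv reconstructions and, when no causal cache is
+    # active, drops the leading frame duplicated by the forward transform's
+    # causal padding, so forward followed by inverse is identity up to float
+    # error:
+    #   x = torch.randn(1, 3, 17, 64, 64)
+    #   coeffs = HaarWaveletTransform3D()(x)
+    #   x_rec = InverseHaarWaveletTransform3D()(coeffs)
+    #   assert torch.allclose(x, x_rec, atol=1e-4)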
+
+
+class HaarWaveletTransform2D(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.register_buffer('aa', torch.tensor([[1, 1], [1, 1]]).view(1, 1, 2, 2) / 2)
+ self.register_buffer('ad', torch.tensor([[1, 1], [-1, -1]]).view(1, 1, 2, 2) / 2)
+ self.register_buffer('da', torch.tensor([[1, -1], [1, -1]]).view(1, 1, 2, 2) / 2)
+ self.register_buffer('dd', torch.tensor([[1, -1], [-1, 1]]).view(1, 1, 2, 2) / 2)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ x = x.reshape(b * c, 1, h, w)
+ low_low = F.conv2d(x, self.aa, stride=2).reshape(b, c, h // 2, w // 2)
+ low_high = F.conv2d(x, self.ad, stride=2).reshape(b, c, h // 2, w // 2)
+ high_low = F.conv2d(x, self.da, stride=2).reshape(b, c, h // 2, w // 2)
+ high_high = F.conv2d(x, self.dd, stride=2).reshape(b, c, h // 2, w // 2)
+ coeffs = torch.cat([low_low, low_high, high_low, high_high], dim=1)
+ return coeffs
+
+class InverseHaarWaveletTransform2D(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.register_buffer('aa', torch.tensor([[1, 1], [1, 1]]).view(1, 1, 2, 2) / 2)
+ self.register_buffer('ad', torch.tensor([[1, 1], [-1, -1]]).view(1, 1, 2, 2) / 2)
+ self.register_buffer('da', torch.tensor([[1, -1], [1, -1]]).view(1, 1, 2, 2) / 2)
+ self.register_buffer('dd', torch.tensor([[1, -1], [-1, 1]]).view(1, 1, 2, 2) / 2)
+
+ def forward(self, coeffs):
+ low_low, low_high, high_low, high_high = coeffs.chunk(4, dim=1)
+ b, c, height_half, width_half = low_low.shape
+ height = height_half * 2
+ width = width_half * 2
+
+ low_low = F.conv_transpose2d(
+ low_low.reshape(b * c, 1, height_half, width_half), self.aa, stride=2
+ )
+ low_high = F.conv_transpose2d(
+ low_high.reshape(b * c, 1, height_half, width_half), self.ad, stride=2
+ )
+ high_low = F.conv_transpose2d(
+ high_low.reshape(b * c, 1, height_half, width_half), self.da, stride=2
+ )
+ high_high = F.conv_transpose2d(
+ high_high.reshape(b * c, 1, height_half, width_half), self.dd, stride=2
+ )
+
+ return (low_low + low_high + high_low + high_high).reshape(b, c, height, width)
\ No newline at end of file
diff --git a/opensora/models/causalvideovae/model/vae/__init__.py b/opensora/models/causalvideovae/model/vae/__init__.py
new file mode 100755
index 000000000..cf66930e1
--- /dev/null
+++ b/opensora/models/causalvideovae/model/vae/__init__.py
@@ -0,0 +1,4 @@
+from .modeling_causalvae import CausalVAEModel
+from .modeling_wfvae import WFVAEModel
+from einops import rearrange
+from torch import nn
\ No newline at end of file
diff --git a/opensora/models/ae/videobase/causal_vae/modeling_causalvae.py b/opensora/models/causalvideovae/model/vae/modeling_causalvae.py
old mode 100644
new mode 100755
similarity index 74%
rename from opensora/models/ae/videobase/causal_vae/modeling_causalvae.py
rename to opensora/models/causalvideovae/model/vae/modeling_causalvae.py
index 0b5566c37..0f7491b7b
--- a/opensora/models/ae/videobase/causal_vae/modeling_causalvae.py
+++ b/opensora/models/causalvideovae/model/vae/modeling_causalvae.py
@@ -1,13 +1,22 @@
-from ..modeling_videobase import VideoBaseAE_PL
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+ torch_npu = None
+ npu_config = None
+from ..modeling_videobase import VideoBaseAE
from ..modules import Normalize
from ..modules.ops import nonlinearity
-from typing import List, Tuple
+from typing import Tuple
import torch.nn as nn
from ..utils.module_utils import resolve_str_to_obj, Module
from ..utils.distrib_utils import DiagonalGaussianDistribution
-from ..utils.scheduler_utils import cosine_scheduler
+from ..registry import ModelRegistry
import torch
from diffusers.configuration_utils import register_to_config
+from copy import deepcopy
+import os
class Encoder(nn.Module):
@@ -38,6 +47,7 @@ def __init__(
resolution: int = 256,
num_res_blocks: int = 2,
double_z: bool = True,
+ norm_type: str = "groupnorm",
) -> None:
super().__init__()
assert len(resnet_blocks) == len(hidden_size_mult), print(
@@ -69,6 +79,7 @@ def __init__(
in_channels=block_in,
out_channels=block_out,
dropout=dropout,
+ norm_type=norm_type
)
)
block_in = block_out
@@ -94,15 +105,17 @@ def __init__(
in_channels=block_in,
out_channels=block_in,
dropout=dropout,
+ norm_type=norm_type
)
self.mid.attn_1 = resolve_str_to_obj(attention)(block_in)
self.mid.block_2 = resolve_str_to_obj(mid_resnet)(
in_channels=block_in,
out_channels=block_in,
dropout=dropout,
+ norm_type=norm_type
)
# ---- Out ----
- self.norm_out = Normalize(block_in)
+ self.norm_out = Normalize(block_in, norm_type=norm_type)
self.conv_out = resolve_str_to_obj(conv_out)(
block_in,
2 * z_channels if double_z else z_channels,
@@ -112,24 +125,24 @@ def __init__(
)
def forward(self, x):
- hs = [self.conv_in(x)]
+ h = self.conv_in(x)
for i_level in range(self.num_resolutions):
for i_block in range(self.num_res_blocks):
- h = self.down[i_level].block[i_block](hs[-1])
+ h = self.down[i_level].block[i_block](h)
if len(self.down[i_level].attn) > 0:
h = self.down[i_level].attn[i_block](h)
- hs.append(h)
if hasattr(self.down[i_level], "downsample"):
- hs.append(self.down[i_level].downsample(hs[-1]))
+ h = self.down[i_level].downsample(h)
if hasattr(self.down[i_level], "time_downsample"):
- hs_down = self.down[i_level].time_downsample(hs[-1])
- hs.append(hs_down)
+ h = self.down[i_level].time_downsample(h)
h = self.mid.block_1(h)
h = self.mid.attn_1(h)
h = self.mid.block_2(h)
-
- h = self.norm_out(h)
+ if npu_config is None:
+ h = self.norm_out(h)
+ else:
+ h = npu_config.run_group_norm(self.norm_out, h)
h = nonlinearity(h)
h = self.conv_out(h)
return h
@@ -162,6 +175,7 @@ def __init__(
dropout: float = 0.0,
resolution: int = 256,
num_res_blocks: int = 2,
+ norm_type: str = "groupnorm",
):
super().__init__()
# ---- Config ----
@@ -182,12 +196,14 @@ def __init__(
in_channels=block_in,
out_channels=block_in,
dropout=dropout,
+ norm_type=norm_type
)
- self.mid.attn_1 = resolve_str_to_obj(attention)(block_in)
+ self.mid.attn_1 = resolve_str_to_obj(attention)(block_in, norm_type=norm_type)
self.mid.block_2 = resolve_str_to_obj(mid_resnet)(
in_channels=block_in,
out_channels=block_in,
dropout=dropout,
+ norm_type=norm_type
)
# ---- Upsample ----
@@ -202,11 +218,12 @@ def __init__(
in_channels=block_in,
out_channels=block_out,
dropout=dropout,
+ norm_type=norm_type
)
)
block_in = block_out
if curr_res in attn_resolutions:
- attn.append(resolve_str_to_obj(attention)(block_in))
+ attn.append(resolve_str_to_obj(attention)(block_in, norm_type=norm_type))
up = nn.Module()
up.block = block
up.attn = attn
@@ -222,7 +239,7 @@ def __init__(
self.up.insert(0, up)
# ---- Out ----
- self.norm_out = Normalize(block_in)
+ self.norm_out = Normalize(block_in, norm_type=norm_type)
self.conv_out = resolve_str_to_obj(conv_out)(
block_in, 3, kernel_size=3, padding=1
)
@@ -243,18 +260,19 @@ def forward(self, z):
if hasattr(self.up[i_level], "time_upsample"):
h = self.up[i_level].time_upsample(h)
- h = self.norm_out(h)
+ if npu_config is None:
+ h = self.norm_out(h)
+ else:
+ h = npu_config.run_group_norm(self.norm_out, h)
h = nonlinearity(h)
h = self.conv_out(h)
return h
-
-class CausalVAEModel(VideoBaseAE_PL):
-
+@ModelRegistry.register("CausalVAE")
+class CausalVAEModel(VideoBaseAE):
@register_to_config
def __init__(
self,
- lr: float = 1e-5,
hidden_size: int = 128,
z_channels: int = 4,
hidden_size_mult: Tuple[int] = (1, 2, 4, 4),
@@ -264,13 +282,6 @@ def __init__(
double_z: bool = True,
embed_dim: int = 4,
num_res_blocks: int = 2,
- loss_type: str = "opensora.models.ae.videobase.losses.LPIPSWithDiscriminator",
- loss_params: dict = {
- "kl_weight": 0.000001,
- "logvar_init": 0.0,
- "disc_start": 2001,
- "disc_weight": 0.5,
- },
q_conv: str = "CausalConv3d",
encoder_conv_in: Module = "CausalConv3d",
encoder_conv_out: Module = "CausalConv3d",
@@ -309,24 +320,32 @@ def __init__(
"SpatialUpsample2x",
"SpatialUpsample2x",
),
- decoder_temporal_upsample: Tuple[Module] = ("", "", "TimeUpsample2x", "TimeUpsample2x"),
+ decoder_temporal_upsample: Tuple[Module] = (
+ "",
+ "",
+ "TimeUpsample2x",
+ "TimeUpsample2x",
+ ),
decoder_mid_resnet: Module = "ResnetBlock3D",
+ use_quant_layer: bool = True,
+ norm_type: str = "groupnorm",
) -> None:
super().__init__()
- self.tile_sample_min_size = 256
- self.tile_sample_min_size_t = 65
- self.tile_latent_min_size = int(self.tile_sample_min_size / (2 ** (len(hidden_size_mult) - 1)))
- t_down_ratio = [i for i in encoder_temporal_downsample if len(i) > 0]
- self.tile_latent_min_size_t = int((self.tile_sample_min_size_t-1) / (2 ** len(t_down_ratio))) + 1
- self.tile_overlap_factor = 0.25
- self.use_tiling = False
- self.learning_rate = lr
- self.lr_g_factor = 1.0
+ self.tile_sample_min_size = 512000
+ self.tile_sample_min_size_t = 33
- self.loss = resolve_str_to_obj(loss_type, append=False)(
- **loss_params
- )
+ self.tile_sample_min_size_dec = 512
+ self.tile_sample_min_size_t_dec = 17
+ self.tile_latent_min_size = int(self.tile_sample_min_size_dec / (2 ** (len(hidden_size_mult) - 1)))
+ self.tile_latent_min_size_t = int((self.tile_sample_min_size_t_dec-1) / 4) + 1
+
+ self.tile_overlap_t = 2
+
+ self.tile_overlap_factor = 0.125
+ self.use_tiling = False
+
+ self.use_quant_layer = use_quant_layer
self.encoder = Encoder(
z_channels=z_channels,
@@ -344,6 +363,7 @@ def __init__(
resolution=resolution,
num_res_blocks=num_res_blocks,
double_z=double_z,
+ norm_type=norm_type
)
self.decoder = Decoder(
@@ -361,13 +381,22 @@ def __init__(
dropout=dropout,
resolution=resolution,
num_res_blocks=num_res_blocks,
+ norm_type=norm_type
)
+ if self.use_quant_layer:
+ quant_conv_cls = resolve_str_to_obj(q_conv)
+ self.quant_conv = quant_conv_cls(2 * z_channels, 2 * embed_dim, 1)
+ self.post_quant_conv = quant_conv_cls(embed_dim, z_channels, 1)
+
+ def get_encoder(self):
+ if self.use_quant_layer:
+ return [self.quant_conv, self.encoder]
+ return [self.encoder]
- quant_conv_cls = resolve_str_to_obj(q_conv)
- self.quant_conv = quant_conv_cls(2 * z_channels, 2 * embed_dim, 1)
- self.post_quant_conv = quant_conv_cls(embed_dim, z_channels, 1)
- if hasattr(self.loss, "discriminator"):
- self.automatic_optimization = False
+ def get_decoder(self):
+ if self.use_quant_layer:
+ return [self.post_quant_conv, self.decoder]
+ return [self.decoder]
def encode(self, x):
if self.use_tiling and (
@@ -375,10 +404,12 @@ def encode(self, x):
or x.shape[-2] > self.tile_sample_min_size
or x.shape[-3] > self.tile_sample_min_size_t
):
return self.tiled_encode(x)
h = self.encoder(x)
- moments = self.quant_conv(h)
- posterior = DiagonalGaussianDistribution(moments)
+ if self.use_quant_layer:
+ h = self.quant_conv(h)
+ posterior = DiagonalGaussianDistribution(h)
return posterior
def decode(self, z):
@@ -388,7 +419,8 @@ def decode(self, z):
or z.shape[-3] > self.tile_latent_min_size_t
):
return self.tiled_decode(z)
- z = self.post_quant_conv(z)
+ if self.use_quant_layer:
+ z = self.post_quant_conv(z)
dec = self.decoder(z)
return dec
@@ -401,133 +433,8 @@ def forward(self, input, sample_posterior=True):
dec = self.decode(z)
return dec, posterior
- def get_input(self, batch, k):
- x = batch[k]
- if len(x.shape) == 3:
- x = x[..., None]
- x = x.to(memory_format=torch.contiguous_format).float()
- return x
-
- def training_step(self, batch, batch_idx):
- if hasattr(self.loss, "discriminator"):
- return self._training_step_gan(batch, batch_idx=batch_idx)
- else:
- return self._training_step(batch, batch_idx=batch_idx)
-
- def _training_step(self, batch, batch_idx):
- inputs = self.get_input(batch, "video")
- reconstructions, posterior = self(inputs)
- aeloss, log_dict_ae = self.loss(
- inputs,
- reconstructions,
- posterior,
- split="train",
- )
- self.log(
- "aeloss",
- aeloss,
- prog_bar=True,
- logger=True,
- on_step=True,
- on_epoch=True,
- )
- self.log_dict(
- log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False
- )
- return aeloss
-
- def _training_step_gan(self, batch, batch_idx):
- inputs = self.get_input(batch, "video")
- reconstructions, posterior = self(inputs)
- opt1, opt2 = self.optimizers()
-
- # ---- AE Loss ----
- aeloss, log_dict_ae = self.loss(
- inputs,
- reconstructions,
- posterior,
- 0,
- self.global_step,
- last_layer=self.get_last_layer(),
- split="train",
- )
- self.log(
- "aeloss",
- aeloss,
- prog_bar=True,
- logger=True,
- on_step=True,
- on_epoch=True,
- )
- opt1.zero_grad()
- self.manual_backward(aeloss)
- self.clip_gradients(opt1, gradient_clip_val=1, gradient_clip_algorithm="norm")
- opt1.step()
- # ---- GAN Loss ----
- discloss, log_dict_disc = self.loss(
- inputs,
- reconstructions,
- posterior,
- 1,
- self.global_step,
- last_layer=self.get_last_layer(),
- split="train",
- )
- self.log(
- "discloss",
- discloss,
- prog_bar=True,
- logger=True,
- on_step=True,
- on_epoch=True,
- )
- opt2.zero_grad()
- self.manual_backward(discloss)
- self.clip_gradients(opt2, gradient_clip_val=1, gradient_clip_algorithm="norm")
- opt2.step()
- self.log_dict(
- {**log_dict_ae, **log_dict_disc},
- prog_bar=False,
- logger=True,
- on_step=True,
- on_epoch=False,
- )
-
- def configure_optimizers(self):
- from itertools import chain
-
- lr = self.learning_rate
- modules_to_train = [
- self.encoder.named_parameters(),
- self.decoder.named_parameters(),
- self.post_quant_conv.named_parameters(),
- self.quant_conv.named_parameters(),
- ]
- params_with_time = []
- params_without_time = []
- for name, param in chain(*modules_to_train):
- if "time" in name:
- params_with_time.append(param)
- else:
- params_without_time.append(param)
- optimizers = []
- opt_ae = torch.optim.Adam(
- [
- {"params": params_with_time, "lr": lr},
- {"params": params_without_time, "lr": lr},
- ],
- lr=lr,
- betas=(0.5, 0.9),
- )
- optimizers.append(opt_ae)
-
- if hasattr(self.loss, "discriminator"):
- opt_disc = torch.optim.Adam(
- self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)
- )
- optimizers.append(opt_disc)
-
- return optimizers, []
+ def on_train_start(self):
+        self.ema = deepcopy(self) if self.save_ema else None
def get_last_layer(self):
if hasattr(self.decoder.conv_out, "conv"):
@@ -558,20 +465,22 @@ def blend_h(
def tiled_encode(self, x):
t = x.shape[2]
t_chunk_idx = [i for i in range(0, t, self.tile_sample_min_size_t-1)]
if len(t_chunk_idx) == 1 and t_chunk_idx[0] == 0:
t_chunk_start_end = [[0, t]]
else:
- t_chunk_start_end = [[t_chunk_idx[i], t_chunk_idx[i+1]+1] for i in range(len(t_chunk_idx)-1)]
+ t_chunk_start_end = [[t_chunk_idx[i], t_chunk_idx[i+1]+1+(self.tile_overlap_t-1)*4] for i in range(len(t_chunk_idx)-1)]
if t_chunk_start_end[-1][-1] > t:
t_chunk_start_end[-1][-1] = t
elif t_chunk_start_end[-1][-1] < t:
last_start_end = [t_chunk_idx[-1], t]
t_chunk_start_end.append(last_start_end)
moments = []
for idx, (start, end) in enumerate(t_chunk_start_end):
chunk_x = x[:, :, start: end]
if idx != 0:
- moment = self.tiled_encode2d(chunk_x, return_moments=True)[:, :, 1:]
+ moment = self.tiled_encode2d(chunk_x, return_moments=True)[:, :, 1+(self.tile_overlap_t-1):]
else:
moment = self.tiled_encode2d(chunk_x, return_moments=True)
moments.append(moment)
@@ -582,22 +491,26 @@ def tiled_encode(self, x):
def tiled_decode(self, x):
t = x.shape[2]
t_chunk_idx = [i for i in range(0, t, self.tile_latent_min_size_t-1)]
if len(t_chunk_idx) == 1 and t_chunk_idx[0] == 0:
t_chunk_start_end = [[0, t]]
else:
- t_chunk_start_end = [[t_chunk_idx[i], t_chunk_idx[i+1]+1] for i in range(len(t_chunk_idx)-1)]
+ t_chunk_start_end = [[t_chunk_idx[i], t_chunk_idx[i+1]+1+(self.tile_overlap_t-1)] for i in range(len(t_chunk_idx)-1)]
if t_chunk_start_end[-1][-1] > t:
t_chunk_start_end[-1][-1] = t
elif t_chunk_start_end[-1][-1] < t:
last_start_end = [t_chunk_idx[-1], t]
t_chunk_start_end.append(last_start_end)
dec_ = []
for idx, (start, end) in enumerate(t_chunk_start_end):
chunk_x = x[:, :, start: end]
if idx != 0:
- dec = self.tiled_decode2d(chunk_x)[:, :, 1:]
+ dec = self.tiled_decode2d(chunk_x)[:, :, 1+(self.tile_overlap_t-1)*4:]
else:
dec = self.tiled_decode2d(chunk_x)
dec_.append(dec)
dec_ = torch.cat(dec_, dim=2)
return dec_
@@ -606,12 +519,13 @@ def tiled_encode2d(self, x, return_moments=False):
overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
row_limit = self.tile_latent_min_size - blend_extent
-
# Split the image into 512x512 tiles and encode them separately.
rows = []
for i in range(0, x.shape[3], overlap_size):
row = []
for j in range(0, x.shape[4], overlap_size):
tile = x[
:,
:,
@@ -620,7 +534,8 @@ def tiled_encode2d(self, x, return_moments=False):
j : j + self.tile_sample_min_size,
]
tile = self.encoder(tile)
- tile = self.quant_conv(tile)
+ if self.use_quant_layer:
+ tile = self.quant_conv(tile)
row.append(tile)
rows.append(row)
result_rows = []
@@ -650,6 +565,8 @@ def tiled_decode2d(self, z):
# Split z into overlapping 64x64 tiles and decode them separately.
# The tiles have an overlap to avoid seams between tiles.
rows = []
for i in range(0, z.shape[3], overlap_size):
row = []
@@ -661,7 +578,8 @@ def tiled_decode2d(self, z):
i : i + self.tile_latent_min_size,
j : j + self.tile_latent_min_size,
]
- tile = self.post_quant_conv(tile)
+ if self.use_quant_layer:
+ tile = self.post_quant_conv(tile)
decoded = self.decoder(tile)
row.append(decoded)
rows.append(row)
@@ -687,26 +605,31 @@ def enable_tiling(self, use_tiling: bool = True):
def disable_tiling(self):
self.enable_tiling(False)
- def init_from_ckpt(self, path, ignore_keys=list(), remove_loss=False):
+ def init_from_ckpt(self, path, ignore_keys=list()):
sd = torch.load(path, map_location="cpu")
print("init from " + path)
- if "state_dict" in sd:
- sd = sd["state_dict"]
+
+ if (
+ "ema_state_dict" in sd
+ and len(sd["ema_state_dict"]) > 0
+        and os.environ.get("NOT_USE_EMA_MODEL", "0") == "0"
+ ):
+ print("Load from ema model!")
+ sd = sd["ema_state_dict"]
+ sd = {key.replace("module.", ""): value for key, value in sd.items()}
+ elif "state_dict" in sd:
+ print("Load from normal model!")
+ if "gen_model" in sd["state_dict"]:
+ sd = sd["state_dict"]["gen_model"]
+ else:
+ sd = sd["state_dict"]
+
keys = list(sd.keys())
+
for k in keys:
for ik in ignore_keys:
if k.startswith(ik):
print("Deleting key {} from state_dict.".format(k))
del sd[k]
- self.load_state_dict(sd, strict=False)
-
- def validation_step(self, batch, batch_idx):
-
- from ..utils.video_utils import tensor_to_video
- inputs = self.get_input(batch, 'video')
- latents = self.encode(inputs).sample()
- video_recon = self.decode(latents)
- for idx in range(len(video_recon)):
- self.logger.log_video(f"recon {batch_idx} {idx}", [tensor_to_video(video_recon[idx])], fps=[10])
-
-
\ No newline at end of file
+
+ missing_keys, unexpected_keys = self.load_state_dict(sd, strict=True)
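+
+
+# Chunking note (assumed arithmetic from the constants above): with
+# tile_sample_min_size_t = 33 and tile_overlap_t = 2, tiled_encode steps
+# through the input in 32-frame strides and extends every chunk after the
+# first by 1 + (tile_overlap_t - 1) * 4 = 5 extra input frames; the matching
+# 1 + (tile_overlap_t - 1) = 2 leading latent frames are then discarded
+# (4x temporal compression), hiding seams at chunk boundaries. tiled_decode
+# mirrors the same scheme in latent space via tile_latent_min_size_t.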
diff --git a/opensora/models/causalvideovae/model/vae/modeling_wfvae.py b/opensora/models/causalvideovae/model/vae/modeling_wfvae.py
new file mode 100644
index 000000000..1acbc35ca
--- /dev/null
+++ b/opensora/models/causalvideovae/model/vae/modeling_wfvae.py
@@ -0,0 +1,504 @@
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+ torch_npu = None
+ npu_config = None
+
+from ..modeling_videobase import VideoBaseAE
+from diffusers.configuration_utils import register_to_config
+import torch
+import torch.nn as nn
+from ..modules import (
+ ResnetBlock2D,
+ ResnetBlock3D,
+ Conv2d,
+ Downsample,
+ Upsample,
+ Spatial2xTime2x3DDownsample,
+ Spatial2xTime2x3DUpsample,
+ CausalConv3d,
+ Normalize,
+ AttnBlock3DFix,
+ nonlinearity,
+)
+import torch.nn as nn
+from ..utils.distrib_utils import DiagonalGaussianDistribution
+from ..utils.wavelet_utils import (
+ HaarWaveletTransform2D,
+ HaarWaveletTransform3D,
+ InverseHaarWaveletTransform2D,
+ InverseHaarWaveletTransform3D
+)
+import torch
+from copy import deepcopy
+import os
+from ..registry import ModelRegistry
+from einops import rearrange
+
+
+class Encoder(VideoBaseAE):
+
+ @register_to_config
+ def __init__(
+ self,
+ latent_dim: int = 8,
+ base_channels: int = 128,
+ num_resblocks: int = 2,
+ energy_flow_hidden_size: int = 64,
+ dropout: float = 0.0,
+ use_attention: bool = True,
+ norm_type: str = "groupnorm",
+ ) -> None:
+ super().__init__()
+ self.down1 = nn.Sequential(
+ Conv2d(24, base_channels, kernel_size=3, stride=1, padding=1),
+ *[
+ ResnetBlock2D(
+ in_channels=base_channels,
+ out_channels=base_channels,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for _ in range(num_resblocks)
+ ],
+ Downsample(in_channels=base_channels, out_channels=base_channels),
+ )
+ self.down2 = nn.Sequential(
+ Conv2d(
+ base_channels + energy_flow_hidden_size,
+ base_channels * 2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ ),
+ *[
+ ResnetBlock3D(
+ in_channels=base_channels * 2,
+ out_channels=base_channels * 2,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for _ in range(num_resblocks)
+ ],
+ Spatial2xTime2x3DDownsample(base_channels * 2, base_channels * 2),
+ )
+ # Connection
+ self.connect_l2 = Conv2d(
+ 12, energy_flow_hidden_size, kernel_size=3, stride=1, padding=1
+ )
+ self.connect_l3 = Conv2d(
+ 24, energy_flow_hidden_size, kernel_size=3, stride=1, padding=1
+ )
+ # Mid
+ mid_layers = [
+ ResnetBlock3D(
+ in_channels=base_channels * 2 + energy_flow_hidden_size,
+ out_channels=base_channels * 4,
+ dropout=dropout,
+ norm_type=norm_type,
+ ),
+ ResnetBlock3D(
+ in_channels=base_channels * 4,
+ out_channels=base_channels * 4,
+ dropout=dropout,
+ norm_type=norm_type,
+ ),
+ ]
+ if use_attention:
+ mid_layers.insert(
+ 1, AttnBlock3DFix(in_channels=base_channels * 4, norm_type=norm_type)
+ )
+ self.mid = nn.Sequential(*mid_layers)
+
+ self.norm_out = Normalize(base_channels * 4, norm_type=norm_type)
+ self.conv_out = CausalConv3d(
+ base_channels * 4, latent_dim * 2, kernel_size=3, stride=1, padding=1
+ )
+
+ self.wavelet_tranform_3d = HaarWaveletTransform3D()
+ self.wavelet_tranform_2d = HaarWaveletTransform2D()
+
+
+ def forward(self, coeffs):
+ l2_coeffs = coeffs[:, :3]
+ t = l2_coeffs.shape[2]
+ l2_coeffs = rearrange(l2_coeffs, "b c t h w -> (b t) c h w")
+ l2_coeffs = self.wavelet_tranform_2d(l2_coeffs)
+ l2_coeffs = rearrange(l2_coeffs, "(b t) c h w -> b c t h w", t=t)
+ l2 = self.connect_l2(l2_coeffs)
+ l3_coeffs = self.wavelet_tranform_3d(l2_coeffs[:, :3])
+ l3 = self.connect_l3(l3_coeffs)
+
+ h = self.down1(coeffs)
+ h = torch.concat([h, l2], dim=1)
+ h = self.down2(h)
+ h = torch.concat([h, l3], dim=1)
+ h = self.mid(h)
+
+ if npu_config is None:
+ h = self.norm_out(h)
+ else:
+ h = npu_config.run_group_norm(self.norm_out, h)
+
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class Decoder(VideoBaseAE):
+
+ @register_to_config
+ def __init__(
+ self,
+ latent_dim: int = 8,
+ base_channels: int = 128,
+ num_resblocks: int = 2,
+ dropout: float = 0.0,
+ energy_flow_hidden_size: int = 128,
+ use_attention: bool = True,
+ norm_type: str = "groupnorm",
+ t_interpolation: str = "nearest",
+ ) -> None:
+ super().__init__()
+ self.energy_flow_hidden_size = energy_flow_hidden_size
+
+ self.conv_in = CausalConv3d(
+ latent_dim, base_channels * 4, kernel_size=3, stride=1, padding=1
+ )
+ mid_layers = [
+ ResnetBlock3D(
+ in_channels=base_channels * 4,
+ out_channels=base_channels * 4,
+ dropout=dropout,
+ norm_type=norm_type,
+ ),
+ ResnetBlock3D(
+ in_channels=base_channels * 4,
+ out_channels=base_channels * 4 + energy_flow_hidden_size,
+ dropout=dropout,
+ norm_type=norm_type,
+ ),
+ ]
+ if use_attention:
+ mid_layers.insert(
+ 1, AttnBlock3DFix(in_channels=base_channels * 4, norm_type=norm_type)
+ )
+ self.mid = nn.Sequential(*mid_layers)
+
+ self.up2 = nn.Sequential(
+ *[
+ ResnetBlock3D(
+ in_channels=base_channels * 4,
+ out_channels=base_channels * 4,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for _ in range(num_resblocks)
+ ],
+ Spatial2xTime2x3DUpsample(
+ base_channels * 4, base_channels * 4, t_interpolation=t_interpolation
+ ),
+ ResnetBlock3D(
+ in_channels=base_channels * 4,
+ out_channels=base_channels * 4 + energy_flow_hidden_size,
+ dropout=dropout,
+ norm_type=norm_type,
+ ),
+ )
+ self.up1 = nn.Sequential(
+ *[
+ ResnetBlock3D(
+ in_channels=base_channels * (4 if i == 0 else 2),
+ out_channels=base_channels * 2,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for i in range(num_resblocks)
+ ],
+ Upsample(in_channels=base_channels * 2, out_channels=base_channels * 2),
+ ResnetBlock3D(
+ in_channels=base_channels * 2,
+ out_channels=base_channels * 2,
+ dropout=dropout,
+ norm_type=norm_type,
+ ),
+ )
+ self.layer = nn.Sequential(
+ *[
+ ResnetBlock3D(
+ in_channels=base_channels * (2 if i == 0 else 1),
+ out_channels=base_channels,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for i in range(2)
+ ],
+ )
+ # Connection
+ self.connect_l2 = nn.Sequential(
+ *[
+ ResnetBlock3D(
+ in_channels=base_channels,
+ out_channels=base_channels,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for _ in range(2)
+ ],
+ Conv2d(base_channels, 12, kernel_size=3, stride=1, padding=1),
+ )
+ self.connect_l3 = nn.Sequential(
+ *[
+ ResnetBlock3D(
+ in_channels=base_channels,
+ out_channels=base_channels,
+ dropout=dropout,
+ norm_type=norm_type,
+ )
+ for _ in range(2)
+ ],
+ Conv2d(base_channels, 24, kernel_size=3, stride=1, padding=1),
+ )
+ # Out
+ self.norm_out = Normalize(base_channels, norm_type=norm_type)
+ self.conv_out = Conv2d(base_channels, 24, kernel_size=3, stride=1, padding=1)
+
+ self.inverse_wavelet_tranform_3d = InverseHaarWaveletTransform3D()
+ self.inverse_wavelet_tranform_2d = InverseHaarWaveletTransform2D()
+
+
+ def forward(self, z):
+ h = self.conv_in(z)
+ h = self.mid(h)
+ l3_coeffs = self.connect_l3(h[:, -self.energy_flow_hidden_size :])
+ l3 = self.inverse_wavelet_tranform_3d(l3_coeffs)
+ h = self.up2(h[:, : -self.energy_flow_hidden_size])
+ l2_coeffs = h[:, -self.energy_flow_hidden_size :]
+ l2_coeffs = self.connect_l2(l2_coeffs)
+ l2_coeffs[:, :3] = l2_coeffs[:, :3] + l3
+
+ t = l2_coeffs.shape[2]
+ l2_coeffs = rearrange(l2_coeffs, "b c t h w -> (b t) c h w")
+ l2 = self.inverse_wavelet_tranform_2d(l2_coeffs)
+ l2 = rearrange(l2, "(b t) c h w -> b c t h w", t=t)
+
+ h = self.up1(h[:, : -self.energy_flow_hidden_size])
+
+ h = self.layer(h)
+ if npu_config is None:
+ h = self.norm_out(h)
+ else:
+ h = npu_config.run_group_norm(self.norm_out, h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ h[:, :3] = h[:, :3] + l2
+ return h
+
+
+@ModelRegistry.register("WFVAE")
+class WFVAEModel(VideoBaseAE):
+
+ @register_to_config
+ def __init__(
+ self,
+ latent_dim: int = 8,
+ base_channels: int = 128,
+ encoder_num_resblocks: int = 2,
+ encoder_energy_flow_hidden_size: int = 64,
+ decoder_num_resblocks: int = 2,
+ decoder_energy_flow_hidden_size: int = 128,
+ use_attention: bool = True,
+ dropout: float = 0.0,
+ norm_type: str = "groupnorm",
+ t_interpolation: str = "nearest",
+ ) -> None:
+ super().__init__()
+ self.use_tiling = False
+ self.use_quant_layer = False
+
+ self.t_chunk_enc = 16
+ self.t_chunk_dec = self.t_chunk_enc // 2
+
+ self.encoder = Encoder(
+ latent_dim=latent_dim,
+ base_channels=base_channels,
+ num_resblocks=encoder_num_resblocks,
+ energy_flow_hidden_size=encoder_energy_flow_hidden_size,
+ dropout=dropout,
+ use_attention=use_attention,
+ norm_type=norm_type,
+ )
+ self.decoder = Decoder(
+ latent_dim=latent_dim,
+ base_channels=base_channels,
+ num_resblocks=decoder_num_resblocks,
+ energy_flow_hidden_size=decoder_energy_flow_hidden_size,
+ dropout=dropout,
+ use_attention=use_attention,
+ norm_type=norm_type,
+ t_interpolation=t_interpolation,
+ )
+
+ def get_encoder(self):
+ if self.use_quant_layer:
+ return [self.quant_conv, self.encoder]
+ return [self.encoder]
+
+ def get_decoder(self):
+ if self.use_quant_layer:
+ return [self.post_quant_conv, self.decoder]
+ return [self.decoder]
+
+ def _empty_causal_cached(self, parent):
+ for name, module in parent.named_modules():
+ if hasattr(module, 'causal_cached'):
+ module.causal_cached = None
+
+ def _set_causal_cached(self, enable_cached=True):
+ for name, module in self.named_modules():
+ if hasattr(module, 'enable_cached'):
+ module.enable_cached = enable_cached
+
+ def build_chunk_start_end(self, t, decoder_mode=False):
+ start_end = [[0, 1]]
+ start = 1
+ end = start
+ while True:
+ if start >= t:
+ break
+            end = min(t, end + (self.t_chunk_dec if decoder_mode else self.t_chunk_enc))
+ start_end.append([start, end])
+ start = end
+ return start_end
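+
+    # Example (assumed defaults t_chunk_enc = 16, t_chunk_dec = 8): the first
+    # frame is always its own chunk so causal caches are seeded before the
+    # fixed-size chunks follow.
+    #   build_chunk_start_end(17)                   -> [[0, 1], [1, 17]]
+    #   build_chunk_start_end(33)                   -> [[0, 1], [1, 17], [17, 33]]
+    #   build_chunk_start_end(9, decoder_mode=True) -> [[0, 1], [1, 9]]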
+
+
+ def encode(self, x):
+ self._empty_causal_cached(self.encoder)
+
+ if torch_npu is not None:
+ dtype = x.dtype
+ x = x.to(torch.float16)
+ wt = HaarWaveletTransform3D().to(x.device, dtype=torch.float16)
+ coeffs = wt(x)
+ coeffs = coeffs.to(dtype)
+ else:
+ wt = HaarWaveletTransform3D().to(x.device, dtype=x.dtype)
+ coeffs = wt(x)
+ if self.use_tiling:
+ h = self.tile_encode(coeffs)
+ else:
+ h = self.encoder(coeffs)
+ if self.use_quant_layer:
+ h = self.quant_conv(h)
+
+ posterior = DiagonalGaussianDistribution(h)
+ return posterior
+
+ def tile_encode(self, x):
+ b, c, t, h, w = x.shape
+
+ start_end = self.build_chunk_start_end(t)
+ result = []
+ for start, end in start_end:
+ chunk = x[:, :, start:end, :, :]
+ chunk = self.encoder(chunk)
+ if self.use_quant_layer:
+                chunk = self.quant_conv(chunk)
+ result.append(chunk)
+
+ return torch.cat(result, dim=2)
+
+
+ def decode(self, z):
+ self._empty_causal_cached(self.decoder)
+
+ if self.use_tiling:
+ dec = self.tile_decode(z)
+ else:
+ if self.use_quant_layer:
+ z = self.post_quant_conv(z)
+ dec = self.decoder(z)
+
+ if torch_npu is not None:
+ dtype = dec.dtype
+ dec = dec.to(torch.float16)
+ wt = InverseHaarWaveletTransform3D().to(dec.device, dtype=torch.float16)
+ dec = wt(dec)
+ dec = dec.to(dtype)
+ else:
+ wt = InverseHaarWaveletTransform3D().to(dec.device, dtype=dec.dtype)
+ dec = wt(dec)
+
+ return dec
+
+ def tile_decode(self, x):
+ b, c, t, h, w = x.shape
+
+ start_end = self.build_chunk_start_end(t, decoder_mode=True)
+
+ result = []
+ for start, end in start_end:
+ chunk = x[:, :, start:end, :, :]
+ if self.use_quant_layer:
+ chunk = self.post_quant_conv(chunk)
+ chunk = self.decoder(chunk)
+ result.append(chunk)
+
+ return torch.cat(result, dim=2)
+
+ def forward(self, input, sample_posterior=True):
+ posterior = self.encode(input)
+ if sample_posterior:
+ z = posterior.sample()
+ else:
+ z = posterior.mode()
+ dec = self.decode(z)
+ return dec, posterior
+
+ def get_last_layer(self):
+ if hasattr(self.decoder.conv_out, "conv"):
+ return self.decoder.conv_out.conv.weight
+ else:
+ return self.decoder.conv_out.weight
+
+ def enable_tiling(self, use_tiling: bool = True):
+ self.use_tiling = use_tiling
+ self._set_causal_cached(use_tiling)
+
+ def disable_tiling(self):
+ self.enable_tiling(False)
+
+ def init_from_ckpt(self, path, ignore_keys=list()):
+ sd = torch.load(path, map_location="cpu")
+ print("init from " + path)
+
+ if (
+ "ema_state_dict" in sd
+ and len(sd["ema_state_dict"]) > 0
+            and os.environ.get("NOT_USE_EMA_MODEL", "0") == "0"
+ ):
+ print("Load from ema model!")
+ sd = sd["ema_state_dict"]
+ sd = {key.replace("module.", ""): value for key, value in sd.items()}
+ elif "state_dict" in sd:
+ print("Load from normal model!")
+ if "gen_model" in sd["state_dict"]:
+ sd = sd["state_dict"]["gen_model"]
+ else:
+ sd = sd["state_dict"]
+
+ keys = list(sd.keys())
+
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+
+ missing_keys, unexpected_keys = self.load_state_dict(sd, strict=False)
+ print(missing_keys, unexpected_keys)
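+
+
+if __name__ == "__main__":
+    # Smoke-test sketch (assumed default config; expected shapes only).
+    # Spatial dims shrink 8x and a clip of t = 4k + 1 frames yields k + 1
+    # latent frames, so (1, 3, 17, 64, 64) should give a (1, 8, 5, 8, 8)
+    # sampled latent and a reconstruction matching the input shape.
+    model = WFVAEModel()
+    x = torch.randn(1, 3, 17, 64, 64)
+    recon, posterior = model(x)
+    print(posterior.sample().shape)  # expected: torch.Size([1, 8, 5, 8, 8])
+    print(recon.shape)               # expected: torch.Size([1, 3, 17, 64, 64])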
\ No newline at end of file
diff --git a/examples/rec_video_vae.py b/opensora/models/causalvideovae/sample/rec_video_vae.py
similarity index 95%
rename from examples/rec_video_vae.py
rename to opensora/models/causalvideovae/sample/rec_video_vae.py
index 65266ed53..e2c8a549b 100644
--- a/examples/rec_video_vae.py
+++ b/opensora/models/causalvideovae/sample/rec_video_vae.py
@@ -13,9 +13,9 @@
import sys
from torch.utils.data import Dataset, DataLoader, Subset
import os
-
+import glob
sys.path.append(".")
-from opensora.models.ae.videobase import CausalVAEModel
+from causalvideovae.model import CausalVAEModel
import torch.nn as nn
@@ -72,6 +72,8 @@ def read_video(video_path: str, num_frames: int, sample_rate: int) -> torch.Tens
class RealVideoDataset(Dataset):
+ video_exts = ['avi', 'mp4', 'webm']
+
def __init__(
self,
real_video_dir,
@@ -127,14 +129,12 @@ def _load_video(self, video_path):
video_data, short_size=self.short_size, crop_size=self.crop_size
)
- def _combine_without_prefix(self, folder_path, prefix="."):
- folder = []
- for name in os.listdir(folder_path):
- if name[0] == prefix:
- continue
- folder.append(os.path.join(folder_path, name))
- folder.sort()
- return folder
+ def _combine_without_prefix(self, folder_path):
+ samples = []
+ samples += sum([glob.glob(os.path.join(folder_path, '**', f'*.{ext}'), recursive=True)
+ for ext in self.video_exts], [])
+ samples.sort()
+ return samples
def resize(x, resolution):
height, width = x.shape[-2:]
@@ -209,6 +209,7 @@ def main(args: argparse.Namespace):
# ---- Load Model ----
device = args.device
vqvae = CausalVAEModel.from_pretrained(args.ckpt)
vqvae = vqvae.to(device).to(data_type)
if args.enable_tiling:
vqvae.enable_tiling()
@@ -236,6 +237,7 @@ def main(args: argparse.Namespace):
# ---- Inference ----
for batch in tqdm(dataloader):
x, file_names = batch['video'], batch['file_name']
+
x = x.to(device=device, dtype=data_type) # b c t h w
latents = vqvae.encode(x).sample().to(data_type)
video_recon = vqvae.decode(latents)
@@ -272,4 +274,4 @@ def main(args: argparse.Namespace):
args = parser.parse_args()
main(args)
-
+
\ No newline at end of file
diff --git a/opensora/models/causalvideovae/utils/__init__.py b/opensora/models/causalvideovae/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/models/causalvideovae/utils/dataset_utils.py b/opensora/models/causalvideovae/utils/dataset_utils.py
new file mode 100755
index 000000000..1309989b1
--- /dev/null
+++ b/opensora/models/causalvideovae/utils/dataset_utils.py
@@ -0,0 +1,160 @@
+import math
+from einops import rearrange
+import decord
+from torch.nn import functional as F
+import torch
+
+
+IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
+
+def is_image_file(filename):
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
+
+class DecordInit(object):
+ """Using Decord(https://github.com/dmlc/decord) to initialize the video_reader."""
+
+ def __init__(self, num_threads=1):
+ self.num_threads = num_threads
+ self.ctx = decord.cpu(0)
+
+ def __call__(self, filename):
+ """Perform the Decord initialization.
+ Args:
+ results (dict): The resulting dict to be modified and passed
+ to the next transform in pipeline.
+ """
+ reader = decord.VideoReader(filename,
+ ctx=self.ctx,
+ num_threads=self.num_threads)
+ return reader
+
+ def __repr__(self):
+        repr_str = (f'{self.__class__.__name__}('
+                    f'num_threads={self.num_threads})')
+ return repr_str
+
+def pad_to_multiple(number, ds_stride):
+ remainder = number % ds_stride
+ if remainder == 0:
+ return number
+ else:
+ padding = ds_stride - remainder
+ return number + padding
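+
+# e.g. pad_to_multiple(17, 4) -> 20; pad_to_multiple(16, 4) -> 16 (already a
+# multiple, so no padding is added)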
+
+class Collate:
+ def __init__(self, args):
+ self.max_image_size = args.max_image_size
+ self.ae_stride = args.ae_stride
+ self.ae_stride_t = args.ae_stride_t
+ self.ae_stride_thw = (self.ae_stride_t, self.ae_stride, self.ae_stride)
+ self.ae_stride_1hw = (1, self.ae_stride, self.ae_stride)
+
+ self.patch_size = args.patch_size
+ self.patch_size_t = args.patch_size_t
+ self.patch_size_thw = (self.patch_size_t, self.patch_size, self.patch_size)
+ self.patch_size_1hw = (1, self.patch_size, self.patch_size)
+
+ self.num_frames = args.num_frames
+ self.use_image_num = args.use_image_num
+ self.max_thw = (self.num_frames, self.max_image_size, self.max_image_size)
+ self.max_1hw = (1, self.max_image_size, self.max_image_size)
+
+ def package(self, batch):
+ batch_tubes_vid = [i['video_data']['video'] for i in batch] # b [c t h w]
+ input_ids_vid = torch.stack([i['video_data']['input_ids'] for i in batch]) # b 1 l
+ cond_mask_vid = torch.stack([i['video_data']['cond_mask'] for i in batch]) # b 1 l
+ batch_tubes_img, input_ids_img, cond_mask_img = None, None, None
+ if self.use_image_num != 0:
+ batch_tubes_img = [j for i in batch for j in i['image_data']['image']] # b*num_img [c 1 h w]
+ input_ids_img = torch.stack([i['image_data']['input_ids'] for i in batch]) # b image_num l
+ cond_mask_img = torch.stack([i['image_data']['cond_mask'] for i in batch]) # b image_num l
+ return batch_tubes_vid, input_ids_vid, cond_mask_vid, batch_tubes_img, input_ids_img, cond_mask_img
+
+ def __call__(self, batch):
+ batch_tubes_vid, input_ids_vid, cond_mask_vid, batch_tubes_img, input_ids_img, cond_mask_img = self.package(batch)
+
+ ds_stride = self.ae_stride * self.patch_size
+ t_ds_stride = self.ae_stride_t * self.patch_size_t
+ if self.use_image_num == 0:
+ pad_batch_tubes, attention_mask = self.process(batch_tubes_vid, t_ds_stride, ds_stride,
+ self.max_thw, self.ae_stride_thw, self.patch_size_thw, extra_1=True)
+ # attention_mask: b t h w
+ input_ids, cond_mask = input_ids_vid.squeeze(1), cond_mask_vid.squeeze(1) # b 1 l -> b l
+ else:
+ pad_batch_tubes_vid, attention_mask_vid = self.process(batch_tubes_vid, t_ds_stride, ds_stride,
+ self.max_thw, self.ae_stride_thw, self.patch_size_thw, extra_1=True)
+ # attention_mask_vid: b t h w
+ pad_batch_tubes_img, attention_mask_img = self.process(batch_tubes_img, 1, ds_stride,
+ self.max_1hw, self.ae_stride_1hw, self.patch_size_1hw, extra_1=False)
+ pad_batch_tubes_img = rearrange(pad_batch_tubes_img, '(b i) c 1 h w -> b c i h w', i=self.use_image_num)
+ attention_mask_img = rearrange(attention_mask_img, '(b i) 1 h w -> b i h w', i=self.use_image_num)
+ pad_batch_tubes = torch.cat([pad_batch_tubes_vid, pad_batch_tubes_img], dim=2) # concat at temporal, video first
+ # attention_mask_img: b num_img h w
+ attention_mask = torch.cat([attention_mask_vid, attention_mask_img], dim=1) # b t+num_img h w
+            input_ids = torch.cat([input_ids_vid, input_ids_img], dim=1)  # b 1+num_img l
+            cond_mask = torch.cat([cond_mask_vid, cond_mask_img], dim=1)  # b 1+num_img l
+ return pad_batch_tubes, attention_mask, input_ids, cond_mask
+
+ def process(self, batch_tubes, t_ds_stride, ds_stride, max_thw, ae_stride_thw, patch_size_thw, extra_1):
+
+ # pad to max multiple of ds_stride
+ batch_input_size = [i.shape for i in batch_tubes] # [(c t h w), (c t h w)]
+ max_t, max_h, max_w = max_thw
+ pad_max_t, pad_max_h, pad_max_w = pad_to_multiple(max_t-1 if extra_1 else max_t, t_ds_stride), \
+ pad_to_multiple(max_h, ds_stride), \
+ pad_to_multiple(max_w, ds_stride)
+ pad_max_t = pad_max_t + 1 if extra_1 else pad_max_t
+ each_pad_t_h_w = [[pad_max_t - i.shape[1],
+ pad_max_h - i.shape[2],
+ pad_max_w - i.shape[3]] for i in batch_tubes]
+ pad_batch_tubes = [F.pad(im,
+ (0, pad_w,
+ 0, pad_h,
+ 0, pad_t), value=0) for (pad_t, pad_h, pad_w), im in zip(each_pad_t_h_w, batch_tubes)]
+ pad_batch_tubes = torch.stack(pad_batch_tubes, dim=0)
+
+ # make attention_mask
+ # first_channel_first_frame, first_channel_other_frame = pad_batch_tubes[:, :1, :1], pad_batch_tubes[:, :1, 1:] # first channel to make attention_mask
+ # attention_mask_first_frame = F.max_pool3d(first_channel_first_frame, kernel_size=(1, *ae_stride_thw[1:]), stride=(1, *ae_stride_thw[1:]))
+ # if first_channel_other_frame.numel() != 0:
+ # attention_mask_other_frame = F.max_pool3d(first_channel_other_frame, kernel_size=ae_stride_thw, stride=ae_stride_thw)
+ # attention_mask = torch.cat([attention_mask_first_frame, attention_mask_other_frame], dim=2)
+ # else:
+ # attention_mask = attention_mask_first_frame
+ # attention_mask_ = attention_mask[:, 0].bool().float() # b t h w, do not channel
+
+ max_tube_size = [pad_max_t, pad_max_h, pad_max_w]
+ max_latent_size = [((max_tube_size[0]-1) // ae_stride_thw[0] + 1) if extra_1 else (max_tube_size[0] // ae_stride_thw[0]),
+ max_tube_size[1] // ae_stride_thw[1],
+ max_tube_size[2] // ae_stride_thw[2]]
+ valid_latent_size = [[int(math.ceil((i[1]-1) / ae_stride_thw[0])) + 1 if extra_1 else int(math.ceil(i[1] / ae_stride_thw[0])),
+ int(math.ceil(i[2] / ae_stride_thw[1])),
+ int(math.ceil(i[3] / ae_stride_thw[2]))] for i in batch_input_size]
+ attention_mask = [F.pad(torch.ones(i),
+ (0, max_latent_size[2] - i[2],
+ 0, max_latent_size[1] - i[1],
+ 0, max_latent_size[0] - i[0]), value=0) for i in valid_latent_size]
+ attention_mask = torch.stack(attention_mask) # b t h w
+
+
+ # max_tube_size = [pad_max_t, pad_max_h, pad_max_w]
+ # max_latent_size = [((max_tube_size[0]-1) // ae_stride_thw[0] + 1) if extra_1 else (max_tube_size[0] // ae_stride_thw[0]),
+ # max_tube_size[1] // ae_stride_thw[1],
+ # max_tube_size[2] // ae_stride_thw[2]]
+ # max_patchify_latent_size = [((max_latent_size[0]-1) // patch_size_thw[0] + 1) if extra_1 else (max_latent_size[0] // patch_size_thw[0]),
+ # max_latent_size[1] // patch_size_thw[1],
+ # max_latent_size[2] // patch_size_thw[2]]
+ # valid_patchify_latent_size = [[int(math.ceil((i[1]-1) / t_ds_stride)) + 1 if extra_1 else int(math.ceil(i[1] / t_ds_stride)),
+ # int(math.ceil(i[2] / ds_stride)),
+ # int(math.ceil(i[3] / ds_stride))] for i in batch_input_size]
+ # attention_mask = [F.pad(torch.ones(i),
+ # (0, max_patchify_latent_size[2] - i[2],
+ # 0, max_patchify_latent_size[1] - i[1],
+ # 0, max_patchify_latent_size[0] - i[0]), value=0) for i in valid_patchify_latent_size]
+ # attention_mask = torch.stack(attention_mask) # b t h w
+
+ return pad_batch_tubes, attention_mask
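Reader's note (not part of the patch): a minimal sketch of how the new `Collate` class is meant to be driven. The stride/patch/frame values and the fake batch below are illustrative assumptions, not values fixed by this diff; the dicts only mimic the `video_data` layout that `package()` unpacks.

```python
# Sketch only: assumes dataset_utils.py above is importable; all numbers
# here are illustrative, not mandated by the patch.
from types import SimpleNamespace
import torch
from opensora.models.causalvideovae.utils.dataset_utils import Collate

args = SimpleNamespace(
    max_image_size=512, ae_stride=8, ae_stride_t=4,   # VAE downsampling strides
    patch_size=2, patch_size_t=1,                     # transformer patch sizes
    num_frames=17, use_image_num=0,                   # video-only batch
)
collate = Collate(args)

def fake_item(h, w, l=300):
    # Mimics the per-sample dict layout that Collate.package() expects.
    return {'video_data': {'video': torch.randn(3, 17, h, w),
                           'input_ids': torch.zeros(1, l, dtype=torch.long),
                           'cond_mask': torch.ones(1, l, dtype=torch.long)}}

# Mixed spatial sizes: both clips are zero-padded up to the stride-aligned
# max size, and the mask marks which latent positions are valid.
videos, attn_mask, input_ids, cond_mask = collate([fake_item(480, 480), fake_item(512, 384)])
print(videos.shape)     # torch.Size([2, 3, 17, 512, 512])
print(attn_mask.shape)  # torch.Size([2, 5, 64, 64])  (latent t h w)
```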
diff --git a/opensora/models/causalvideovae/utils/downloader.py b/opensora/models/causalvideovae/utils/downloader.py
new file mode 100755
index 000000000..f2ac4017b
--- /dev/null
+++ b/opensora/models/causalvideovae/utils/downloader.py
@@ -0,0 +1,18 @@
+import gdown
+import os
+
+opensora_cache_home = os.path.expanduser(
+ os.getenv("OPENSORA_HOME", os.path.join("~/.cache", "opensora"))
+)
+
+
+def gdown_download(id, fname, cache_dir=None):
+ cache_dir = opensora_cache_home if not cache_dir else cache_dir
+
+ os.makedirs(cache_dir, exist_ok=True)
+ destination = os.path.join(cache_dir, fname)
+ if os.path.exists(destination):
+ return destination
+
+ gdown.download(id=id, output=destination, quiet=False)
+ return destination
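A hypothetical call pattern for the helper above (the Drive id and filename are placeholders, not real artifacts):

```python
# Placeholder id/filename; a second call hits the on-disk cache because the
# destination path is checked before downloading.
ckpt_path = gdown_download(id="<google-drive-file-id>", fname="model.ckpt")
ckpt_path = gdown_download(id="<google-drive-file-id>", fname="model.ckpt")  # cached, no re-download
```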
diff --git a/opensora/models/causalvideovae/utils/video_utils.py b/opensora/models/causalvideovae/utils/video_utils.py
new file mode 100755
index 000000000..d37f62680
--- /dev/null
+++ b/opensora/models/causalvideovae/utils/video_utils.py
@@ -0,0 +1,64 @@
+import torch
+import numpy as np
+import numpy.typing as npt
+import cv2
+from decord import VideoReader, cpu
+
+def array_to_video(
+ image_array: npt.NDArray, fps: float = 30.0, output_file: str = "output_video.mp4"
+) -> None:
+    """Write an array of frames shaped (T, H, W, C), RGB uint8, to an mp4 file."""
+ height, width, channels = image_array[0].shape
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+ video_writer = cv2.VideoWriter(output_file, fourcc, float(fps), (width, height))
+
+ for image in image_array:
+        image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # OpenCV writes BGR
+        video_writer.write(image_bgr)
+
+ video_writer.release()
+
+def custom_to_video(
+ x: torch.Tensor, fps: float = 2.0, output_file: str = "output_video.mp4"
+) -> None:
+ x = x.detach().cpu()
+ x = torch.clamp(x, -1, 1)
+ x = (x + 1) / 2
+ x = x.permute(1, 2, 3, 0).float().numpy()
+ x = (255 * x).astype(np.uint8)
+ array_to_video(x, fps=fps, output_file=output_file)
+ return
+
+def read_video(video_path: str, num_frames: int, sample_rate: int) -> torch.Tensor:
+ decord_vr = VideoReader(video_path, ctx=cpu(0), num_threads=8)
+ total_frames = len(decord_vr)
+ sample_frames_len = sample_rate * num_frames
+
+ if total_frames > sample_frames_len:
+ s = 0
+ e = s + sample_frames_len
+ else:
+ s = 0
+ e = total_frames
+ num_frames = int(total_frames / sample_frames_len * num_frames)
+ print(
+            f"sample_frames_len {sample_frames_len}, can only sample {num_frames * sample_rate}",
+ video_path,
+ total_frames,
+ )
+
+ frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
+ video_data = decord_vr.get_batch(frame_id_list).asnumpy()
+ video_data = torch.from_numpy(video_data)
+ video_data = video_data.permute(3, 0, 1, 2) # (T, H, W, C) -> (C, T, H, W)
+ return video_data
+
+def tensor_to_video(x):
+    """Convert a [0, 1] tensor (C, T, H, W) to uint8 frames (T, C, H, W)."""
+ x = (x * 2 - 1).detach().cpu()
+ x = torch.clamp(x, -1, 1)
+ x = (x + 1) / 2
+ x = x.permute(1, 0, 2, 3).float().numpy() # c t h w -> t c h w
+ x = (255 * x).astype(np.uint8)
+ return x
\ No newline at end of file
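A minimal round trip through the helpers above, assuming some local `clip.mp4` exists; the frame count, sample rate, and fps are illustrative:

```python
# read_video returns uint8 (C, T, H, W); custom_to_video expects values in [-1, 1].
video = read_video("clip.mp4", num_frames=16, sample_rate=2)
video = video.float() / 127.5 - 1.0
custom_to_video(video, fps=8.0, output_file="resampled.mp4")
```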
diff --git a/opensora/models/diffusion/__init__.py b/opensora/models/diffusion/__init__.py
index bf3abffa9..d4d44ac5f 100644
--- a/opensora/models/diffusion/__init__.py
+++ b/opensora/models/diffusion/__init__.py
@@ -1,7 +1,30 @@
-from .latte.modeling_latte import Latte_models
+from .opensora.modeling_opensora import OpenSora_models
+from .opensora1.modeling_opensora import OpenSora1_models
+from .opensora2.modeling_opensora import OpenSora2_models
+from .udit.modeling_udit import UDiT_models
+
+from .opensora2.modeling_inpaint import OpenSoraInpaint_models
Diffusion_models = {}
-Diffusion_models.update(Latte_models)
+Diffusion_models.update(OpenSora_models)
+Diffusion_models.update(OpenSora1_models)
+Diffusion_models.update(OpenSora2_models)
+Diffusion_models.update(UDiT_models)
+Diffusion_models.update(OpenSoraInpaint_models)
+
+from .opensora.modeling_opensora import OpenSora_models_class
+from .opensora1.modeling_opensora import OpenSora1_models_class
+from .opensora2.modeling_opensora import OpenSora2_models_class
+from .udit.modeling_udit import UDiT_models_class
+
+from .opensora2.modeling_inpaint import OpenSoraInpaint_models_class
+
+Diffusion_models_class = {}
+Diffusion_models_class.update(OpenSora_models_class)
+Diffusion_models_class.update(OpenSora1_models_class)
+Diffusion_models_class.update(OpenSora2_models_class)
+Diffusion_models_class.update(UDiT_models_class)
+Diffusion_models_class.update(OpenSoraInpaint_models_class)
\ No newline at end of file
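The two registries keep call sites decoupled from concrete architectures; a sketch of the intended lookup (the key is a placeholder, since the valid names live in the `modeling_*` modules touched elsewhere in this diff):

```python
from opensora.models.diffusion import Diffusion_models, Diffusion_models_class

model_fn = Diffusion_models["<registered-model-name>"]         # builder keyed by name
model_cls = Diffusion_models_class["<registered-model-name>"]  # class, e.g. for from_pretrained-style loading
```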
diff --git a/opensora/models/diffusion/diffusion/__init__.py b/opensora/models/diffusion/diffusion/__init__.py
deleted file mode 100644
index 04b2bd3d8..000000000
--- a/opensora/models/diffusion/diffusion/__init__.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Modified from OpenAI's diffusion repos
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-
-from .respace import SpacedDiffusion, space_timesteps, SpacedDiffusion_T
-
-
-def create_diffusion(
- timestep_respacing,
- noise_schedule="linear",
- use_kl=False,
- sigma_small=False,
- predict_xstart=False,
- learn_sigma=True,
- # learn_sigma=False,
- rescale_learned_sigmas=False,
- diffusion_steps=1000
-):
- from . import gaussian_diffusion as gd
- betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
- if use_kl:
- loss_type = gd.LossType.RESCALED_KL
- elif rescale_learned_sigmas:
- loss_type = gd.LossType.RESCALED_MSE
- else:
- loss_type = gd.LossType.MSE
- if timestep_respacing is None or timestep_respacing == "":
- timestep_respacing = [diffusion_steps]
- return SpacedDiffusion(
- use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
- betas=betas,
- model_mean_type=(
- gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
- ),
- model_var_type=(
- (
- gd.ModelVarType.FIXED_LARGE
- if not sigma_small
- else gd.ModelVarType.FIXED_SMALL
- )
- if not learn_sigma
- else gd.ModelVarType.LEARNED_RANGE
- ),
- loss_type=loss_type
- # rescale_timesteps=rescale_timesteps,
- )
-
-def create_diffusion_T(
- timestep_respacing,
- noise_schedule="linear",
- use_kl=False,
- sigma_small=False,
- predict_xstart=False,
- learn_sigma=True,
- # learn_sigma=False,
- rescale_learned_sigmas=False,
- diffusion_steps=1000
-):
- from . import gaussian_diffusion_t2v as gd
- betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
- if use_kl:
- loss_type = gd.LossType.RESCALED_KL
- elif rescale_learned_sigmas:
- loss_type = gd.LossType.RESCALED_MSE
- else:
- loss_type = gd.LossType.MSE
- if timestep_respacing is None or timestep_respacing == "":
- timestep_respacing = [diffusion_steps]
- return SpacedDiffusion_T(
- use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
- betas=betas,
- model_mean_type=(
- gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
- ),
- model_var_type=(
- (
- gd.ModelVarType.FIXED_LARGE
- if not sigma_small
- else gd.ModelVarType.FIXED_SMALL
- )
- if not learn_sigma
- else gd.ModelVarType.LEARNED_RANGE
- ),
- loss_type=loss_type
- # rescale_timesteps=rescale_timesteps,
- )
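For context on the deletion: these factories returned a `SpacedDiffusion` configured for epsilon prediction with learned sigma. A typical invocation in DiT-style code (argument values illustrative) looked like:

```python
# Illustrative only; this API is removed by the patch.
train_diffusion = create_diffusion(timestep_respacing="")      # full 1000-step schedule
sample_diffusion = create_diffusion(timestep_respacing="250")  # respaced 250-step sampling
```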
diff --git a/opensora/models/diffusion/diffusion/diffusion_utils.py b/opensora/models/diffusion/diffusion/diffusion_utils.py
deleted file mode 100644
index e493a6a3e..000000000
--- a/opensora/models/diffusion/diffusion/diffusion_utils.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Modified from OpenAI's diffusion repos
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-
-import torch as th
-import numpy as np
-
-
-def normal_kl(mean1, logvar1, mean2, logvar2):
- """
- Compute the KL divergence between two gaussians.
- Shapes are automatically broadcasted, so batches can be compared to
- scalars, among other use cases.
- """
- tensor = None
- for obj in (mean1, logvar1, mean2, logvar2):
- if isinstance(obj, th.Tensor):
- tensor = obj
- break
- assert tensor is not None, "at least one argument must be a Tensor"
-
- # Force variances to be Tensors. Broadcasting helps convert scalars to
- # Tensors, but it does not work for th.exp().
- logvar1, logvar2 = [
- x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
- for x in (logvar1, logvar2)
- ]
-
- return 0.5 * (
- -1.0
- + logvar2
- - logvar1
- + th.exp(logvar1 - logvar2)
- + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
- )
-
-
-def approx_standard_normal_cdf(x):
- """
- A fast approximation of the cumulative distribution function of the
- standard normal.
- """
- return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
-
-
-def continuous_gaussian_log_likelihood(x, *, means, log_scales):
- """
- Compute the log-likelihood of a continuous Gaussian distribution.
- :param x: the targets
- :param means: the Gaussian mean Tensor.
- :param log_scales: the Gaussian log stddev Tensor.
- :return: a tensor like x of log probabilities (in nats).
- """
- centered_x = x - means
- inv_stdv = th.exp(-log_scales)
- normalized_x = centered_x * inv_stdv
- log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)
- return log_probs
-
-
-def discretized_gaussian_log_likelihood(x, *, means, log_scales):
- """
- Compute the log-likelihood of a Gaussian distribution discretizing to a
- given image.
- :param x: the target images. It is assumed that this was uint8 values,
- rescaled to the range [-1, 1].
- :param means: the Gaussian mean Tensor.
- :param log_scales: the Gaussian log stddev Tensor.
- :return: a tensor like x of log probabilities (in nats).
- """
- assert x.shape == means.shape == log_scales.shape
- centered_x = x - means
- inv_stdv = th.exp(-log_scales)
- plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
- cdf_plus = approx_standard_normal_cdf(plus_in)
- min_in = inv_stdv * (centered_x - 1.0 / 255.0)
- cdf_min = approx_standard_normal_cdf(min_in)
- log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
- log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
- cdf_delta = cdf_plus - cdf_min
- log_probs = th.where(
- x < -0.999,
- log_cdf_plus,
- th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
- )
- assert log_probs.shape == x.shape
- return log_probs
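For reference, the removed `normal_kl` is the closed form of the KL divergence between two Gaussians, written directly in terms of the log-variances, and `approx_standard_normal_cdf` is the usual tanh approximation of the standard normal CDF:

```latex
\mathrm{KL}\!\left(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\right)
  = \tfrac{1}{2}\!\left(-1 + \log\sigma_2^2 - \log\sigma_1^2
    + e^{\log\sigma_1^2 - \log\sigma_2^2}
    + (\mu_1-\mu_2)^2\, e^{-\log\sigma_2^2}\right),
\qquad
\Phi(x) \approx \tfrac{1}{2}\!\left(1 + \tanh\!\big(\sqrt{2/\pi}\,(x + 0.044715\,x^3)\big)\right).
```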
diff --git a/opensora/models/diffusion/diffusion/gaussian_diffusion.py b/opensora/models/diffusion/diffusion/gaussian_diffusion.py
deleted file mode 100644
index 7fc3d43a7..000000000
--- a/opensora/models/diffusion/diffusion/gaussian_diffusion.py
+++ /dev/null
@@ -1,881 +0,0 @@
-# Modified from OpenAI's diffusion repos
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-
-
-import math
-
-import numpy as np
-import torch as th
-import enum
-
-from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
-
-
-def mean_flat(tensor):
- """
- Take the mean over all non-batch dimensions.
- """
- return tensor.mean(dim=list(range(1, len(tensor.shape))))
-
-
-class ModelMeanType(enum.Enum):
- """
- Which type of output the model predicts.
- """
-
- PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
- START_X = enum.auto() # the model predicts x_0
- EPSILON = enum.auto() # the model predicts epsilon
-
-
-class ModelVarType(enum.Enum):
- """
- What is used as the model's output variance.
- The LEARNED_RANGE option has been added to allow the model to predict
- values between FIXED_SMALL and FIXED_LARGE, making its job easier.
- """
-
- LEARNED = enum.auto()
- FIXED_SMALL = enum.auto()
- FIXED_LARGE = enum.auto()
- LEARNED_RANGE = enum.auto()
-
-
-class LossType(enum.Enum):
- MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
- RESCALED_MSE = (
- enum.auto()
- ) # use raw MSE loss (with RESCALED_KL when learning variances)
- KL = enum.auto() # use the variational lower-bound
- RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
-
- def is_vb(self):
- return self == LossType.KL or self == LossType.RESCALED_KL
-
-
-def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- warmup_time = int(num_diffusion_timesteps * warmup_frac)
- betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
- return betas
-
-
-def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
- """
- This is the deprecated API for creating beta schedules.
- See get_named_beta_schedule() for the new library of schedules.
- """
- if beta_schedule == "quad":
- betas = (
- np.linspace(
- beta_start ** 0.5,
- beta_end ** 0.5,
- num_diffusion_timesteps,
- dtype=np.float64,
- )
- ** 2
- )
- elif beta_schedule == "linear":
- betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "warmup10":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
- elif beta_schedule == "warmup50":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
- elif beta_schedule == "const":
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
- betas = 1.0 / np.linspace(
- num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
- )
- else:
- raise NotImplementedError(beta_schedule)
- assert betas.shape == (num_diffusion_timesteps,)
- return betas
-
-
-def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
- """
- Get a pre-defined beta schedule for the given name.
- The beta schedule library consists of beta schedules which remain similar
- in the limit of num_diffusion_timesteps.
- Beta schedules may be added, but should not be removed or changed once
- they are committed to maintain backwards compatibility.
- """
- if schedule_name == "linear":
- # Linear schedule from Ho et al, extended to work for any number of
- # diffusion steps.
- scale = 1000 / num_diffusion_timesteps
- return get_beta_schedule(
- "linear",
- beta_start=scale * 0.0001,
- beta_end=scale * 0.02,
- num_diffusion_timesteps=num_diffusion_timesteps,
- )
- elif schedule_name == "squaredcos_cap_v2":
- return betas_for_alpha_bar(
- num_diffusion_timesteps,
- lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
- )
- else:
- raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
-
-
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function,
- which defines the cumulative product of (1-beta) over time from t = [0,1].
- :param num_diffusion_timesteps: the number of betas to produce.
- :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
- produces the cumulative product of (1-beta) up to that
- part of the diffusion process.
- :param max_beta: the maximum beta to use; use values lower than 1 to
- prevent singularities.
- """
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
- return np.array(betas)
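In this removed module, `squaredcos_cap_v2` plugs the cosine schedule of Nichol & Dhariwal into `betas_for_alpha_bar`:

```latex
\bar{\alpha}(t) = \cos^2\!\left(\frac{t + 0.008}{1.008}\cdot\frac{\pi}{2}\right),
\qquad
\beta_i = \min\!\left(1 - \frac{\bar{\alpha}\!\left(\frac{i+1}{T}\right)}{\bar{\alpha}\!\left(\frac{i}{T}\right)},\; 0.999\right).
```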
-
-
-class GaussianDiffusion:
- """
- Utilities for training and sampling diffusion models.
- Original ported from this codebase:
- https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
- :param betas: a 1-D numpy array of betas for each diffusion timestep,
- starting at T and going to 1.
- """
-
- def __init__(
- self,
- *,
- betas,
- model_mean_type,
- model_var_type,
- loss_type
- ):
-
- self.model_mean_type = model_mean_type
- self.model_var_type = model_var_type
- self.loss_type = loss_type
-
- # Use float64 for accuracy.
- betas = np.array(betas, dtype=np.float64)
- self.betas = betas
- assert len(betas.shape) == 1, "betas must be 1-D"
- assert (betas > 0).all() and (betas <= 1).all()
-
- self.num_timesteps = int(betas.shape[0])
-
- alphas = 1.0 - betas
- self.alphas_cumprod = np.cumprod(alphas, axis=0)
- self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
- self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
- assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
-
- # calculations for diffusion q(x_t | x_{t-1}) and others
- self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
- self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
- self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
- self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
- self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
-
- # calculations for posterior q(x_{t-1} | x_t, x_0)
- self.posterior_variance = (
- betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
- )
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
- self.posterior_log_variance_clipped = np.log(
- np.append(self.posterior_variance[1], self.posterior_variance[1:])
- ) if len(self.posterior_variance) > 1 else np.array([])
-
- self.posterior_mean_coef1 = (
- betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
- )
- self.posterior_mean_coef2 = (
- (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
- )
-
- def q_mean_variance(self, x_start, t):
- """
- Get the distribution q(x_t | x_0).
- :param x_start: the [N x C x ...] tensor of noiseless inputs.
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
- :return: A tuple (mean, variance, log_variance), all of x_start's shape.
- """
- mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
- variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
- log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
- return mean, variance, log_variance
-
- def q_sample(self, x_start, t, noise=None):
- """
- Diffuse the data for a given number of diffusion steps.
- In other words, sample from q(x_t | x_0).
- :param x_start: the initial data batch.
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
- :param noise: if specified, the split-out normal noise.
- :return: A noisy version of x_start.
- """
- if noise is None:
- noise = th.randn_like(x_start)
- assert noise.shape == x_start.shape
- return (
- _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
- + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
- )
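In symbols, with $\bar{\alpha}_t = \prod_{s \le t}(1-\beta_s)$, the forward-process sample returned above is:

```latex
x_t = \sqrt{\bar{\alpha}_t}\, x_0 + \sqrt{1-\bar{\alpha}_t}\,\epsilon,
\qquad \epsilon \sim \mathcal{N}(0, I).
```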
-
- def q_posterior_mean_variance(self, x_start, x_t, t):
- """
- Compute the mean and variance of the diffusion posterior:
- q(x_{t-1} | x_t, x_0)
- """
- assert x_start.shape == x_t.shape
- posterior_mean = (
- _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
- + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
- )
- posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
- posterior_log_variance_clipped = _extract_into_tensor(
- self.posterior_log_variance_clipped, t, x_t.shape
- )
- assert (
- posterior_mean.shape[0]
- == posterior_variance.shape[0]
- == posterior_log_variance_clipped.shape[0]
- == x_start.shape[0]
- )
- return posterior_mean, posterior_variance, posterior_log_variance_clipped
-
- def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
- """
- Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
- the initial x, x_0.
- :param model: the model, which takes a signal and a batch of timesteps
- as input.
- :param x: the [N x C x ...] tensor at time t.
- :param t: a 1-D Tensor of timesteps.
- :param clip_denoised: if True, clip the denoised signal into [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample. Applies before
- clip_denoised.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict with the following keys:
- - 'mean': the model mean output.
- - 'variance': the model variance output.
- - 'log_variance': the log of 'variance'.
- - 'pred_xstart': the prediction for x_0.
- """
- if model_kwargs is None:
- model_kwargs = {}
-
- B, F, C = x.shape[:3]
- assert t.shape == (B,)
- model_output = model(x, t, **model_kwargs)
- # try:
- # model_output = model_output.sample # for tav unet
- # except:
- # model_output = model(x, t, **model_kwargs)
- if isinstance(model_output, tuple):
- model_output, extra = model_output
- else:
- extra = None
-
- if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
- assert model_output.shape == (B, F, C * 2, *x.shape[3:])
- model_output, model_var_values = th.split(model_output, C, dim=2)
- min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
- max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
- # The model_var_values is [-1, 1] for [min_var, max_var].
- frac = (model_var_values + 1) / 2
- model_log_variance = frac * max_log + (1 - frac) * min_log
- model_variance = th.exp(model_log_variance)
- else:
- model_variance, model_log_variance = {
- # for fixedlarge, we set the initial (log-)variance like so
- # to get a better decoder log likelihood.
- ModelVarType.FIXED_LARGE: (
- np.append(self.posterior_variance[1], self.betas[1:]),
- np.log(np.append(self.posterior_variance[1], self.betas[1:])),
- ),
- ModelVarType.FIXED_SMALL: (
- self.posterior_variance,
- self.posterior_log_variance_clipped,
- ),
- }[self.model_var_type]
- model_variance = _extract_into_tensor(model_variance, t, x.shape)
- model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
-
- def process_xstart(x):
- if denoised_fn is not None:
- x = denoised_fn(x)
- if clip_denoised:
- return x.clamp(-1, 1)
- return x
-
- if self.model_mean_type == ModelMeanType.START_X:
- pred_xstart = process_xstart(model_output)
- else:
- pred_xstart = process_xstart(
- self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
- )
- model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
-
- assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
- return {
- "mean": model_mean,
- "variance": model_variance,
- "log_variance": model_log_variance,
- "pred_xstart": pred_xstart,
- "extra": extra,
- }
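The learned-range branch above follows Improved DDPM: the network output $v$ (`model_var_values`) interpolates, in log space, between the smallest variance $\tilde{\beta}_t$ (the clipped posterior variance) and the largest $\beta_t$:

```latex
\log \Sigma_\theta(x_t, t) = \frac{v+1}{2}\,\log\beta_t
  + \left(1 - \frac{v+1}{2}\right)\log\tilde{\beta}_t .
```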
-
- def _predict_xstart_from_eps(self, x_t, t, eps):
- assert x_t.shape == eps.shape
- return (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
- - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
- )
-
- def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
- return (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
- ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
-
- def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
- """
- Compute the mean for the previous step, given a function cond_fn that
- computes the gradient of a conditional log probability with respect to
- x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
- condition on y.
- This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
- """
- gradient = cond_fn(x, t, **model_kwargs)
- new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
- return new_mean
-
- def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
- """
- Compute what the p_mean_variance output would have been, should the
- model's score function be conditioned by cond_fn.
- See condition_mean() for details on cond_fn.
- Unlike condition_mean(), this instead uses the conditioning strategy
- from Song et al (2020).
- """
- alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
-
- eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
- eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
-
- out = p_mean_var.copy()
- out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
- out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
- return out
-
- def p_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- ):
- """
- Sample x_{t-1} from the model at the given timestep.
- :param model: the model to sample from.
- :param x: the current tensor at x_{t-1}.
- :param t: the value of t, starting at 0 for the first diffusion step.
- :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample.
- :param cond_fn: if not None, this is a gradient function that acts
- similarly to the model.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict containing the following keys:
- - 'sample': a random sample from the model.
- - 'pred_xstart': a prediction of x_0.
- """
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- noise = th.randn_like(x)
- nonzero_mask = (
- (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
- ) # no noise when t == 0
- if cond_fn is not None:
- out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
- sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
- return {"sample": sample, "pred_xstart": out["pred_xstart"]}
-
- def p_sample_loop(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- ):
- """
- Generate samples from the model.
- :param model: the model module.
- :param shape: the shape of the samples, (N, C, H, W).
- :param noise: if specified, the noise from the encoder to sample.
- Should be of the same shape as `shape`.
- :param clip_denoised: if True, clip x_start predictions to [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample.
- :param cond_fn: if not None, this is a gradient function that acts
- similarly to the model.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :param device: if specified, the device to create the samples on.
- If not specified, use a model parameter's device.
- :param progress: if True, show a tqdm progress bar.
- :return: a non-differentiable batch of samples.
- """
- final = None
- for sample in self.p_sample_loop_progressive(
- model,
- shape,
- noise=noise,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- device=device,
- progress=progress,
- ):
- final = sample
- return final["sample"]
-
- def p_sample_loop_progressive(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- ):
- """
- Generate samples from the model and yield intermediate samples from
- each timestep of diffusion.
- Arguments are the same as p_sample_loop().
- Returns a generator over dicts, where each dict is the return value of
- p_sample().
- """
- if device is None:
- device = next(model.parameters()).device
- assert isinstance(shape, (tuple, list))
- if noise is not None:
- img = noise
- else:
- img = th.randn(*shape, device=device)
- indices = list(range(self.num_timesteps))[::-1]
-
- if progress:
- # Lazy import so that we don't depend on tqdm.
- from tqdm.auto import tqdm
-
- indices = tqdm(indices)
-
- for i in indices:
- t = th.tensor([i] * shape[0], device=device)
- with th.no_grad():
- out = self.p_sample(
- model,
- img,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- )
- yield out
- img = out["sample"]
-
- def ddim_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- eta=0.0,
- ):
- """
- Sample x_{t-1} from the model using DDIM.
- Same usage as p_sample().
- """
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- if cond_fn is not None:
- out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
-
- # Usually our model outputs epsilon, but we re-derive it
- # in case we used x_start or x_prev prediction.
- eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
-
- alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
- alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
- sigma = (
- eta
- * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
- * th.sqrt(1 - alpha_bar / alpha_bar_prev)
- )
- # Equation 12.
- noise = th.randn_like(x)
- mean_pred = (
- out["pred_xstart"] * th.sqrt(alpha_bar_prev)
- + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
- )
- nonzero_mask = (
- (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
- ) # no noise when t == 0
- sample = mean_pred + nonzero_mask * sigma * noise
- return {"sample": sample, "pred_xstart": out["pred_xstart"]}
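The "Equation 12" referenced in the comment is the DDIM update of Song et al. (2020), with $\hat{x}_0$ the current $x_0$ prediction and $\hat{\epsilon}$ the re-derived noise:

```latex
x_{t-1} = \sqrt{\bar{\alpha}_{t-1}}\,\hat{x}_0
        + \sqrt{1-\bar{\alpha}_{t-1}-\sigma_t^2}\,\hat{\epsilon}
        + \sigma_t\, z,
\qquad
\sigma_t = \eta\,\sqrt{\frac{1-\bar{\alpha}_{t-1}}{1-\bar{\alpha}_t}}\;\sqrt{1-\frac{\bar{\alpha}_t}{\bar{\alpha}_{t-1}}},
\quad z \sim \mathcal{N}(0, I).
```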
-
- def ddim_reverse_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- eta=0.0,
- ):
- """
- Sample x_{t+1} from the model using DDIM reverse ODE.
- """
- assert eta == 0.0, "Reverse ODE only for deterministic path"
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- if cond_fn is not None:
- out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
- # Usually our model outputs epsilon, but we re-derive it
- # in case we used x_start or x_prev prediction.
- eps = (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
- - out["pred_xstart"]
- ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
- alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
-
- # Equation 12. reversed
- mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
-
- return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
-
- def ddim_sample_loop(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- eta=0.0,
- ):
- """
- Generate samples from the model using DDIM.
- Same usage as p_sample_loop().
- """
- final = None
- for sample in self.ddim_sample_loop_progressive(
- model,
- shape,
- noise=noise,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- device=device,
- progress=progress,
- eta=eta,
- ):
- final = sample
- return final["sample"]
-
- def ddim_sample_loop_progressive(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- eta=0.0,
- ):
- """
- Use DDIM to sample from the model and yield intermediate samples from
- each timestep of DDIM.
- Same usage as p_sample_loop_progressive().
- """
- if device is None:
- device = next(model.parameters()).device
- assert isinstance(shape, (tuple, list))
- if noise is not None:
- img = noise
- else:
- img = th.randn(*shape, device=device)
- indices = list(range(self.num_timesteps))[::-1]
-
- if progress:
- # Lazy import so that we don't depend on tqdm.
- from tqdm.auto import tqdm
-
- indices = tqdm(indices)
-
- for i in indices:
- t = th.tensor([i] * shape[0], device=device)
- with th.no_grad():
- out = self.ddim_sample(
- model,
- img,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- eta=eta,
- )
- yield out
- img = out["sample"]
-
- def _vb_terms_bpd(
- self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
- ):
- """
- Get a term for the variational lower-bound.
- The resulting units are bits (rather than nats, as one might expect).
- This allows for comparison to other papers.
- :return: a dict with the following keys:
- - 'output': a shape [N] tensor of NLLs or KLs.
- - 'pred_xstart': the x_0 predictions.
- """
- true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
- x_start=x_start, x_t=x_t, t=t
- )
- out = self.p_mean_variance(
- model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
- )
- kl = normal_kl(
- true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
- )
- kl = mean_flat(kl) / np.log(2.0)
-
- decoder_nll = -discretized_gaussian_log_likelihood(
- x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
- )
- assert decoder_nll.shape == x_start.shape
- decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
-
- # At the first timestep return the decoder NLL,
- # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
- output = th.where((t == 0), decoder_nll, kl)
- return {"output": output, "pred_xstart": out["pred_xstart"]}
-
- def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
- """
- Compute training losses for a single timestep.
- :param model: the model to evaluate loss on.
- :param x_start: the [N x C x ...] tensor of inputs.
- :param t: a batch of timestep indices.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :param noise: if specified, the specific Gaussian noise to try to remove.
- :return: a dict with the key "loss" containing a tensor of shape [N].
- Some mean or variance settings may also have other keys.
- """
- if model_kwargs is None:
- model_kwargs = {}
- if noise is None:
- noise = th.randn_like(x_start)
- x_t = self.q_sample(x_start, t, noise=noise)
-
- terms = {}
-
- if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
- terms["loss"] = self._vb_terms_bpd(
- model=model,
- x_start=x_start,
- x_t=x_t,
- t=t,
- clip_denoised=False,
- model_kwargs=model_kwargs,
- )["output"]
- if self.loss_type == LossType.RESCALED_KL:
- terms["loss"] *= self.num_timesteps
- elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
- model_output = model(x_t, t, **model_kwargs)
- # try:
- # model_output = model(x_t, t, **model_kwargs).sample # for tav unet
- # except:
- # model_output = model(x_t, t, **model_kwargs)
-
- if self.model_var_type in [
- ModelVarType.LEARNED,
- ModelVarType.LEARNED_RANGE,
- ]:
- B, F, C = x_t.shape[:3]
- assert model_output.shape == (B, F, C * 2, *x_t.shape[3:])
- model_output, model_var_values = th.split(model_output, C, dim=2)
- # Learn the variance using the variational bound, but don't let
- # it affect our mean prediction.
- frozen_out = th.cat([model_output.detach(), model_var_values], dim=2)
- terms["vb"] = self._vb_terms_bpd(
- model=lambda *args, r=frozen_out: r,
- x_start=x_start,
- x_t=x_t,
- t=t,
- clip_denoised=False,
- )["output"]
- if self.loss_type == LossType.RESCALED_MSE:
- # Divide by 1000 for equivalence with initial implementation.
- # Without a factor of 1/1000, the VB term hurts the MSE term.
- terms["vb"] *= self.num_timesteps / 1000.0
-
- target = {
- ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
- x_start=x_start, x_t=x_t, t=t
- )[0],
- ModelMeanType.START_X: x_start,
- ModelMeanType.EPSILON: noise,
- }[self.model_mean_type]
- assert model_output.shape == target.shape == x_start.shape
- terms["mse"] = mean_flat((target - model_output) ** 2)
- if "vb" in terms:
- terms["loss"] = terms["mse"] + terms["vb"]
- else:
- terms["loss"] = terms["mse"]
- else:
- raise NotImplementedError(self.loss_type)
-
- return terms
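The MSE/RESCALED_MSE branch above implements the hybrid objective of Improved DDPM: the simple epsilon MSE plus a variational term through which only the variance head receives gradients (the mean is detached in `frozen_out`):

```latex
L_{\text{hybrid}} =
  \underbrace{\mathbb{E}_{t, x_0, \epsilon}\big[\lVert \epsilon - \epsilon_\theta(x_t, t)\rVert^2\big]}_{\texttt{terms["mse"]}}
  + \underbrace{\lambda\, L_{\text{vlb}}}_{\texttt{terms["vb"]}},
```

where, under `RESCALED_MSE`, the VLB term is scaled by $\lambda = T/1000$ to match the original implementation.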
-
- def _prior_bpd(self, x_start):
- """
- Get the prior KL term for the variational lower-bound, measured in
- bits-per-dim.
- This term can't be optimized, as it only depends on the encoder.
- :param x_start: the [N x C x ...] tensor of inputs.
- :return: a batch of [N] KL values (in bits), one per batch element.
- """
- batch_size = x_start.shape[0]
- t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
- qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
- kl_prior = normal_kl(
- mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
- )
- return mean_flat(kl_prior) / np.log(2.0)
-
- def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
- """
- Compute the entire variational lower-bound, measured in bits-per-dim,
- as well as other related quantities.
- :param model: the model to evaluate loss on.
- :param x_start: the [N x C x ...] tensor of inputs.
- :param clip_denoised: if True, clip denoised samples.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict containing the following keys:
- - total_bpd: the total variational lower-bound, per batch element.
- - prior_bpd: the prior term in the lower-bound.
- - vb: an [N x T] tensor of terms in the lower-bound.
- - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
- - mse: an [N x T] tensor of epsilon MSEs for each timestep.
- """
- device = x_start.device
- batch_size = x_start.shape[0]
-
- vb = []
- xstart_mse = []
- mse = []
- for t in list(range(self.num_timesteps))[::-1]:
- t_batch = th.tensor([t] * batch_size, device=device)
- noise = th.randn_like(x_start)
- x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
- # Calculate VLB term at the current timestep
- with th.no_grad():
- out = self._vb_terms_bpd(
- model,
- x_start=x_start,
- x_t=x_t,
- t=t_batch,
- clip_denoised=clip_denoised,
- model_kwargs=model_kwargs,
- )
- vb.append(out["output"])
- xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
- eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
- mse.append(mean_flat((eps - noise) ** 2))
-
- vb = th.stack(vb, dim=1)
- xstart_mse = th.stack(xstart_mse, dim=1)
- mse = th.stack(mse, dim=1)
-
- prior_bpd = self._prior_bpd(x_start)
- total_bpd = vb.sum(dim=1) + prior_bpd
- return {
- "total_bpd": total_bpd,
- "prior_bpd": prior_bpd,
- "vb": vb,
- "xstart_mse": xstart_mse,
- "mse": mse,
- }
-
-
-def _extract_into_tensor(arr, timesteps, broadcast_shape):
- """
- Extract values from a 1-D numpy array for a batch of indices.
- :param arr: the 1-D numpy array.
- :param timesteps: a tensor of indices into the array to extract.
- :param broadcast_shape: a larger shape of K dimensions with the batch
- dimension equal to the length of timesteps.
- :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
- """
- res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
- while len(res.shape) < len(broadcast_shape):
- res = res[..., None]
- return res + th.zeros(broadcast_shape, device=timesteps.device)
diff --git a/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py b/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py
deleted file mode 100644
index b99b0d097..000000000
--- a/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py
+++ /dev/null
@@ -1,904 +0,0 @@
-# Modified from OpenAI's diffusion repos
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-
-
-import math
-
-import numpy as np
-import torch as th
-import enum
-
-from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
-
-
-def mean_flat(tensor):
- """
- Take the mean over all non-batch dimensions.
- """
- return tensor.mean(dim=list(range(1, len(tensor.shape))))
-
-
-class ModelMeanType(enum.Enum):
- """
- Which type of output the model predicts.
- """
-
- PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
- START_X = enum.auto() # the model predicts x_0
- EPSILON = enum.auto() # the model predicts epsilon
-
-
-class ModelVarType(enum.Enum):
- """
- What is used as the model's output variance.
- The LEARNED_RANGE option has been added to allow the model to predict
- values between FIXED_SMALL and FIXED_LARGE, making its job easier.
- """
-
- LEARNED = enum.auto()
- FIXED_SMALL = enum.auto()
- FIXED_LARGE = enum.auto()
- LEARNED_RANGE = enum.auto()
-
-
-class LossType(enum.Enum):
- MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
- RESCALED_MSE = (
- enum.auto()
- ) # use raw MSE loss (with RESCALED_KL when learning variances)
- KL = enum.auto() # use the variational lower-bound
- RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
-
- def is_vb(self):
- return self == LossType.KL or self == LossType.RESCALED_KL
-
-
-def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- warmup_time = int(num_diffusion_timesteps * warmup_frac)
- betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
- return betas
-
-
-def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
- """
- This is the deprecated API for creating beta schedules.
- See get_named_beta_schedule() for the new library of schedules.
- """
- if beta_schedule == "quad":
- betas = (
- np.linspace(
- beta_start ** 0.5,
- beta_end ** 0.5,
- num_diffusion_timesteps,
- dtype=np.float64,
- )
- ** 2
- )
- elif beta_schedule == "linear":
- betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "warmup10":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
- elif beta_schedule == "warmup50":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
- elif beta_schedule == "const":
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
- betas = 1.0 / np.linspace(
- num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
- )
- else:
- raise NotImplementedError(beta_schedule)
- assert betas.shape == (num_diffusion_timesteps,)
- return betas
-
-
-def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
- """
- Get a pre-defined beta schedule for the given name.
- The beta schedule library consists of beta schedules which remain similar
- in the limit of num_diffusion_timesteps.
- Beta schedules may be added, but should not be removed or changed once
- they are committed to maintain backwards compatibility.
- """
- if schedule_name == "linear":
- # Linear schedule from Ho et al, extended to work for any number of
- # diffusion steps.
- scale = 1000 / num_diffusion_timesteps
- return get_beta_schedule(
- "linear",
- beta_start=scale * 0.0001,
- beta_end=scale * 0.02,
- num_diffusion_timesteps=num_diffusion_timesteps,
- )
- elif schedule_name == "squaredcos_cap_v2":
- return betas_for_alpha_bar(
- num_diffusion_timesteps,
- lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
- )
- else:
- raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
-
-
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function,
- which defines the cumulative product of (1-beta) over time from t = [0,1].
- :param num_diffusion_timesteps: the number of betas to produce.
- :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
- produces the cumulative product of (1-beta) up to that
- part of the diffusion process.
- :param max_beta: the maximum beta to use; use values lower than 1 to
- prevent singularities.
- """
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
- return np.array(betas)
-
-
-class GaussianDiffusion_T:
- """
- Utilities for training and sampling diffusion models.
- Original ported from this codebase:
- https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
- :param betas: a 1-D numpy array of betas for each diffusion timestep,
- starting at T and going to 1.
- """
-
- def __init__(
- self,
- *,
- betas,
- model_mean_type,
- model_var_type,
- loss_type
- ):
-
- self.model_mean_type = model_mean_type
- self.model_var_type = model_var_type
- self.loss_type = loss_type
-
- # Use float64 for accuracy.
- betas = np.array(betas, dtype=np.float64)
- self.betas = betas
- assert len(betas.shape) == 1, "betas must be 1-D"
- assert (betas > 0).all() and (betas <= 1).all()
-
- self.num_timesteps = int(betas.shape[0])
-
- alphas = 1.0 - betas
- self.alphas_cumprod = np.cumprod(alphas, axis=0)
- self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
- self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
- assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
-
- # calculations for diffusion q(x_t | x_{t-1}) and others
- self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
- self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
- self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
- self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
- self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
-
- # calculations for posterior q(x_{t-1} | x_t, x_0)
- self.posterior_variance = (
- betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
- )
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
- self.posterior_log_variance_clipped = np.log(
- np.append(self.posterior_variance[1], self.posterior_variance[1:])
- ) if len(self.posterior_variance) > 1 else np.array([])
-
- self.posterior_mean_coef1 = (
- betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
- )
- self.posterior_mean_coef2 = (
- (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
- )
-
- def q_mean_variance(self, x_start, t):
- """
- Get the distribution q(x_t | x_0).
- :param x_start: the [N x C x ...] tensor of noiseless inputs.
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
- :return: A tuple (mean, variance, log_variance), all of x_start's shape.
- """
- mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
- variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
- log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
- return mean, variance, log_variance
-
- def q_sample(self, x_start, t, noise=None):
- """
- Diffuse the data for a given number of diffusion steps.
- In other words, sample from q(x_t | x_0).
- :param x_start: the initial data batch.
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
- :param noise: if specified, the split-out normal noise.
- :return: A noisy version of x_start.
- """
- if noise is None:
- noise = th.randn_like(x_start)
- assert noise.shape == x_start.shape
- return (
- _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
- + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
- )
-
- def q_posterior_mean_variance(self, x_start, x_t, t):
- """
- Compute the mean and variance of the diffusion posterior:
- q(x_{t-1} | x_t, x_0)
- """
- assert x_start.shape == x_t.shape
- posterior_mean = (
- _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
- + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
- )
- posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
- posterior_log_variance_clipped = _extract_into_tensor(
- self.posterior_log_variance_clipped, t, x_t.shape
- )
- assert (
- posterior_mean.shape[0]
- == posterior_variance.shape[0]
- == posterior_log_variance_clipped.shape[0]
- == x_start.shape[0]
- )
- return posterior_mean, posterior_variance, posterior_log_variance_clipped
-
- def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
- """
- Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
- the initial x, x_0.
- :param model: the model, which takes a signal and a batch of timesteps
- as input.
- :param x: the [N x C x ...] tensor at time t.
- :param t: a 1-D Tensor of timesteps.
- :param clip_denoised: if True, clip the denoised signal into [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample. Applies before
- clip_denoised.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict with the following keys:
- - 'mean': the model mean output.
- - 'variance': the model variance output.
- - 'log_variance': the log of 'variance'.
- - 'pred_xstart': the prediction for x_0.
- """
- if model_kwargs is None:
- model_kwargs = {}
-
- #B, F, C = x.shape[:3]
- B, C, F = x.shape[:3]
- assert t.shape == (B,)
- model_output = model(x, t, **model_kwargs)
-
- try:
- model_output.shape
- except:
- model_output = model_output[0]
- # try:
- # model_output = model_output.sample # for tav unet
- # except:
- # model_output = model(x, t, **model_kwargs)
- if isinstance(model_output, tuple):
- model_output, extra = model_output
- else:
- extra = None
-
- if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
- #assert model_output.shape == (B, F, C * 2, *x.shape[3:])
- #model_output, model_var_values = th.split(model_output, C, dim=2)
- #the output shape of uncondition or class condition latte is not the same as the latte_t2v
- #BFCHW vs BCFHW
- assert model_output.shape == (B, C * 2, F, *x.shape[3:]), f'model_output.shape ({model_output.shape}), != {(B, C * 2, F, *x.shape[3:])}'
- model_output, model_var_values = th.split(model_output, C, dim=1)
- min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
- max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
- # The model_var_values is [-1, 1] for [min_var, max_var].
- frac = (model_var_values + 1) / 2
- model_log_variance = frac * max_log + (1 - frac) * min_log
- model_variance = th.exp(model_log_variance)
- else:
- model_variance, model_log_variance = {
- # for fixedlarge, we set the initial (log-)variance like so
- # to get a better decoder log likelihood.
- ModelVarType.FIXED_LARGE: (
- np.append(self.posterior_variance[1], self.betas[1:]),
- np.log(np.append(self.posterior_variance[1], self.betas[1:])),
- ),
- ModelVarType.FIXED_SMALL: (
- self.posterior_variance,
- self.posterior_log_variance_clipped,
- ),
- }[self.model_var_type]
- model_variance = _extract_into_tensor(model_variance, t, x.shape)
- model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
-
- def process_xstart(x):
- if denoised_fn is not None:
- x = denoised_fn(x)
- if clip_denoised:
- return x.clamp(-1, 1)
- return x
-
- if self.model_mean_type == ModelMeanType.START_X:
- pred_xstart = process_xstart(model_output)
- else:
- pred_xstart = process_xstart(
- self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
- )
- model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
-
- assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
- return {
- "mean": model_mean,
- "variance": model_variance,
- "log_variance": model_log_variance,
- "pred_xstart": pred_xstart,
- "extra": extra,
- }
-
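For reference, a minimal stand-alone sketch of the `LEARNED_RANGE` interpolation above; the tensor values are illustrative stand-ins for `posterior_log_variance_clipped[t]` and `log(betas[t])`:

```python
import torch

# The network emits v in [-1, 1] per element; frac = (v + 1) / 2 maps it to
# [0, 1] and selects a log-variance between the clipped posterior variance
# (lower bound) and beta_t (upper bound).
min_log = torch.log(torch.tensor(1e-4))  # stand-in for posterior_log_variance_clipped[t]
max_log = torch.log(torch.tensor(2e-2))  # stand-in for log(betas[t])
v = torch.tensor(0.5)                    # hypothetical model_var_values entry
frac = (v + 1) / 2
model_log_variance = frac * max_log + (1 - frac) * min_log
model_variance = model_log_variance.exp()
```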
- def _predict_xstart_from_eps(self, x_t, t, eps):
- assert x_t.shape == eps.shape
- return (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
- - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
- )
-
- def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
- return (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
- ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
-
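These two helpers invert the forward-process identity x_t = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps, with 1/sqrt(a_bar_t) and sqrt(1/a_bar_t - 1) as the precomputed coefficients. A quick numerical check (scalar a_bar for illustration):

```python
import torch

alpha_bar = torch.tensor(0.7)                     # illustrative cumulative alpha
x0, eps = torch.randn(4), torch.randn(4)
x_t = alpha_bar.sqrt() * x0 + (1 - alpha_bar).sqrt() * eps
# _predict_xstart_from_eps: x_0 = x_t / sqrt(a_bar) - sqrt(1/a_bar - 1) * eps
x0_rec = x_t / alpha_bar.sqrt() - (1 / alpha_bar - 1).sqrt() * eps
assert torch.allclose(x0_rec, x0, atol=1e-4)
```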
- def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
- """
- Compute the mean for the previous step, given a function cond_fn that
- computes the gradient of a conditional log probability with respect to
- x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
- condition on y.
- This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
- """
- gradient = cond_fn(x, t, **model_kwargs)
- new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
- return new_mean
-
- def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
- """
- Compute what the p_mean_variance output would have been, had the
- model's score function been conditioned by cond_fn.
- See condition_mean() for details on cond_fn.
- Unlike condition_mean(), this instead uses the conditioning strategy
- from Song et al (2020).
- """
- alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
-
- eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
- eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
-
- out = p_mean_var.copy()
- out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
- out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
- return out
-
- def p_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- ):
- """
- Sample x_{t-1} from the model at the given timestep.
- :param model: the model to sample from.
- :param x: the current tensor at timestep t (i.e. x_t).
- :param t: the value of t, starting at 0 for the first diffusion step.
- :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample.
- :param cond_fn: if not None, this is a gradient function that acts
- similarly to the model.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict containing the following keys:
- - 'sample': a random sample from the model.
- - 'pred_xstart': a prediction of x_0.
- """
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- noise = th.randn_like(x)
- nonzero_mask = (
- (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
- ) # no noise when t == 0
- if cond_fn is not None:
- out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
- sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
- return {"sample": sample, "pred_xstart": out["pred_xstart"]}
-
- def p_sample_loop(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- ):
- """
- Generate samples from the model.
- :param model: the model module.
- :param shape: the shape of the samples, e.g. (N, C, F, H, W) for video latents.
- :param noise: if specified, the noise from the encoder to sample.
- Should be of the same shape as `shape`.
- :param clip_denoised: if True, clip x_start predictions to [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample.
- :param cond_fn: if not None, this is a gradient function that acts
- similarly to the model.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :param device: if specified, the device to create the samples on.
- If not specified, use a model parameter's device.
- :param progress: if True, show a tqdm progress bar.
- :return: a non-differentiable batch of samples.
- """
- final = None
- for sample in self.p_sample_loop_progressive(
- model,
- shape,
- noise=noise,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- device=device,
- progress=progress,
- ):
- final = sample
- return final["sample"]
-
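The loop below is a self-contained sketch of the ancestral sampling that `p_sample_loop` performs, with a dummy model that predicts zero noise and the fixed-small posterior variance; the shapes and beta schedule are illustrative, not this repo's API:

```python
import torch

T = 50
betas = torch.linspace(1e-4, 2e-2, T)
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)

x = torch.randn(2, 4, 8, 8)                        # start from pure noise
for t in reversed(range(T)):
    eps = torch.zeros_like(x)                      # stand-in for model(x, t)
    ab, a, b = alphas_cumprod[t], alphas[t], betas[t]
    ab_prev = alphas_cumprod[t - 1] if t > 0 else torch.tensor(1.0)
    x0 = ((x - (1 - ab).sqrt() * eps) / ab.sqrt()).clamp(-1, 1)  # clip_denoised
    mean = (ab_prev.sqrt() * b / (1 - ab)) * x0 \
         + (a.sqrt() * (1 - ab_prev) / (1 - ab)) * x
    if t > 0:                                      # no noise when t == 0
        var = b * (1 - ab_prev) / (1 - ab)         # q posterior variance
        x = mean + var.sqrt() * torch.randn_like(x)
    else:
        x = mean
```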
- def p_sample_loop_progressive(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- ):
- """
- Generate samples from the model and yield intermediate samples from
- each timestep of diffusion.
- Arguments are the same as p_sample_loop().
- Returns a generator over dicts, where each dict is the return value of
- p_sample().
- """
- if device is None:
- device = next(model.parameters()).device
- assert isinstance(shape, (tuple, list))
- if noise is not None:
- img = noise
- else:
- img = th.randn(*shape, device=device)
- indices = list(range(self.num_timesteps))[::-1]
-
- if progress:
- # Lazy import so that we don't depend on tqdm.
- from tqdm.auto import tqdm
-
- indices = tqdm(indices)
-
- for i in indices:
- t = th.tensor([i] * shape[0], device=device)
- with th.no_grad():
- out = self.p_sample(
- model,
- img,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- )
- yield out
- img = out["sample"]
-
- def ddim_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- eta=0.0,
- ):
- """
- Sample x_{t-1} from the model using DDIM.
- Same usage as p_sample().
- """
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- if cond_fn is not None:
- out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
-
- # Usually our model outputs epsilon, but we re-derive it
- # in case we used x_start or x_prev prediction.
- eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
-
- alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
- alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
- sigma = (
- eta
- * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
- * th.sqrt(1 - alpha_bar / alpha_bar_prev)
- )
- # Equation 12.
- noise = th.randn_like(x)
- mean_pred = (
- out["pred_xstart"] * th.sqrt(alpha_bar_prev)
- + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
- )
- nonzero_mask = (
- (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
- ) # no noise when t == 0
- sample = mean_pred + nonzero_mask * sigma * noise
- return {"sample": sample, "pred_xstart": out["pred_xstart"]}
-
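A compact sketch of the DDIM update (Equation 12 in Song et al., 2020) computed above; with `eta = 0` the step is fully deterministic. The scalar values are illustrative:

```python
import torch

alpha_bar, alpha_bar_prev = torch.tensor(0.5), torch.tensor(0.6)
eta = 0.0
x = torch.randn(4)
eps = torch.zeros_like(x)                                  # stand-in for the model's eps
x0 = (x - (1 - alpha_bar).sqrt() * eps) / alpha_bar.sqrt()
sigma = (eta * ((1 - alpha_bar_prev) / (1 - alpha_bar)).sqrt()
             * (1 - alpha_bar / alpha_bar_prev).sqrt())
x_prev = alpha_bar_prev.sqrt() * x0 + (1 - alpha_bar_prev - sigma**2).sqrt() * eps
```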
- def ddim_reverse_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- eta=0.0,
- ):
- """
- Sample x_{t+1} from the model using DDIM reverse ODE.
- """
- assert eta == 0.0, "Reverse ODE only for deterministic path"
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- if cond_fn is not None:
- out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
- # Usually our model outputs epsilon, but we re-derive it
- # in case we used x_start or x_prev prediction.
- eps = (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
- - out["pred_xstart"]
- ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
- alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
-
- # Equation 12. reversed
- mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
-
- return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
-
- def ddim_sample_loop(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- eta=0.0,
- ):
- """
- Generate samples from the model using DDIM.
- Same usage as p_sample_loop().
- """
- final = None
- for sample in self.ddim_sample_loop_progressive(
- model,
- shape,
- noise=noise,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- device=device,
- progress=progress,
- eta=eta,
- ):
- final = sample
- return final["sample"]
-
- def ddim_sample_loop_progressive(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- eta=0.0,
- ):
- """
- Use DDIM to sample from the model and yield intermediate samples from
- each timestep of DDIM.
- Same usage as p_sample_loop_progressive().
- """
- if device is None:
- device = next(model.parameters()).device
- assert isinstance(shape, (tuple, list))
- if noise is not None:
- img = noise
- else:
- img = th.randn(*shape, device=device)
- indices = list(range(self.num_timesteps))[::-1]
-
- if progress:
- # Lazy import so that we don't depend on tqdm.
- from tqdm.auto import tqdm
-
- indices = tqdm(indices)
-
- for i in indices:
- t = th.tensor([i] * shape[0], device=device)
- with th.no_grad():
- out = self.ddim_sample(
- model,
- img,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- eta=eta,
- )
- yield out
- img = out["sample"]
-
- def _vb_terms_bpd(
- self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None, mask=1.0,
- ):
- """
- Get a term for the variational lower-bound.
- The resulting units are bits (rather than nats, as one might expect).
- This allows for comparison to other papers.
- :return: a dict with the following keys:
- - 'output': a shape [N] tensor of NLLs or KLs.
- - 'pred_xstart': the x_0 predictions.
- """
- true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
- x_start=x_start, x_t=x_t, t=t
- )
- out = self.p_mean_variance(
- model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
- )
- kl = normal_kl(
- true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
- )
- kl = mean_flat(kl * mask) / np.log(2.0)
-
- decoder_nll = -discretized_gaussian_log_likelihood(
- x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
- )
- assert decoder_nll.shape == x_start.shape
- decoder_nll = mean_flat(decoder_nll * mask) / np.log(2.0)
-
- # At the first timestep return the decoder NLL,
- # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
- output = th.where((t == 0), decoder_nll, kl)
- return {"output": output, "pred_xstart": out["pred_xstart"]}
-
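The repeated division by `np.log(2.0)` is just the nats-to-bits conversion, which is what makes the bound comparable across papers in bits-per-dim; for example:

```python
import numpy as np

kl_nats = np.log(4.0)            # a KL of ln(4) nats
kl_bits = kl_nats / np.log(2.0)  # = 2.0 bits
```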
- def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
- """
- Compute training losses for a single timestep.
- :param model: the model to evaluate loss on.
- :param x_start: the [N x C x ...] tensor of inputs.
- :param t: a batch of timestep indices.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :param noise: if specified, the specific Gaussian noise to try to remove.
- :return: a dict with the key "loss" containing a tensor of shape [N].
- Some mean or variance settings may also have other keys.
- """
- if model_kwargs is None:
- model_kwargs = {}
- mask = 1.0
- else:
- mask = model_kwargs['attention_mask'].unsqueeze(1) # b t h w -> b 1 t h w
-
- if noise is None:
- noise = th.randn_like(x_start)
- x_t = self.q_sample(x_start, t, noise=noise)
-
- terms = {}
- if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
- terms["loss"] = self._vb_terms_bpd(
- model=model,
- x_start=x_start,
- x_t=x_t,
- t=t,
- clip_denoised=False,
- model_kwargs=model_kwargs,
- mask=mask,
- )["output"]
- if self.loss_type == LossType.RESCALED_KL:
- terms["loss"] *= self.num_timesteps
- elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
- model_output = model(x_t, t, **model_kwargs)
- # try:
- # model_output = model(x_t, t, **model_kwargs).sample # for tav unet
- # except:
- # model_output = model(x_t, t, **model_kwargs)
-
- if self.model_var_type in [
- ModelVarType.LEARNED,
- ModelVarType.LEARNED_RANGE,
- ]:
- #B, F, C = x_t.shape[:3]
- #assert model_output.shape == (B, F, C * 2, *x_t.shape[3:])
- # the output shape of the unconditional / class-conditional Latte differs from latte_t2v:
- # BFCHW vs. BCFHW
- B, C, F = x_t.shape[:3]
- assert model_output[0].shape == (B, C * 2, F, *x_t.shape[3:])
- #model_output, model_var_values = th.split(model_output, C, dim=2)
- model_output, model_var_values = th.split(model_output[0], C, dim=1)
-
- # Learn the variance using the variational bound, but don't let
- # it affect our mean prediction.
- #frozen_out = th.cat([model_output.detach(), model_var_values], dim=2)
- frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
- terms["vb"] = self._vb_terms_bpd(
- model=lambda *args, r=frozen_out: r,
- x_start=x_start,
- x_t=x_t,
- t=t,
- clip_denoised=False,
- mask=mask
- )["output"]
- if self.loss_type == LossType.RESCALED_MSE:
- # Divide by 1000 for equivalence with initial implementation.
- # Without a factor of 1/1000, the VB term hurts the MSE term.
- terms["vb"] *= self.num_timesteps / 1000.0
-
- target = {
- ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
- x_start=x_start, x_t=x_t, t=t
- )[0],
- ModelMeanType.START_X: x_start,
- ModelMeanType.EPSILON: noise,
- }[self.model_mean_type]
- assert model_output.shape == target.shape == x_start.shape
- terms["mse"] = mean_flat(((target - model_output) ** 2) * mask)
- if "vb" in terms:
- terms["loss"] = terms["mse"] + terms["vb"]
- else:
- terms["loss"] = terms["mse"]
- else:
- raise NotImplementedError(self.loss_type)
-
- return terms
-
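A stand-alone sketch of the MSE branch above: draw x_t ~ q(x_t | x_0), run a (here dummy) epsilon-prediction model, and regress against the injected noise. Shapes and the schedule are illustrative:

```python
import torch

T = 1000
betas = torch.linspace(1e-4, 2e-2, T)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x_start = torch.randn(2, 4, 8, 8)
t = torch.randint(0, T, (2,))
noise = torch.randn_like(x_start)
ab = alphas_cumprod[t].view(-1, 1, 1, 1)
x_t = ab.sqrt() * x_start + (1 - ab).sqrt() * noise       # q_sample
model_output = torch.zeros_like(x_t)                      # stand-in for model(x_t, t)
mse = ((noise - model_output) ** 2).mean(dim=(1, 2, 3))   # mean_flat per sample
loss = mse.mean()
```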
- def _prior_bpd(self, x_start):
- """
- Get the prior KL term for the variational lower-bound, measured in
- bits-per-dim.
- This term can't be optimized, as it only depends on the encoder.
- :param x_start: the [N x C x ...] tensor of inputs.
- :return: a batch of [N] KL values (in bits), one per batch element.
- """
- batch_size = x_start.shape[0]
- t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
- qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
- kl_prior = normal_kl(
- mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
- )
- return mean_flat(kl_prior) / np.log(2.0)
-
- def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
- """
- Compute the entire variational lower-bound, measured in bits-per-dim,
- as well as other related quantities.
- :param model: the model to evaluate loss on.
- :param x_start: the [N x C x ...] tensor of inputs.
- :param clip_denoised: if True, clip denoised samples.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict containing the following keys:
- - total_bpd: the total variational lower-bound, per batch element.
- - prior_bpd: the prior term in the lower-bound.
- - vb: an [N x T] tensor of terms in the lower-bound.
- - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
- - mse: an [N x T] tensor of epsilon MSEs for each timestep.
- """
- device = x_start.device
- batch_size = x_start.shape[0]
-
- vb = []
- xstart_mse = []
- mse = []
- for t in list(range(self.num_timesteps))[::-1]:
- t_batch = th.tensor([t] * batch_size, device=device)
- noise = th.randn_like(x_start)
- x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
- # Calculate VLB term at the current timestep
- with th.no_grad():
- out = self._vb_terms_bpd(
- model,
- x_start=x_start,
- x_t=x_t,
- t=t_batch,
- clip_denoised=clip_denoised,
- model_kwargs=model_kwargs,
- )
- vb.append(out["output"])
- xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
- eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
- mse.append(mean_flat((eps - noise) ** 2))
-
- vb = th.stack(vb, dim=1)
- xstart_mse = th.stack(xstart_mse, dim=1)
- mse = th.stack(mse, dim=1)
-
- prior_bpd = self._prior_bpd(x_start)
- total_bpd = vb.sum(dim=1) + prior_bpd
- return {
- "total_bpd": total_bpd,
- "prior_bpd": prior_bpd,
- "vb": vb,
- "xstart_mse": xstart_mse,
- "mse": mse,
- }
-
-
-def _extract_into_tensor(arr, timesteps, broadcast_shape):
- """
- Extract values from a 1-D numpy array for a batch of indices.
- :param arr: the 1-D numpy array.
- :param timesteps: a tensor of indices into the array to extract.
- :param broadcast_shape: a larger shape of K dimensions with the batch
- dimension equal to the length of timesteps.
- :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
- """
- res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
- while len(res.shape) < len(broadcast_shape):
- res = res[..., None]
- return res + th.zeros(broadcast_shape, device=timesteps.device)
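What the helper above produces, written inline for a 5-D video latent (schedule and shapes are illustrative): gather one scalar per batch timestep and broadcast it over (N, C, F, H, W).

```python
import numpy as np
import torch as th

alphas_cumprod = np.linspace(1.0, 0.01, 1000)             # illustrative schedule
timesteps = th.tensor([0, 499, 999])
vals = th.from_numpy(alphas_cumprod)[timesteps].float()   # shape (3,)
vals = vals.view(-1, 1, 1, 1, 1)                          # pad to 5 dims
coeff = vals.expand(3, 4, 16, 32, 32)                     # broadcast over (N, C, F, H, W)
```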
diff --git a/opensora/models/diffusion/diffusion/respace.py b/opensora/models/diffusion/diffusion/respace.py
deleted file mode 100644
index aed6ed77f..000000000
--- a/opensora/models/diffusion/diffusion/respace.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# Modified from OpenAI's diffusion repos
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-import torch
-import numpy as np
-import torch as th
-
-from .gaussian_diffusion import GaussianDiffusion
-from .gaussian_diffusion_t2v import GaussianDiffusion_T
-
-
-def space_timesteps(num_timesteps, section_counts):
- """
- Create a list of timesteps to use from an original diffusion process,
- given the number of timesteps we want to take from equally-sized portions
- of the original process.
- For example, if there are 300 timesteps and the section counts are [10,15,20],
- then the first 100 timesteps are strided down to 10 timesteps, the second 100
- are strided down to 15 timesteps, and the final 100 are strided down to 20.
- If the stride is a string starting with "ddim", then the fixed striding
- from the DDIM paper is used, and only one section is allowed.
- :param num_timesteps: the number of diffusion steps in the original
- process to divide up.
- :param section_counts: either a list of numbers, or a string containing
- comma-separated numbers, indicating the step count
- per section. As a special case, use "ddimN" where N
- is a number of steps to use the striding from the
- DDIM paper.
- :return: a set of diffusion steps from the original process to use.
- """
- if isinstance(section_counts, str):
- if section_counts.startswith("ddim"):
- desired_count = int(section_counts[len("ddim") :])
- for i in range(1, num_timesteps):
- if len(range(0, num_timesteps, i)) == desired_count:
- return set(range(0, num_timesteps, i))
- raise ValueError(
- f"cannot create exactly {num_timesteps} steps with an integer stride"
- )
- section_counts = [int(x) for x in section_counts.split(",")]
- size_per = num_timesteps // len(section_counts)
- extra = num_timesteps % len(section_counts)
- start_idx = 0
- all_steps = []
- for i, section_count in enumerate(section_counts):
- size = size_per + (1 if i < extra else 0)
- if size < section_count:
- raise ValueError(
- f"cannot divide section of {size} steps into {section_count}"
- )
- if section_count <= 1:
- frac_stride = 1
- else:
- frac_stride = (size - 1) / (section_count - 1)
- cur_idx = 0.0
- taken_steps = []
- for _ in range(section_count):
- taken_steps.append(start_idx + round(cur_idx))
- cur_idx += frac_stride
- all_steps += taken_steps
- start_idx += size
- return set(all_steps)
-
-
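Two worked examples of `space_timesteps`, assuming the function above is in scope:

```python
# "ddimN" striding: for 1000 original steps, "ddim50" finds stride 20 and
# keeps {0, 20, 40, ..., 980} (50 steps in total).
steps = space_timesteps(1000, "ddim50")
assert sorted(steps)[:3] == [0, 20, 40] and len(steps) == 50

# List form: 300 steps split into three 100-step sections, strided down to
# 10, 15 and 20 retained steps respectively (45 in total).
assert len(space_timesteps(300, [10, 15, 20])) == 45
```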
-class SpacedDiffusion(GaussianDiffusion):
- """
- A diffusion process which can skip steps in a base diffusion process.
- :param use_timesteps: a collection (sequence or set) of timesteps from the
- original diffusion process to retain.
- :param kwargs: the kwargs to create the base diffusion process.
- """
-
- def __init__(self, use_timesteps, **kwargs):
- self.use_timesteps = set(use_timesteps)
- self.timestep_map = []
- self.original_num_steps = len(kwargs["betas"])
-
- base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
- last_alpha_cumprod = 1.0
- new_betas = []
- for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
- if i in self.use_timesteps:
- new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
- last_alpha_cumprod = alpha_cumprod
- self.timestep_map.append(i)
- kwargs["betas"] = np.array(new_betas)
- super().__init__(**kwargs)
-
- def p_mean_variance(
- self, model, *args, **kwargs
- ): # pylint: disable=signature-differs
- return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
-
- # @torch.compile
- def training_losses(
- self, model, *args, **kwargs
- ): # pylint: disable=signature-differs
- return super().training_losses(self._wrap_model(model), *args, **kwargs)
-
- def condition_mean(self, cond_fn, *args, **kwargs):
- return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
-
- def condition_score(self, cond_fn, *args, **kwargs):
- return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
-
- def _wrap_model(self, model):
- if isinstance(model, _WrappedModel):
- return model
- return _WrappedModel(
- model, self.timestep_map, self.original_num_steps
- )
-
- def _scale_timesteps(self, t):
- # Scaling is done by the wrapped model.
- return t
-
-
-class _WrappedModel:
- def __init__(self, model, timestep_map, original_num_steps):
- self.model = model
- self.timestep_map = timestep_map
- # self.rescale_timesteps = rescale_timesteps
- self.original_num_steps = original_num_steps
-
- def __call__(self, x, ts, **kwargs):
- map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
- new_ts = map_tensor[ts]
- # if self.rescale_timesteps:
- # new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
- return self.model(x, new_ts, **kwargs)
-
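The effect of `_WrappedModel` on timesteps, with a hypothetical 5-step map: the spaced process counts 0..4, and `map_tensor` translates those indices back to the original steps the network was trained on:

```python
import torch as th

timestep_map = [0, 250, 500, 750, 999]   # hypothetical retained steps
ts = th.tensor([0, 2, 4])                # timesteps in the spaced process
map_tensor = th.tensor(timestep_map, device=ts.device, dtype=ts.dtype)
new_ts = map_tensor[ts]                  # tensor([  0, 500, 999])
```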
-class SpacedDiffusion_T(GaussianDiffusion_T):
- """
- A diffusion process which can skip steps in a base diffusion process.
- :param use_timesteps: a collection (sequence or set) of timesteps from the
- original diffusion process to retain.
- :param kwargs: the kwargs to create the base diffusion process.
- """
-
- def __init__(self, use_timesteps, **kwargs):
- self.use_timesteps = set(use_timesteps)
- self.timestep_map = []
- self.original_num_steps = len(kwargs["betas"])
-
- base_diffusion = GaussianDiffusion_T(**kwargs) # pylint: disable=missing-kwoa
- last_alpha_cumprod = 1.0
- new_betas = []
- for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
- if i in self.use_timesteps:
- new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
- last_alpha_cumprod = alpha_cumprod
- self.timestep_map.append(i)
- kwargs["betas"] = np.array(new_betas)
- super().__init__(**kwargs)
-
- def p_mean_variance(
- self, model, *args, **kwargs
- ): # pylint: disable=signature-differs
- return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
-
- # @torch.compile
- def training_losses(
- self, model, *args, **kwargs
- ): # pylint: disable=signature-differs
- return super().training_losses(self._wrap_model(model), *args, **kwargs)
-
- def condition_mean(self, cond_fn, *args, **kwargs):
- return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
-
- def condition_score(self, cond_fn, *args, **kwargs):
- return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
-
- def _wrap_model(self, model):
- if isinstance(model, _WrappedModel):
- return model
- return _WrappedModel(
- model, self.timestep_map, self.original_num_steps
- )
-
- def _scale_timesteps(self, t):
- # Scaling is done by the wrapped model.
- return t
-
-
- # SpacedDiffusion_T reuses the _WrappedModel helper defined above.
\ No newline at end of file
diff --git a/opensora/models/diffusion/diffusion/timestep_sampler.py b/opensora/models/diffusion/diffusion/timestep_sampler.py
deleted file mode 100644
index a3f369847..000000000
--- a/opensora/models/diffusion/diffusion/timestep_sampler.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Modified from OpenAI's diffusion repos
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-
-from abc import ABC, abstractmethod
-
-import numpy as np
-import torch as th
-import torch.distributed as dist
-
-
-def create_named_schedule_sampler(name, diffusion):
- """
- Create a ScheduleSampler from a library of pre-defined samplers.
- :param name: the name of the sampler.
- :param diffusion: the diffusion object to sample for.
- """
- if name == "uniform":
- return UniformSampler(diffusion)
- elif name == "loss-second-moment":
- return LossSecondMomentResampler(diffusion)
- else:
- raise NotImplementedError(f"unknown schedule sampler: {name}")
-
-
-class ScheduleSampler(ABC):
- """
- A distribution over timesteps in the diffusion process, intended to reduce
- variance of the objective.
- By default, samplers perform unbiased importance sampling, in which the
- objective's mean is unchanged.
- However, subclasses may override sample() to change how the resampled
- terms are reweighted, allowing for actual changes in the objective.
- """
-
- @abstractmethod
- def weights(self):
- """
- Get a numpy array of weights, one per diffusion step.
- The weights needn't be normalized, but must be positive.
- """
-
- def sample(self, batch_size, device):
- """
- Importance-sample timesteps for a batch.
- :param batch_size: the number of timesteps to sample (one per batch element).
- :param device: the torch device to place the tensors on.
- :return: a tuple (timesteps, weights):
- - timesteps: a tensor of timestep indices.
- - weights: a tensor of weights to scale the resulting losses.
- """
- w = self.weights()
- p = w / np.sum(w)
- indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
- indices = th.from_numpy(indices_np).long().to(device)
- weights_np = 1 / (len(p) * p[indices_np])
- weights = th.from_numpy(weights_np).float().to(device)
- return indices, weights
-
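Why the `1 / (len(p) * p)` weights keep the objective unbiased: for any positive sampling distribution p over T timesteps, E_p[loss(t) / (T * p(t))] equals the uniform mean of the per-timestep losses. A quick Monte-Carlo check:

```python
import numpy as np

rng = np.random.default_rng(0)
T = 10
loss = rng.random(T)                         # per-timestep losses
p = rng.random(T); p /= p.sum()              # arbitrary positive sampling dist
idx = rng.choice(T, size=200_000, p=p)
w = 1.0 / (T * p[idx])
print((w * loss[idx]).mean(), loss.mean())   # the two means agree closely
```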
-
-class UniformSampler(ScheduleSampler):
- def __init__(self, diffusion):
- self.diffusion = diffusion
- self._weights = np.ones([diffusion.num_timesteps])
-
- def weights(self):
- return self._weights
-
-
-class LossAwareSampler(ScheduleSampler):
- def update_with_local_losses(self, local_ts, local_losses):
- """
- Update the reweighting using losses from a model.
- Call this method from each rank with a batch of timesteps and the
- corresponding losses for each of those timesteps.
- This method will perform synchronization to make sure all of the ranks
- maintain the exact same reweighting.
- :param local_ts: an integer Tensor of timesteps.
- :param local_losses: a 1D Tensor of losses.
- """
- batch_sizes = [
- th.tensor([0], dtype=th.int32, device=local_ts.device)
- for _ in range(dist.get_world_size())
- ]
- dist.all_gather(
- batch_sizes,
- th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
- )
-
- # Pad all_gather batches to be the maximum batch size.
- batch_sizes = [x.item() for x in batch_sizes]
- max_bs = max(batch_sizes)
-
- timestep_batches = [th.zeros(max_bs).to(local_ts) for _ in batch_sizes]
- loss_batches = [th.zeros(max_bs).to(local_losses) for _ in batch_sizes]
- dist.all_gather(timestep_batches, local_ts)
- dist.all_gather(loss_batches, local_losses)
- timesteps = [
- x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
- ]
- losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
- self.update_with_all_losses(timesteps, losses)
-
- @abstractmethod
- def update_with_all_losses(self, ts, losses):
- """
- Update the reweighting using losses from a model.
- Sub-classes should override this method to update the reweighting
- using losses from the model.
- This method directly updates the reweighting without synchronizing
- between workers. It is called by update_with_local_losses from all
- ranks with identical arguments. Thus, it should have deterministic
- behavior to maintain state across workers.
- :param ts: a list of int timesteps.
- :param losses: a list of float losses, one per timestep.
- """
-
-
-class LossSecondMomentResampler(LossAwareSampler):
- def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
- self.diffusion = diffusion
- self.history_per_term = history_per_term
- self.uniform_prob = uniform_prob
- self._loss_history = np.zeros(
- [diffusion.num_timesteps, history_per_term], dtype=np.float64
- )
- self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)  # np.int is removed in NumPy >= 1.24
-
- def weights(self):
- if not self._warmed_up():
- return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
- weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
- weights /= np.sum(weights)
- weights *= 1 - self.uniform_prob
- weights += self.uniform_prob / len(weights)
- return weights
-
- def update_with_all_losses(self, ts, losses):
- for t, loss in zip(ts, losses):
- if self._loss_counts[t] == self.history_per_term:
- # Shift out the oldest loss term.
- self._loss_history[t, :-1] = self._loss_history[t, 1:]
- self._loss_history[t, -1] = loss
- else:
- self._loss_history[t, self._loss_counts[t]] = loss
- self._loss_counts[t] += 1
-
- def _warmed_up(self):
- return (self._loss_counts == self.history_per_term).all()
diff --git a/opensora/models/diffusion/latte/modeling_latte.py b/opensora/models/diffusion/latte/modeling_latte.py
deleted file mode 100644
index 340acb6dc..000000000
--- a/opensora/models/diffusion/latte/modeling_latte.py
+++ /dev/null
@@ -1,674 +0,0 @@
-import torch
-
-import os
-import json
-
-from dataclasses import dataclass
-from einops import rearrange, repeat
-from typing import Any, Dict, Optional, Tuple
-from diffusers.models import Transformer2DModel
-from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate
-from diffusers.models.embeddings import get_1d_sincos_pos_embed_from_grid, ImagePositionalEmbeddings
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
-from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from opensora.models.diffusion.utils.pos_embed import get_1d_sincos_pos_embed, PositionGetter1D, PositionGetter2D
-from opensora.models.diffusion.latte.modules import PatchEmbed, BasicTransformerBlock, BasicTransformerBlock_, AdaLayerNormSingle, \
- Transformer3DModelOutput, CaptionProjection
-
-
-class LatteT2V(ModelMixin, ConfigMixin):
- _supports_gradient_checkpointing = True
-
- """
- A spatio-temporal Transformer model for video-like data (Latte T2V).
-
- Parameters:
- num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
- attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
- in_channels (`int`, *optional*):
- The number of channels in the input and output (specify if the input is **continuous**).
- num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
- cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
- sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
- This is fixed during training since it is used to learn a number of position embeddings.
- num_vector_embeds (`int`, *optional*):
- The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
- Includes the class for the masked latent pixel.
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
- num_embeds_ada_norm ( `int`, *optional*):
- The number of diffusion steps used during training. Pass if at least one of the norm_layers is
- `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
- added to the hidden states.
-
- During inference, you can denoise for up to, but not more than, `num_embeds_ada_norm` steps.
- attention_bias (`bool`, *optional*):
- Configure if the `TransformerBlocks` attention should contain a bias parameter.
- """
-
- @register_to_config
- def __init__(
- self,
- num_attention_heads: int = 16,
- patch_size_t: int = 1,
- attention_head_dim: int = 88,
- in_channels: Optional[int] = None,
- out_channels: Optional[int] = None,
- num_layers: int = 1,
- dropout: float = 0.0,
- norm_num_groups: int = 32,
- cross_attention_dim: Optional[int] = None,
- attention_bias: bool = False,
- sample_size: Optional[int] = None,
- num_vector_embeds: Optional[int] = None,
- patch_size: Optional[int] = None,
- activation_fn: str = "geglu",
- num_embeds_ada_norm: Optional[int] = None,
- use_linear_projection: bool = False,
- only_cross_attention: bool = False,
- double_self_attention: bool = False,
- upcast_attention: bool = False,
- norm_type: str = "layer_norm",
- norm_elementwise_affine: bool = True,
- norm_eps: float = 1e-5,
- attention_type: str = "default",
- caption_channels: Optional[int] = None,
- video_length: int = 16,
- attention_mode: str = 'flash',
- use_rope: bool = False,
- model_max_length: int = 300,
- rope_scaling_type: str = 'linear',
- compress_kv_factor: int = 1,
- ):
- super().__init__()
- self.use_linear_projection = use_linear_projection
- self.num_attention_heads = num_attention_heads
- self.attention_head_dim = attention_head_dim
- inner_dim = num_attention_heads * attention_head_dim
- self.video_length = video_length
- self.use_rope = use_rope
- self.model_max_length = model_max_length
- self.compress_kv_factor = compress_kv_factor
- self.num_layers = num_layers
- self.config.hidden_size = model_max_length
-
- assert not (self.compress_kv_factor != 1 and use_rope), "Cannot enable kv compression and RoPE at the same time"
-
- conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
- linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
-
- # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
- # Define whether input is continuous or discrete depending on configuration
- self.is_input_continuous = (in_channels is not None) and (patch_size is None)
- self.is_input_vectorized = num_vector_embeds is not None
- # self.is_input_patches = in_channels is not None and patch_size is not None
- self.is_input_patches = True
-
- if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
- deprecation_message = (
- f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
- " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
- " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
- " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
- " would be very nice if you could open a Pull request for the `transformer/config.json` file"
- )
- deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
- norm_type = "ada_norm"
-
- # 2. Define input layers
- assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
-
- self.height = sample_size[0]
- self.width = sample_size[1]
-
- self.patch_size = patch_size
- interpolation_scale_2d = self.config.sample_size[0] // 64 # => 64 (= 512 pixart) has interpolation scale 1
- interpolation_scale_2d = max(interpolation_scale_2d, 1)
- self.pos_embed = PatchEmbed(
- height=sample_size[0],
- width=sample_size[1],
- patch_size=patch_size,
- in_channels=in_channels,
- embed_dim=inner_dim,
- interpolation_scale=interpolation_scale_2d,
- )
-
-
- # define temporal positional embedding
- if self.config.video_length % 2 == 1:
- interpolation_scale_1d = (self.config.video_length - 1) // 16 # => 16 (= 16 Latte) has interpolation scale 1
- else:
- interpolation_scale_1d = self.config.video_length // 16 # => 16 (= 16 Latte) has interpolation scale 1
- # interpolation_scale_1d = self.config.video_length // 5 #
- interpolation_scale_1d = max(interpolation_scale_1d, 1)
- temp_pos_embed = get_1d_sincos_pos_embed(inner_dim, video_length, interpolation_scale=interpolation_scale_1d) # 1152 hidden size
- self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
-
- rope_scaling = None
- if self.use_rope:
- self.position_getter_2d = PositionGetter2D()
- self.position_getter_1d = PositionGetter1D()
- rope_scaling = dict(type=rope_scaling_type, factor_2d=interpolation_scale_2d, factor_1d=interpolation_scale_1d)
-
- # 3. Define transformers blocks, spatial attention
- self.transformer_blocks = nn.ModuleList(
- [
- BasicTransformerBlock(
- inner_dim,
- num_attention_heads,
- attention_head_dim,
- dropout=dropout,
- cross_attention_dim=cross_attention_dim,
- activation_fn=activation_fn,
- num_embeds_ada_norm=num_embeds_ada_norm,
- attention_bias=attention_bias,
- only_cross_attention=only_cross_attention,
- double_self_attention=double_self_attention,
- upcast_attention=upcast_attention,
- norm_type=norm_type,
- norm_elementwise_affine=norm_elementwise_affine,
- norm_eps=norm_eps,
- attention_type=attention_type,
- attention_mode=attention_mode,
- use_rope=use_rope,
- rope_scaling=rope_scaling,
- compress_kv_factor=(compress_kv_factor, compress_kv_factor) if d >= num_layers // 2 and compress_kv_factor != 1 else None, # follow pixart-sigma, apply in second-half layers
- )
- for d in range(num_layers)
- ]
- )
-
- # Define temporal transformers blocks
- self.temporal_transformer_blocks = nn.ModuleList(
- [
- BasicTransformerBlock_( # one attention
- inner_dim,
- num_attention_heads, # num_attention_heads
- attention_head_dim, # attention_head_dim 72
- dropout=dropout,
- cross_attention_dim=None,
- activation_fn=activation_fn,
- num_embeds_ada_norm=num_embeds_ada_norm,
- attention_bias=attention_bias,
- only_cross_attention=only_cross_attention,
- double_self_attention=False,
- upcast_attention=upcast_attention,
- norm_type=norm_type,
- norm_elementwise_affine=norm_elementwise_affine,
- norm_eps=norm_eps,
- attention_type=attention_type,
- attention_mode=attention_mode,
- use_rope=use_rope,
- rope_scaling=rope_scaling,
- compress_kv_factor=(compress_kv_factor, ) if d >= num_layers // 2 and compress_kv_factor != 1 else None, # follow pixart-sigma, apply in second-half layers
- )
- for d in range(num_layers)
- ]
- )
-
- # 4. Define output layers
- self.out_channels = in_channels if out_channels is None else out_channels
- if self.is_input_continuous:
- # TODO: should use out_channels for continuous projections
- if use_linear_projection:
- self.proj_out = linear_cls(inner_dim, in_channels)
- else:
- self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
- elif self.is_input_vectorized:
- self.norm_out = nn.LayerNorm(inner_dim)
- self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
- elif self.is_input_patches and norm_type != "ada_norm_single":
- self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
- self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
- self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
- elif self.is_input_patches and norm_type == "ada_norm_single":
- self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
- self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim ** 0.5)
- self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
-
- # 5. PixArt-Alpha blocks.
- self.adaln_single = None
- self.use_additional_conditions = False
- if norm_type == "ada_norm_single":
- # self.use_additional_conditions = self.config.sample_size[0] == 128 # False, 128 -> 1024
- # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
- # additional conditions until we find better name
- self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
-
- self.caption_projection = None
- if caption_channels is not None:
- self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim)
-
- self.gradient_checkpointing = False
-
- def _set_gradient_checkpointing(self, module, value=False):
- self.gradient_checkpointing = value
-
- def make_position(self, b, t, use_image_num, h, w, device):
- pos_hw = self.position_getter_2d(b*(t+use_image_num), h, w, device) # fake_b = b*(t+use_image_num)
- pos_t = self.position_getter_1d(b*h*w, t, device) # fake_b = b*h*w
- return pos_hw, pos_t
-
- def make_attn_mask(self, attention_mask, frame, dtype):
- attention_mask = rearrange(attention_mask, 'b t h w -> (b t) 1 (h w)')
- # assume that mask is expressed as:
- # (1 = keep, 0 = discard)
- # convert mask into a bias that can be added to attention scores:
- # (keep = +0, discard = -10000.0)
- attention_mask = (1 - attention_mask.to(dtype)) * -10000.0
- attention_mask = attention_mask.to(self.dtype)
- return attention_mask
-
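The keep/discard-to-bias conversion in `make_attn_mask`, in isolation: a 1 becomes 0.0 (no change to the score) and a 0 becomes -10000.0, which vanishes under softmax:

```python
import torch

mask = torch.tensor([[1, 1, 0, 1]])      # (batch, key_tokens); 1 = keep
bias = (1 - mask.float()) * -10000.0     # tensor([[0., 0., -10000., 0.]])
scores = torch.zeros(1, 4) + bias        # added to raw attention scores
probs = scores.softmax(dim=-1)           # ~1/3 on kept tokens, ~0 on the masked one
```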
- def vae_to_diff_mask(self, attention_mask, use_image_num):
- dtype = attention_mask.dtype
- # b, t+use_image_num, h, w, assume t as channel
- # this version does not use 3D patch embedding
- attention_mask = F.max_pool2d(attention_mask, kernel_size=(self.patch_size, self.patch_size), stride=(self.patch_size, self.patch_size))
- attention_mask = attention_mask.bool().to(dtype)
- return attention_mask
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- timestep: Optional[torch.LongTensor] = None,
- encoder_hidden_states: Optional[torch.Tensor] = None,
- added_cond_kwargs: Dict[str, torch.Tensor] = None,
- class_labels: Optional[torch.LongTensor] = None,
- cross_attention_kwargs: Dict[str, Any] = None,
- attention_mask: Optional[torch.Tensor] = None,
- encoder_attention_mask: Optional[torch.Tensor] = None,
- use_image_num: int = 0,
- enable_temporal_attentions: bool = True,
- return_dict: bool = True,
- ):
- """
- The [`Transformer2DModel`] forward method.
-
- Args:
- hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, frame, height, width)` if continuous):
- Input `hidden_states`.
- encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
- Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
- self-attention.
- timestep ( `torch.LongTensor`, *optional*):
- Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
- class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
- Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
- `AdaLayerZeroNorm`.
- cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
- attention_mask ( `torch.Tensor`, *optional*):
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
- negative values to the attention scores corresponding to "discard" tokens.
- encoder_attention_mask ( `torch.Tensor`, *optional*):
- Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
-
- * Mask `(batch, sequence_length)` True = keep, False = discard.
- * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
-
- If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
- above. This bias will be added to the cross-attention scores.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`Transformer3DModelOutput`] instead of a plain
- tuple.
-
- Returns:
- If `return_dict` is True, a [`Transformer3DModelOutput`] is returned, otherwise a
- `tuple` where the first element is the sample tensor.
- """
- input_batch_size, c, frame, h, w = hidden_states.shape
- frame = frame - use_image_num # 20-4=16
- hidden_states = rearrange(hidden_states, 'b c f h w -> (b f) c h w').contiguous()
- # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
- # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
- # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
- # expects mask of shape:
- # [batch, key_tokens]
- # adds singleton query_tokens dimension:
- # [batch, 1, key_tokens]
- # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
- # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
- # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
- if attention_mask is None:
- attention_mask = torch.ones((input_batch_size, frame+use_image_num, h, w), device=hidden_states.device, dtype=hidden_states.dtype)
- attention_mask = self.vae_to_diff_mask(attention_mask, use_image_num)
- dtype = attention_mask.dtype
- attention_mask_compress = F.max_pool2d(attention_mask.float(), kernel_size=self.compress_kv_factor, stride=self.compress_kv_factor)
- attention_mask_compress = attention_mask_compress.to(dtype)
-
- attention_mask = self.make_attn_mask(attention_mask, frame, hidden_states.dtype)
- attention_mask_compress = self.make_attn_mask(attention_mask_compress, frame, hidden_states.dtype)
-
- # 1 + 4, 1 -> video condition, 4 -> image condition
- # convert encoder_attention_mask to a bias the same way we do for attention_mask
- if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: # ndim == 2 means no image joint
- encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
- encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
- encoder_attention_mask = repeat(encoder_attention_mask, 'b 1 l -> (b f) 1 l', f=frame).contiguous()
- encoder_attention_mask = encoder_attention_mask.to(self.dtype)
- elif encoder_attention_mask is not None and encoder_attention_mask.ndim == 3: # ndim == 3 means image joint
- encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
- encoder_attention_mask_video = encoder_attention_mask[:, :1, ...]
- encoder_attention_mask_video = repeat(encoder_attention_mask_video, 'b 1 l -> b (1 f) l',
- f=frame).contiguous()
- encoder_attention_mask_image = encoder_attention_mask[:, 1:, ...]
- encoder_attention_mask = torch.cat([encoder_attention_mask_video, encoder_attention_mask_image], dim=1)
- encoder_attention_mask = rearrange(encoder_attention_mask, 'b n l -> (b n) l').contiguous().unsqueeze(1)
- encoder_attention_mask = encoder_attention_mask.to(self.dtype)
-
- # Retrieve lora scale.
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
- # 1. Input
- if self.is_input_patches: # here
- height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
- hw = (height, width)
- num_patches = height * width
-
- hidden_states = self.pos_embed(hidden_states.to(self.dtype))  # already adds positional embeddings
-
- if self.adaln_single is not None:
- if self.use_additional_conditions and added_cond_kwargs is None:
- raise ValueError(
- "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
- )
- # batch_size = hidden_states.shape[0]
- batch_size = input_batch_size
- timestep, embedded_timestep = self.adaln_single(
- timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
- )
-
- # 2. Blocks
- if self.caption_projection is not None:
- batch_size = hidden_states.shape[0]
- encoder_hidden_states = self.caption_projection(encoder_hidden_states.to(self.dtype)) # 3 120 1152
-
- if use_image_num != 0 and self.training:
- encoder_hidden_states_video = encoder_hidden_states[:, :1, ...]
- encoder_hidden_states_video = repeat(encoder_hidden_states_video, 'b 1 t d -> b (1 f) t d', f=frame).contiguous()
- encoder_hidden_states_image = encoder_hidden_states[:, 1:, ...]
- encoder_hidden_states = torch.cat([encoder_hidden_states_video, encoder_hidden_states_image], dim=1)
- encoder_hidden_states_spatial = rearrange(encoder_hidden_states, 'b f t d -> (b f) t d').contiguous()
- else:
- encoder_hidden_states_spatial = repeat(encoder_hidden_states, 'b t d -> (b f) t d', f=frame).contiguous()
-
- # prepare timesteps for spatial and temporal block
- timestep_spatial = repeat(timestep, 'b d -> (b f) d', f=frame + use_image_num).contiguous()
- timestep_temp = repeat(timestep, 'b d -> (b p) d', p=num_patches).contiguous()
-
- pos_hw, pos_t = None, None
- if self.use_rope:
- pos_hw, pos_t = self.make_position(input_batch_size, frame, use_image_num, height, width, hidden_states.device)
-
- for i, (spatial_block, temp_block) in enumerate(zip(self.transformer_blocks, self.temporal_transformer_blocks)):
-
- if self.training and self.gradient_checkpointing:
- hidden_states = torch.utils.checkpoint.checkpoint(
- spatial_block,
- hidden_states,
- attention_mask_compress if i >= self.num_layers // 2 else attention_mask,
- encoder_hidden_states_spatial,
- encoder_attention_mask,
- timestep_spatial,
- cross_attention_kwargs,
- class_labels,
- pos_hw,
- pos_hw,
- hw,
- use_reentrant=False,
- )
-
- if enable_temporal_attentions:
- hidden_states = rearrange(hidden_states, '(b f) t d -> (b t) f d', b=input_batch_size).contiguous()
-
- if use_image_num != 0:  # image-video joint training
- hidden_states_video = hidden_states[:, :frame, ...]
- hidden_states_image = hidden_states[:, frame:, ...]
-
- # if i == 0 and not self.use_rope:
- if i == 0:
- hidden_states_video = hidden_states_video + self.temp_pos_embed
-
- hidden_states_video = torch.utils.checkpoint.checkpoint(
- temp_block,
- hidden_states_video,
- None, # attention_mask
- None, # encoder_hidden_states
- None, # encoder_attention_mask
- timestep_temp,
- cross_attention_kwargs,
- class_labels,
- pos_t,
- pos_t,
- (frame, ),
- use_reentrant=False,
- )
-
- hidden_states = torch.cat([hidden_states_video, hidden_states_image], dim=1)
- hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d',
- b=input_batch_size).contiguous()
-
- else:
- # if i == 0 and not self.use_rope:
- if i == 0:
- hidden_states = hidden_states + self.temp_pos_embed
-
- hidden_states = torch.utils.checkpoint.checkpoint(
- temp_block,
- hidden_states,
- None, # attention_mask
- None, # encoder_hidden_states
- None, # encoder_attention_mask
- timestep_temp,
- cross_attention_kwargs,
- class_labels,
- pos_t,
- pos_t,
- (frame, ),
- use_reentrant=False,
- )
-
- hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d',
- b=input_batch_size).contiguous()
- else:
- hidden_states = spatial_block(
- hidden_states,
- attention_mask_compress if i >= self.num_layers // 2 else attention_mask,
- encoder_hidden_states_spatial,
- encoder_attention_mask,
- timestep_spatial,
- cross_attention_kwargs,
- class_labels,
- pos_hw,
- pos_hw,
- hw,
- )
-
- if enable_temporal_attentions:
- # b c f h w, f = 16 + 4
- hidden_states = rearrange(hidden_states, '(b f) t d -> (b t) f d', b=input_batch_size).contiguous()
-
- if use_image_num != 0 and self.training:
- hidden_states_video = hidden_states[:, :frame, ...]
- hidden_states_image = hidden_states[:, frame:, ...]
-
- # if i == 0 and not self.use_rope:
- # hidden_states_video = hidden_states_video + self.temp_pos_embed
-
- hidden_states_video = temp_block(
- hidden_states_video,
- None, # attention_mask
- None, # encoder_hidden_states
- None, # encoder_attention_mask
- timestep_temp,
- cross_attention_kwargs,
- class_labels,
- pos_t,
- pos_t,
- (frame, ),
- )
-
- hidden_states = torch.cat([hidden_states_video, hidden_states_image], dim=1)
- hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d',
- b=input_batch_size).contiguous()
-
- else:
- # if i == 0 and not self.use_rope:
- if i == 0:
- hidden_states = hidden_states + self.temp_pos_embed
-
- hidden_states = temp_block(
- hidden_states,
- None, # attention_mask
- None, # encoder_hidden_states
- None, # encoder_attention_mask
- timestep_temp,
- cross_attention_kwargs,
- class_labels,
- pos_t,
- pos_t,
- (frame, ),
- )
-
- hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d',
- b=input_batch_size).contiguous()
-
- if self.is_input_patches:
- if self.config.norm_type != "ada_norm_single":
- conditioning = self.transformer_blocks[0].norm1.emb(
- timestep, class_labels, hidden_dtype=hidden_states.dtype
- )
- shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
- hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
- hidden_states = self.proj_out_2(hidden_states)
- elif self.config.norm_type == "ada_norm_single":
- embedded_timestep = repeat(embedded_timestep, 'b d -> (b f) d', f=frame + use_image_num).contiguous()
- shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
- hidden_states = self.norm_out(hidden_states)
- # Modulation
- hidden_states = hidden_states * (1 + scale) + shift
- hidden_states = self.proj_out(hidden_states)
-
- # unpatchify
- if self.adaln_single is None:
- height = width = int(hidden_states.shape[1] ** 0.5)
- hidden_states = hidden_states.reshape(
- shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
- )
- hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
- output = hidden_states.reshape(
- shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
- )
- output = rearrange(output, '(b f) c h w -> b c f h w', b=input_batch_size).contiguous()
-
- if not return_dict:
- return (output,)
-
- return Transformer3DModelOutput(sample=output)
-
- @classmethod
- def from_pretrained_2d(cls, pretrained_model_path, subfolder=None, **kwargs):
- if subfolder is not None:
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
-
- config_file = os.path.join(pretrained_model_path, 'config.json')
- if not os.path.isfile(config_file):
- raise RuntimeError(f"{config_file} does not exist")
- with open(config_file, "r") as f:
- config = json.load(f)
-
- model = cls.from_config(config, **kwargs)
- return model
-
-# depth = num_layers * 2
-def LatteT2V_XL_122(**kwargs):
- return LatteT2V(num_layers=28, attention_head_dim=72, num_attention_heads=16, patch_size_t=1, patch_size=2,
- norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=1152, **kwargs)
-def LatteT2V_D64_XL_122(**kwargs):
- return LatteT2V(num_layers=28, attention_head_dim=64, num_attention_heads=18, patch_size_t=1, patch_size=2,
- norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=1152, **kwargs)
-
-Latte_models = {
- "LatteT2V-XL/122": LatteT2V_XL_122,
- "LatteT2V-D64-XL/122": LatteT2V_D64_XL_122,
-}
-
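Hypothetical lookup through the registry above; the constructor arguments mirror the `__main__` demo below and are illustrative only:

```python
model_cls = Latte_models["LatteT2V-D64-XL/122"]
model = model_cls(
    in_channels=4, out_channels=8,
    sample_size=(64, 64),            # latent (H, W) grid
    attention_bias=True,
    activation_fn="gelu-approximate",
    video_length=17,
)
```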
-if __name__ == '__main__':
- from opensora.models.ae import ae_channel_config, ae_stride_config
- from opensora.models.ae import getae, getae_wrapper
- from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
-
- args = type('args', (),
- {
- 'ae': 'CausalVAEModel_4x8x8',
- 'attention_mode': 'xformers',
- 'use_rope': False,
- 'model_max_length': 300,
- 'max_image_size': 512,
- 'num_frames': 65,
- 'use_image_num': 16,
- 'compress_kv_factor': 1
- }
- )
- b = 2
- c = 4
- cond_c = 4096
- num_timesteps = 1000
- ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
- latent_size = (args.max_image_size // ae_stride_h, args.max_image_size // ae_stride_w)
-    if getae_wrapper(args.ae) in (CausalVQVAEModelWrapper, CausalVAEModelWrapper):
- args.video_length = video_length = (args.num_frames - 1) // ae_stride_t + 1
- else:
- video_length = args.num_frames // ae_stride_t
-
- device = torch.device('cuda:6')
- model = LatteT2V_D64_XL_122(
- in_channels=ae_channel_config[args.ae],
- out_channels=ae_channel_config[args.ae] * 2,
- # caption_channels=4096,
- # cross_attention_dim=1152,
- attention_bias=True,
- sample_size=latent_size,
- num_vector_embeds=None,
- activation_fn="gelu-approximate",
- num_embeds_ada_norm=1000,
- use_linear_projection=False,
- only_cross_attention=False,
- double_self_attention=False,
- upcast_attention=False,
- # norm_type="ada_norm_single",
- norm_elementwise_affine=False,
- norm_eps=1e-6,
- attention_type='default',
- video_length=video_length,
- attention_mode=args.attention_mode,
- compress_kv_factor=args.compress_kv_factor,
- use_rope=args.use_rope,
- model_max_length=args.model_max_length,
- ).to(device)
- # try:
- # ckpt = torch.load(r"t2v.pt", map_location='cpu')['model']
- # model.load_state_dict(ckpt)
- # except Exception as e:
- # print(e)
- print(model)
-
- x = torch.randn(b, c, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_image_size//ae_stride_h, args.max_image_size//ae_stride_w).to(device)
- cond = torch.randn(b, 1+args.use_image_num, args.model_max_length, cond_c).to(device)
-    attn_mask = torch.randint(0, 2, (b, 1+args.use_image_num, args.max_image_size//ae_stride_h//2, args.max_image_size//ae_stride_w//2)).to(device)  # B 1+num_images H W
- cond_mask = torch.randint(0, 2, (b, 1+args.use_image_num, args.model_max_length)).to(device) # B L or B 1+num_images L
- timestep = torch.randint(0, 1000, (b,), device=device)
- model_kwargs = dict(hidden_states=x, encoder_hidden_states=cond, attention_mask=attn_mask,
- encoder_attention_mask=cond_mask, use_image_num=args.use_image_num, timestep=timestep)
- with torch.no_grad():
- output = model(**model_kwargs)
- # print(output)
\ No newline at end of file
diff --git a/opensora/models/diffusion/latte/modules.py b/opensora/models/diffusion/latte/modules.py
deleted file mode 100644
index 224a0922d..000000000
--- a/opensora/models/diffusion/latte/modules.py
+++ /dev/null
@@ -1,1729 +0,0 @@
-from importlib import import_module
-
-import numpy as np
-from typing import Any, Dict, Optional, Tuple, Callable
-from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_xformers_available
-from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from diffusers.utils.torch_utils import maybe_allow_in_graph
-from diffusers.models.embeddings import SinusoidalPositionalEmbedding, TimestepEmbedding, Timesteps
-from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero
-from diffusers.models.attention_processor import SpatialNorm, LORA_ATTENTION_PROCESSORS, \
- CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0, \
- AttnAddedKVProcessor, AttnAddedKVProcessor2_0, SlicedAttnAddedKVProcessor, XFormersAttnAddedKVProcessor, \
- LoRAAttnAddedKVProcessor, LoRAXFormersAttnProcessor, XFormersAttnProcessor, LoRAAttnProcessor2_0, LoRAAttnProcessor, \
- AttnProcessor, SlicedAttnProcessor, logger
-from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
-
-from dataclasses import dataclass
-
-from opensora.models.diffusion.utils.pos_embed import get_2d_sincos_pos_embed, RoPE1D, RoPE2D, LinearScalingRoPE2D, LinearScalingRoPE1D
-
-if is_xformers_available():
- import xformers
- import xformers.ops
-else:
- xformers = None
-
-
-class CombinedTimestepSizeEmbeddings(nn.Module):
- """
- For PixArt-Alpha.
-
- Reference:
- https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
- """
-
- def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
- super().__init__()
-
- self.outdim = size_emb_dim
- self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
- self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-
- self.use_additional_conditions = use_additional_conditions
-        if use_additional_conditions:
- self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
- self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
- self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
-
- def apply_condition(self, size: torch.Tensor, batch_size: int, embedder: nn.Module):
- if size.ndim == 1:
- size = size[:, None]
-
- if size.shape[0] != batch_size:
- size = size.repeat(batch_size // size.shape[0], 1)
- if size.shape[0] != batch_size:
-                raise ValueError(f"Cannot broadcast `size` with batch dimension {size.shape[0]} to `batch_size` {batch_size}.")
-
- current_batch_size, dims = size.shape[0], size.shape[1]
- size = size.reshape(-1)
- size_freq = self.additional_condition_proj(size).to(size.dtype)
-
- size_emb = embedder(size_freq)
- size_emb = size_emb.reshape(current_batch_size, dims * self.outdim)
- return size_emb
-
- def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
- timesteps_proj = self.time_proj(timestep)
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D)
-
- if self.use_additional_conditions:
- resolution = self.apply_condition(resolution, batch_size=batch_size, embedder=self.resolution_embedder)
- aspect_ratio = self.apply_condition(
- aspect_ratio, batch_size=batch_size, embedder=self.aspect_ratio_embedder
- )
- conditioning = timesteps_emb + torch.cat([resolution, aspect_ratio], dim=1)
- else:
- conditioning = timesteps_emb
-
- return conditioning
-
-class CaptionProjection(nn.Module):
- """
- Projects caption embeddings. Also handles dropout for classifier-free guidance.
-
- Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
- """
-
- def __init__(self, in_features, hidden_size, num_tokens=120):
- super().__init__()
- self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
- self.act_1 = nn.GELU(approximate="tanh")
- self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
- self.register_buffer("y_embedding", nn.Parameter(torch.randn(num_tokens, in_features) / in_features**0.5))
-
- def forward(self, caption, force_drop_ids=None):
- hidden_states = self.linear_1(caption)
- hidden_states = self.act_1(hidden_states)
- hidden_states = self.linear_2(hidden_states)
- return hidden_states
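-
-    # Shape sketch (illustrative, assuming T5-style caption features):
-    #   caption (B, L, in_features) -> linear_1 -> GELU(tanh) -> linear_2
-    #   -> (B, L, hidden_size). `y_embedding` is a learned null-caption table
-    #   kept for classifier-free guidance, even though `force_drop_ids` is
-    #   unused here.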
-
-class PatchEmbed(nn.Module):
- """2D Image to Patch Embedding"""
-
- def __init__(
- self,
- height=224,
- width=224,
- patch_size=16,
- in_channels=3,
- embed_dim=768,
- layer_norm=False,
- flatten=True,
- bias=True,
- interpolation_scale=1,
- ):
- super().__init__()
-
- num_patches = (height // patch_size) * (width // patch_size)
- self.flatten = flatten
- self.layer_norm = layer_norm
-
- self.proj = nn.Conv2d(
- in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
- )
- if layer_norm:
- self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
- else:
- self.norm = None
-
- self.patch_size = patch_size
- # See:
- # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
- self.height, self.width = height // patch_size, width // patch_size
-
- self.base_size = height // patch_size
- self.interpolation_scale = interpolation_scale
- pos_embed = get_2d_sincos_pos_embed(
- embed_dim, int(num_patches**0.5), base_size=self.base_size, interpolation_scale=self.interpolation_scale
- )
- self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
-
- def forward(self, latent):
- height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
-
- latent = self.proj(latent)
- if self.flatten:
- latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC
- if self.layer_norm:
- latent = self.norm(latent)
- # Interpolate positional embeddings if needed.
- # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
- if self.height != height or self.width != width:
- pos_embed = get_2d_sincos_pos_embed(
- embed_dim=self.pos_embed.shape[-1],
- grid_size=(height, width),
- base_size=self.base_size,
- interpolation_scale=self.interpolation_scale,
- )
- pos_embed = torch.from_numpy(pos_embed)
- pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
- else:
- pos_embed = self.pos_embed
- return (latent + pos_embed).to(latent.dtype)
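-
-    # Shape sketch (illustrative): latent (B, C, H, W) -> proj -> (B, D, H/p, W/p)
-    # -> flatten(2).transpose(1, 2) -> (B, N, D) with N = (H/p) * (W/p); the
-    # (1, N, D) sin-cos positional embedding is then broadcast-added.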
-
-
-@maybe_allow_in_graph
-class Attention(nn.Module):
- r"""
- A cross attention layer.
-
- Parameters:
- query_dim (`int`):
- The number of channels in the query.
- cross_attention_dim (`int`, *optional*):
- The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
- heads (`int`, *optional*, defaults to 8):
- The number of heads to use for multi-head attention.
- dim_head (`int`, *optional*, defaults to 64):
- The number of channels in each head.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probability to use.
- bias (`bool`, *optional*, defaults to False):
- Set to `True` for the query, key, and value linear layers to contain a bias parameter.
- upcast_attention (`bool`, *optional*, defaults to False):
- Set to `True` to upcast the attention computation to `float32`.
- upcast_softmax (`bool`, *optional*, defaults to False):
- Set to `True` to upcast the softmax computation to `float32`.
- cross_attention_norm (`str`, *optional*, defaults to `None`):
- The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
- cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
- The number of groups to use for the group norm in the cross attention.
- added_kv_proj_dim (`int`, *optional*, defaults to `None`):
- The number of channels to use for the added key and value projections. If `None`, no projection is used.
- norm_num_groups (`int`, *optional*, defaults to `None`):
- The number of groups to use for the group norm in the attention.
- spatial_norm_dim (`int`, *optional*, defaults to `None`):
- The number of channels to use for the spatial normalization.
- out_bias (`bool`, *optional*, defaults to `True`):
- Set to `True` to use a bias in the output linear layer.
- scale_qk (`bool`, *optional*, defaults to `True`):
- Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
- only_cross_attention (`bool`, *optional*, defaults to `False`):
- Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
- `added_kv_proj_dim` is not `None`.
- eps (`float`, *optional*, defaults to 1e-5):
- An additional value added to the denominator in group normalization that is used for numerical stability.
- rescale_output_factor (`float`, *optional*, defaults to 1.0):
- A factor to rescale the output by dividing it with this value.
- residual_connection (`bool`, *optional*, defaults to `False`):
- Set to `True` to add the residual connection to the output.
- _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
- Set to `True` if the attention block is loaded from a deprecated state dict.
- processor (`AttnProcessor`, *optional*, defaults to `None`):
- The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
- `AttnProcessor` otherwise.
- """
-
- def __init__(
- self,
- query_dim: int,
- cross_attention_dim: Optional[int] = None,
- heads: int = 8,
- dim_head: int = 64,
- dropout: float = 0.0,
- bias: bool = False,
- upcast_attention: bool = False,
- upcast_softmax: bool = False,
- cross_attention_norm: Optional[str] = None,
- cross_attention_norm_num_groups: int = 32,
- added_kv_proj_dim: Optional[int] = None,
- norm_num_groups: Optional[int] = None,
- spatial_norm_dim: Optional[int] = None,
- out_bias: bool = True,
- scale_qk: bool = True,
- only_cross_attention: bool = False,
- eps: float = 1e-5,
- rescale_output_factor: float = 1.0,
- residual_connection: bool = False,
- _from_deprecated_attn_block: bool = False,
- processor: Optional["AttnProcessor"] = None,
- attention_mode: str = 'xformers',
- use_rope: bool = False,
- rope_scaling: Optional[Dict] = None,
- compress_kv_factor: Optional[Tuple] = None,
- ):
- super().__init__()
- self.inner_dim = dim_head * heads
- self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
- self.upcast_attention = upcast_attention
- self.upcast_softmax = upcast_softmax
- self.rescale_output_factor = rescale_output_factor
- self.residual_connection = residual_connection
- self.dropout = dropout
- self.use_rope = use_rope
- self.rope_scaling = rope_scaling
- self.compress_kv_factor = compress_kv_factor
-
- # we make use of this private variable to know whether this class is loaded
-        # with a deprecated state dict so that we can convert it on the fly
- self._from_deprecated_attn_block = _from_deprecated_attn_block
-
- self.scale_qk = scale_qk
- self.scale = dim_head**-0.5 if self.scale_qk else 1.0
-
- self.heads = heads
- # for slice_size > 0 the attention score computation
- # is split across the batch axis to save memory
- # You can set slice_size with `set_attention_slice`
- self.sliceable_head_dim = heads
-
- self.added_kv_proj_dim = added_kv_proj_dim
- self.only_cross_attention = only_cross_attention
-
- if self.added_kv_proj_dim is None and self.only_cross_attention:
- raise ValueError(
- "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
- )
-
- if norm_num_groups is not None:
- self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
- else:
- self.group_norm = None
-
- if spatial_norm_dim is not None:
- self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
- else:
- self.spatial_norm = None
-
- if cross_attention_norm is None:
- self.norm_cross = None
- elif cross_attention_norm == "layer_norm":
- self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
- elif cross_attention_norm == "group_norm":
- if self.added_kv_proj_dim is not None:
- # The given `encoder_hidden_states` are initially of shape
- # (batch_size, seq_len, added_kv_proj_dim) before being projected
- # to (batch_size, seq_len, cross_attention_dim). The norm is applied
- # before the projection, so we need to use `added_kv_proj_dim` as
- # the number of channels for the group norm.
- norm_cross_num_channels = added_kv_proj_dim
- else:
- norm_cross_num_channels = self.cross_attention_dim
-
- self.norm_cross = nn.GroupNorm(
- num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
- )
- else:
- raise ValueError(
- f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
- )
-
- if USE_PEFT_BACKEND:
- linear_cls = nn.Linear
- else:
- linear_cls = LoRACompatibleLinear
-
-        assert not (self.use_rope and (self.compress_kv_factor is not None)), "KV compression and RoPE cannot be enabled at the same time"
- if self.compress_kv_factor is not None:
- self._init_compress()
-
- self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias)
-
- if not self.only_cross_attention:
- # only relevant for the `AddedKVProcessor` classes
- self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
- self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
- else:
- self.to_k = None
- self.to_v = None
-
- if self.added_kv_proj_dim is not None:
- self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
- self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
-
- self.to_out = nn.ModuleList([])
- self.to_out.append(linear_cls(self.inner_dim, query_dim, bias=out_bias))
- self.to_out.append(nn.Dropout(dropout))
-
- # set attention processor
- # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
- # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
- # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
-        if processor is None:
-            if hasattr(F, "scaled_dot_product_attention") and self.scale_qk:
-                processor = AttnProcessor2_0(
-                    self.inner_dim, attention_mode, use_rope,
-                    rope_scaling=rope_scaling, compress_kv_factor=compress_kv_factor,
-                )
-            else:
-                processor = AttnProcessor()
- self.set_processor(processor)
-
- def set_use_memory_efficient_attention_xformers(
- self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
- ) -> None:
- r"""
- Set whether to use memory efficient attention from `xformers` or not.
-
- Args:
- use_memory_efficient_attention_xformers (`bool`):
- Whether to use memory efficient attention from `xformers` or not.
- attention_op (`Callable`, *optional*):
- The attention operation to use. Defaults to `None` which uses the default attention operation from
- `xformers`.
- """
- is_lora = hasattr(self, "processor") and isinstance(
- self.processor,
- LORA_ATTENTION_PROCESSORS,
- )
- is_custom_diffusion = hasattr(self, "processor") and isinstance(
- self.processor,
- (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
- )
- is_added_kv_processor = hasattr(self, "processor") and isinstance(
- self.processor,
- (
- AttnAddedKVProcessor,
- AttnAddedKVProcessor2_0,
- SlicedAttnAddedKVProcessor,
- XFormersAttnAddedKVProcessor,
- LoRAAttnAddedKVProcessor,
- ),
- )
-
- if use_memory_efficient_attention_xformers:
- if is_added_kv_processor and (is_lora or is_custom_diffusion):
- raise NotImplementedError(
- f"Memory efficient attention is currently not supported for LoRA or custom diffusion for attention processor type {self.processor}"
- )
- if not is_xformers_available():
- raise ModuleNotFoundError(
- (
- "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
- " xformers"
- ),
- name="xformers",
- )
- elif not torch.cuda.is_available():
-                raise ValueError(
-                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
-                    " only available for GPU."
-                )
- else:
- try:
- # Make sure we can run the memory efficient attention
- _ = xformers.ops.memory_efficient_attention(
- torch.randn((1, 2, 40), device="cuda"),
- torch.randn((1, 2, 40), device="cuda"),
- torch.randn((1, 2, 40), device="cuda"),
- )
- except Exception as e:
- raise e
-
- if is_lora:
- # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers
- # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0?
- processor = LoRAXFormersAttnProcessor(
- hidden_size=self.processor.hidden_size,
- cross_attention_dim=self.processor.cross_attention_dim,
- rank=self.processor.rank,
- attention_op=attention_op,
- )
- processor.load_state_dict(self.processor.state_dict())
- processor.to(self.processor.to_q_lora.up.weight.device)
- elif is_custom_diffusion:
- processor = CustomDiffusionXFormersAttnProcessor(
- train_kv=self.processor.train_kv,
- train_q_out=self.processor.train_q_out,
- hidden_size=self.processor.hidden_size,
- cross_attention_dim=self.processor.cross_attention_dim,
- attention_op=attention_op,
- )
- processor.load_state_dict(self.processor.state_dict())
- if hasattr(self.processor, "to_k_custom_diffusion"):
- processor.to(self.processor.to_k_custom_diffusion.weight.device)
- elif is_added_kv_processor:
- # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
- # which uses this type of cross attention ONLY because the attention mask of format
- # [0, ..., -10.000, ..., 0, ...,] is not supported
- # throw warning
- logger.info(
- "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
- )
- processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
- else:
- processor = XFormersAttnProcessor(attention_op=attention_op)
- else:
- if is_lora:
- attn_processor_class = (
- LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
- )
- processor = attn_processor_class(
- hidden_size=self.processor.hidden_size,
- cross_attention_dim=self.processor.cross_attention_dim,
- rank=self.processor.rank,
- )
- processor.load_state_dict(self.processor.state_dict())
- processor.to(self.processor.to_q_lora.up.weight.device)
- elif is_custom_diffusion:
- attn_processor_class = (
- CustomDiffusionAttnProcessor2_0
- if hasattr(F, "scaled_dot_product_attention")
- else CustomDiffusionAttnProcessor
- )
- processor = attn_processor_class(
- train_kv=self.processor.train_kv,
- train_q_out=self.processor.train_q_out,
- hidden_size=self.processor.hidden_size,
- cross_attention_dim=self.processor.cross_attention_dim,
- )
- processor.load_state_dict(self.processor.state_dict())
- if hasattr(self.processor, "to_k_custom_diffusion"):
- processor.to(self.processor.to_k_custom_diffusion.weight.device)
- else:
- # set attention processor
- # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
- # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
- # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
- processor = (
- AttnProcessor2_0()
- if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
- else AttnProcessor()
- )
-
- self.set_processor(processor)
-
- def set_attention_slice(self, slice_size: int) -> None:
- r"""
- Set the slice size for attention computation.
-
- Args:
- slice_size (`int`):
- The slice size for attention computation.
- """
- if slice_size is not None and slice_size > self.sliceable_head_dim:
- raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
-
- if slice_size is not None and self.added_kv_proj_dim is not None:
- processor = SlicedAttnAddedKVProcessor(slice_size)
- elif slice_size is not None:
- processor = SlicedAttnProcessor(slice_size)
- elif self.added_kv_proj_dim is not None:
- processor = AttnAddedKVProcessor()
- else:
- # set attention processor
- # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
- # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
- # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
- processor = (
- AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
- )
-
- self.set_processor(processor)
-
- def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False) -> None:
- r"""
- Set the attention processor to use.
-
- Args:
- processor (`AttnProcessor`):
- The attention processor to use.
- _remove_lora (`bool`, *optional*, defaults to `False`):
- Set to `True` to remove LoRA layers from the model.
- """
- if not USE_PEFT_BACKEND and hasattr(self, "processor") and _remove_lora and self.to_q.lora_layer is not None:
- deprecate(
- "set_processor to offload LoRA",
- "0.26.0",
- "In detail, removing LoRA layers via calling `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.",
- )
- # TODO(Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete
- # We need to remove all LoRA layers
- # Don't forget to remove ALL `_remove_lora` from the codebase
- for module in self.modules():
- if hasattr(module, "set_lora_layer"):
- module.set_lora_layer(None)
-
- # if current processor is in `self._modules` and if passed `processor` is not, we need to
- # pop `processor` from `self._modules`
- if (
- hasattr(self, "processor")
- and isinstance(self.processor, torch.nn.Module)
- and not isinstance(processor, torch.nn.Module)
- ):
- logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
- self._modules.pop("processor")
-
- self.processor = processor
-
- def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
- r"""
- Get the attention processor in use.
-
- Args:
- return_deprecated_lora (`bool`, *optional*, defaults to `False`):
- Set to `True` to return the deprecated LoRA attention processor.
-
- Returns:
- "AttentionProcessor": The attention processor in use.
- """
- if not return_deprecated_lora:
- return self.processor
-
- # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
- # serialization format for LoRA Attention Processors. It should be deleted once the integration
- # with PEFT is completed.
- is_lora_activated = {
- name: module.lora_layer is not None
- for name, module in self.named_modules()
- if hasattr(module, "lora_layer")
- }
-
- # 1. if no layer has a LoRA activated we can return the processor as usual
- if not any(is_lora_activated.values()):
- return self.processor
-
-        # `add_k_proj` and `add_v_proj` may not have LoRA applied, so exclude them
-        is_lora_activated.pop("add_k_proj", None)
-        is_lora_activated.pop("add_v_proj", None)
-        # 2. else it is not possible that only some layers have LoRA activated
- if not all(is_lora_activated.values()):
- raise ValueError(
- f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
- )
-
- # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
- non_lora_processor_cls_name = self.processor.__class__.__name__
- lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name)
-
- hidden_size = self.inner_dim
-
- # now create a LoRA attention processor from the LoRA layers
- if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor]:
- kwargs = {
- "cross_attention_dim": self.cross_attention_dim,
- "rank": self.to_q.lora_layer.rank,
- "network_alpha": self.to_q.lora_layer.network_alpha,
- "q_rank": self.to_q.lora_layer.rank,
- "q_hidden_size": self.to_q.lora_layer.out_features,
- "k_rank": self.to_k.lora_layer.rank,
- "k_hidden_size": self.to_k.lora_layer.out_features,
- "v_rank": self.to_v.lora_layer.rank,
- "v_hidden_size": self.to_v.lora_layer.out_features,
- "out_rank": self.to_out[0].lora_layer.rank,
- "out_hidden_size": self.to_out[0].lora_layer.out_features,
- }
-
- if hasattr(self.processor, "attention_op"):
- kwargs["attention_op"] = self.processor.attention_op
-
- lora_processor = lora_processor_cls(hidden_size, **kwargs)
- lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
- lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
- lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
- lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
- elif lora_processor_cls == LoRAAttnAddedKVProcessor:
- lora_processor = lora_processor_cls(
- hidden_size,
- cross_attention_dim=self.add_k_proj.weight.shape[0],
- rank=self.to_q.lora_layer.rank,
- network_alpha=self.to_q.lora_layer.network_alpha,
- )
- lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
- lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
- lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
- lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
-
- # only save if used
- if self.add_k_proj.lora_layer is not None:
- lora_processor.add_k_proj_lora.load_state_dict(self.add_k_proj.lora_layer.state_dict())
- lora_processor.add_v_proj_lora.load_state_dict(self.add_v_proj.lora_layer.state_dict())
- else:
- lora_processor.add_k_proj_lora = None
- lora_processor.add_v_proj_lora = None
- else:
- raise ValueError(f"{lora_processor_cls} does not exist.")
-
- return lora_processor
-
- def forward(
- self,
- hidden_states: torch.FloatTensor,
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
- attention_mask: Optional[torch.FloatTensor] = None,
- **cross_attention_kwargs,
- ) -> torch.Tensor:
- r"""
- The forward method of the `Attention` class.
-
- Args:
- hidden_states (`torch.Tensor`):
- The hidden states of the query.
- encoder_hidden_states (`torch.Tensor`, *optional*):
- The hidden states of the encoder.
- attention_mask (`torch.Tensor`, *optional*):
- The attention mask to use. If `None`, no mask is applied.
- **cross_attention_kwargs:
- Additional keyword arguments to pass along to the cross attention.
-
- Returns:
- `torch.Tensor`: The output of the attention layer.
- """
- # The `Attention` class can call different attention processors / attention functions
- # here we simply pass along all tensors to the selected processor class
- # For standard processors that are defined here, `**cross_attention_kwargs` is empty
- return self.processor(
- self,
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- **cross_attention_kwargs,
- )
-
- def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
- r"""
- Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
- is the number of heads initialized while constructing the `Attention` class.
-
- Args:
- tensor (`torch.Tensor`): The tensor to reshape.
-
- Returns:
- `torch.Tensor`: The reshaped tensor.
- """
- head_size = self.heads
- batch_size, seq_len, dim = tensor.shape
- tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
- tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
- return tensor
-
- def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
- r"""
- Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]` `heads` is
- the number of heads initialized while constructing the `Attention` class.
-
- Args:
- tensor (`torch.Tensor`): The tensor to reshape.
- out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
- reshaped to `[batch_size * heads, seq_len, dim // heads]`.
-
- Returns:
- `torch.Tensor`: The reshaped tensor.
- """
- head_size = self.heads
- batch_size, seq_len, dim = tensor.shape
- tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
- tensor = tensor.permute(0, 2, 1, 3)
-
- if out_dim == 3:
- tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
-
- return tensor
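-
-    # Worked example (illustrative): with heads=8, head_to_batch_dim maps a
-    # (2, 256, 512) tensor to (16, 256, 64) when out_dim=3, and
-    # batch_to_head_dim inverts that back to (2, 256, 512).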
-
- def get_attention_scores(
- self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
- ) -> torch.Tensor:
- r"""
- Compute the attention scores.
-
- Args:
- query (`torch.Tensor`): The query tensor.
- key (`torch.Tensor`): The key tensor.
- attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
-
- Returns:
- `torch.Tensor`: The attention probabilities/scores.
- """
- dtype = query.dtype
- if self.upcast_attention:
- query = query.float()
- key = key.float()
-
- if attention_mask is None:
- baddbmm_input = torch.empty(
- query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
- )
- beta = 0
- else:
- baddbmm_input = attention_mask
- beta = 1
-
- attention_scores = torch.baddbmm(
- baddbmm_input,
- query,
- key.transpose(-1, -2),
- beta=beta,
- alpha=self.scale,
- )
- del baddbmm_input
-
- if self.upcast_softmax:
- attention_scores = attention_scores.float()
-
- attention_probs = attention_scores.softmax(dim=-1)
- del attention_scores
-
- attention_probs = attention_probs.to(dtype)
-
- return attention_probs
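-
-    # Equivalent formulation (a sketch, ignoring the upcast paths):
-    #   scores = beta * mask + self.scale * (query @ key.transpose(-1, -2))
-    #   probs = scores.softmax(dim=-1)
-    # With beta=0 (no mask) the uninitialized `baddbmm_input` buffer is ignored.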
-
- def prepare_attention_mask(
- self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
- ) -> torch.Tensor:
- r"""
- Prepare the attention mask for the attention computation.
-
- Args:
- attention_mask (`torch.Tensor`):
- The attention mask to prepare.
- target_length (`int`):
- The target length of the attention mask. This is the length of the attention mask after padding.
- batch_size (`int`):
- The batch size, which is used to repeat the attention mask.
- out_dim (`int`, *optional*, defaults to `3`):
- The output dimension of the attention mask. Can be either `3` or `4`.
-
- Returns:
- `torch.Tensor`: The prepared attention mask.
- """
- head_size = self.heads
- if attention_mask is None:
- return attention_mask
-
- current_length: int = attention_mask.shape[-1]
- if current_length != target_length:
- if attention_mask.device.type == "mps":
- # HACK: MPS: Does not support padding by greater than dimension of input tensor.
- # Instead, we can manually construct the padding tensor.
- padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
- padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
- attention_mask = torch.cat([attention_mask, padding], dim=2)
- else:
- # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
- # we want to instead pad by (0, remaining_length), where remaining_length is:
- # remaining_length: int = target_length - current_length
- # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
- attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
-
- if out_dim == 3:
- if attention_mask.shape[0] < batch_size * head_size:
- attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
- elif out_dim == 4:
- attention_mask = attention_mask.unsqueeze(1)
- attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
-
- return attention_mask
-
- def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
- r"""
- Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
- `Attention` class.
-
- Args:
- encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
-
- Returns:
- `torch.Tensor`: The normalized encoder hidden states.
- """
- assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
-
- if isinstance(self.norm_cross, nn.LayerNorm):
- encoder_hidden_states = self.norm_cross(encoder_hidden_states)
- elif isinstance(self.norm_cross, nn.GroupNorm):
- # Group norm norms along the channels dimension and expects
- # input to be in the shape of (N, C, *). In this case, we want
- # to norm along the hidden dimension, so we need to move
- # (batch_size, sequence_length, hidden_size) ->
- # (batch_size, hidden_size, sequence_length)
- encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
- encoder_hidden_states = self.norm_cross(encoder_hidden_states)
- encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
- else:
- assert False
-
- return encoder_hidden_states
-
- def _init_compress(self):
- if len(self.compress_kv_factor) == 2:
- self.sr = nn.Conv2d(self.inner_dim, self.inner_dim, groups=self.inner_dim, kernel_size=self.compress_kv_factor, stride=self.compress_kv_factor)
- self.sr.weight.data.fill_(1/self.compress_kv_factor[0]**2)
- elif len(self.compress_kv_factor) == 1:
- self.kernel_size = self.compress_kv_factor[0]
- self.sr = nn.Conv1d(self.inner_dim, self.inner_dim, groups=self.inner_dim, kernel_size=self.compress_kv_factor[0], stride=self.compress_kv_factor[0])
- self.sr.weight.data.fill_(1/self.compress_kv_factor[0])
- self.sr.bias.data.zero_()
- self.norm = nn.LayerNorm(self.inner_dim)
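-
-    # Note (illustrative): with weights filled to 1/factor (1/factor**2 in 2D),
-    # stride equal to kernel size, and zero bias, the depthwise `sr` conv starts
-    # out as plain average pooling over factor-sized windows; training can then
-    # move it away from that initialization.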
-
-class AttnProcessor2_0:
- r"""
- Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
- """
-
- def __init__(self, dim=1152, attention_mode='xformers', use_rope=False, rope_scaling=None, compress_kv_factor=None):
- self.dim = dim
- self.attention_mode = attention_mode
- self.use_rope = use_rope
- self.rope_scaling = rope_scaling
- self.compress_kv_factor = compress_kv_factor
- if self.use_rope:
- self._init_rope()
-
- if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or newer.")
-
- def _init_rope(self):
- if self.rope_scaling is None:
- self.rope2d = RoPE2D()
- self.rope1d = RoPE1D()
- else:
- scaling_type = self.rope_scaling["type"]
- scaling_factor_2d = self.rope_scaling["factor_2d"]
- scaling_factor_1d = self.rope_scaling["factor_1d"]
- if scaling_type == "linear":
- self.rope2d = LinearScalingRoPE2D(scaling_factor=scaling_factor_2d)
- self.rope1d = LinearScalingRoPE1D(scaling_factor=scaling_factor_1d)
- else:
- raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
-
- def __call__(
- self,
- attn: Attention,
- hidden_states: torch.FloatTensor,
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
- attention_mask: Optional[torch.FloatTensor] = None,
- temb: Optional[torch.FloatTensor] = None,
- scale: float = 1.0,
- position_q: Optional[torch.LongTensor] = None,
- position_k: Optional[torch.LongTensor] = None,
- last_shape: Tuple[int] = None,
- ) -> torch.FloatTensor:
- residual = hidden_states
-
- args = () if USE_PEFT_BACKEND else (scale,)
-
- if attn.spatial_norm is not None:
- hidden_states = attn.spatial_norm(hidden_states, temb)
-
- input_ndim = hidden_states.ndim
-
- if input_ndim == 4:
- batch_size, channel, height, width = hidden_states.shape
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
- if self.compress_kv_factor is not None:
- batch_size = hidden_states.shape[0]
- if len(last_shape) == 2:
- encoder_hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, self.dim, *last_shape)
- encoder_hidden_states = attn.sr(encoder_hidden_states).reshape(batch_size, self.dim, -1).permute(0, 2, 1)
- elif len(last_shape) == 1:
- encoder_hidden_states = hidden_states.permute(0, 2, 1)
- if last_shape[0] % 2 == 1:
- first_frame_pad = encoder_hidden_states[:, :, :1].repeat((1, 1, attn.kernel_size - 1))
-                    encoder_hidden_states = torch.cat((first_frame_pad, encoder_hidden_states), dim=2)
- encoder_hidden_states = attn.sr(encoder_hidden_states).permute(0, 2, 1)
- else:
-                raise NotImplementedError(f'Unsupported last_shape: {last_shape}')
-
- encoder_hidden_states = attn.norm(encoder_hidden_states)
-
- batch_size, sequence_length, _ = (
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
- )
-
- if attention_mask is not None:
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
- # scaled_dot_product_attention expects attention_mask shape to be
- # (batch, heads, source_length, target_length)
- attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
- if attn.group_norm is not None:
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
- args = () if USE_PEFT_BACKEND else (scale,)
- query = attn.to_q(hidden_states, *args)
-
- if encoder_hidden_states is None:
- encoder_hidden_states = hidden_states
- elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
- key = attn.to_k(encoder_hidden_states, *args)
- value = attn.to_v(encoder_hidden_states, *args)
-
- inner_dim = key.shape[-1]
- head_dim = inner_dim // attn.heads
-
- query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
- key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
- value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
- if self.use_rope:
- # require the shape of (batch_size x nheads x ntokens x dim)
- if position_q.ndim == 3:
- query = self.rope2d(query, position_q)
- elif position_q.ndim == 2:
- query = self.rope1d(query, position_q)
- else:
-                raise NotImplementedError(f'Unsupported position_q ndim: {position_q.ndim}')
- if position_k.ndim == 3:
- key = self.rope2d(key, position_k)
- elif position_k.ndim == 2:
- key = self.rope1d(key, position_k)
- else:
-                raise NotImplementedError(f'Unsupported position_k ndim: {position_k.ndim}')
-
- # the output of sdp = (batch, num_heads, seq_len, head_dim)
- # TODO: add support for attn.scale when we move to Torch 2.1
- if self.attention_mode == 'flash':
-            assert attention_mask is None or torch.all(attention_mask.bool()), 'flash attention does not support attention_mask'
- with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
- hidden_states = F.scaled_dot_product_attention(
- query, key, value, dropout_p=0.0, is_causal=False
- )
- elif self.attention_mode == 'xformers':
- with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
- hidden_states = F.scaled_dot_product_attention(
- query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
- )
- elif self.attention_mode == 'math':
- hidden_states = F.scaled_dot_product_attention(
- query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
- )
- else:
-            raise NotImplementedError(f'Unknown attention_mode: {self.attention_mode}')
- hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
- hidden_states = hidden_states.to(query.dtype)
-
- # linear proj
- hidden_states = attn.to_out[0](hidden_states, *args)
- # dropout
- hidden_states = attn.to_out[1](hidden_states)
-
- if input_ndim == 4:
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
- if attn.residual_connection:
- hidden_states = hidden_states + residual
-
- hidden_states = hidden_states / attn.rescale_output_factor
-
- return hidden_states
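-
-    # Note (illustrative): every attention_mode branch ultimately calls
-    # F.scaled_dot_product_attention; 'flash' and 'xformers' only constrain
-    # which sdp backend PyTorch may pick (despite its name, the 'xformers'
-    # branch uses torch's memory-efficient kernel, not the xformers library),
-    # while 'math' leaves backend selection to PyTorch's defaults.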
-
-@maybe_allow_in_graph
-class GatedSelfAttentionDense(nn.Module):
- r"""
- A gated self-attention dense layer that combines visual features and object features.
-
- Parameters:
- query_dim (`int`): The number of channels in the query.
- context_dim (`int`): The number of channels in the context.
- n_heads (`int`): The number of heads to use for attention.
- d_head (`int`): The number of channels in each head.
- """
-
- def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
- super().__init__()
-
- # we need a linear projection since we need cat visual feature and obj feature
- self.linear = nn.Linear(context_dim, query_dim)
-
- self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
- self.ff = FeedForward(query_dim, activation_fn="geglu")
-
- self.norm1 = nn.LayerNorm(query_dim)
- self.norm2 = nn.LayerNorm(query_dim)
-
- self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
- self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
-
- self.enabled = True
-
- def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
- if not self.enabled:
- return x
-
- n_visual = x.shape[1]
- objs = self.linear(objs)
-
- x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
- x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
-
- return x
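-
-    # Note (illustrative): both residual branches are gated by tanh of a
-    # learnable scalar initialized to 0, so the block starts as an identity
-    # mapping and the GLIGEN object conditioning fades in during training.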
-
-
-class FeedForward(nn.Module):
- r"""
- A feed-forward layer.
-
- Parameters:
- dim (`int`): The number of channels in the input.
- dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
- mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
- final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
- """
-
- def __init__(
- self,
- dim: int,
- dim_out: Optional[int] = None,
- mult: int = 4,
- dropout: float = 0.0,
- activation_fn: str = "geglu",
- final_dropout: bool = False,
- ):
- super().__init__()
- inner_dim = int(dim * mult)
- dim_out = dim_out if dim_out is not None else dim
- linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
-
-        if activation_fn == "gelu":
-            act_fn = GELU(dim, inner_dim)
-        elif activation_fn == "gelu-approximate":
-            act_fn = GELU(dim, inner_dim, approximate="tanh")
-        elif activation_fn == "geglu":
-            act_fn = GEGLU(dim, inner_dim)
-        elif activation_fn == "geglu-approximate":
-            act_fn = ApproximateGELU(dim, inner_dim)
-        else:
-            raise ValueError(f"Unknown activation_fn: {activation_fn}")
-
- self.net = nn.ModuleList([])
- # project in
- self.net.append(act_fn)
- # project dropout
- self.net.append(nn.Dropout(dropout))
- # project out
- self.net.append(linear_cls(inner_dim, dim_out))
- # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
- if final_dropout:
- self.net.append(nn.Dropout(dropout))
-
- def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
- compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
- for module in self.net:
- if isinstance(module, compatible_cls):
- hidden_states = module(hidden_states, scale)
- else:
- hidden_states = module(hidden_states)
- return hidden_states
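-
-# Sketch of the default "geglu" path (illustrative): GEGLU projects dim to
-# 2 * inner_dim and gates one half with GELU of the other,
-#   hidden, gate = proj(x).chunk(2, dim=-1); out = hidden * gelu(gate)
-# before Dropout and the final linear back to dim_out.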
-
-
-@maybe_allow_in_graph
-class BasicTransformerBlock_(nn.Module):
- r"""
- A basic Transformer block.
-
- Parameters:
- dim (`int`): The number of channels in the input and output.
- num_attention_heads (`int`): The number of heads to use for multi-head attention.
- attention_head_dim (`int`): The number of channels in each head.
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
- cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        num_embeds_ada_norm (`int`, *optional*):
-            The number of diffusion steps used during training. See `Transformer2DModel`.
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Configure if the attentions should contain a bias parameter.
- only_cross_attention (`bool`, *optional*):
- Whether to use only cross-attention layers. In this case two cross attention layers are used.
- double_self_attention (`bool`, *optional*):
- Whether to use two self-attention layers. In this case no cross attention layers are used.
- upcast_attention (`bool`, *optional*):
- Whether to upcast the attention computation to float32. This is useful for mixed precision training.
- norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
- Whether to use learnable elementwise affine parameters for normalization.
- norm_type (`str`, *optional*, defaults to `"layer_norm"`):
- The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
- final_dropout (`bool` *optional*, defaults to False):
- Whether to apply a final dropout after the last feed-forward layer.
- attention_type (`str`, *optional*, defaults to `"default"`):
- The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
- positional_embeddings (`str`, *optional*, defaults to `None`):
-            The type of positional embeddings to apply.
- num_positional_embeddings (`int`, *optional*, defaults to `None`):
- The maximum number of positional embeddings to apply.
- """
-
- def __init__(
- self,
- dim: int,
- num_attention_heads: int,
- attention_head_dim: int,
- dropout=0.0,
- cross_attention_dim: Optional[int] = None,
- activation_fn: str = "geglu",
- num_embeds_ada_norm: Optional[int] = None,
- attention_bias: bool = False,
- only_cross_attention: bool = False,
- double_self_attention: bool = False,
- upcast_attention: bool = False,
- norm_elementwise_affine: bool = True,
- norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
- norm_eps: float = 1e-5,
- final_dropout: bool = False,
- attention_type: str = "default",
- positional_embeddings: Optional[str] = None,
- num_positional_embeddings: Optional[int] = None,
- attention_mode: str = "xformers",
- use_rope: bool = False,
- rope_scaling: Optional[Dict] = None,
- compress_kv_factor: Optional[Tuple] = None,
- ):
- super().__init__()
- self.only_cross_attention = only_cross_attention
-
- self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
- self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
- self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
- self.use_layer_norm = norm_type == "layer_norm"
-
- if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
- raise ValueError(
- f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
- f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
- )
-
- if positional_embeddings and (num_positional_embeddings is None):
-            raise ValueError(
-                "If `positional_embeddings` type is defined, `num_positional_embeddings` must also be defined."
-            )
-
- if positional_embeddings == "sinusoidal":
- self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
- else:
- self.pos_embed = None
-
- # Define 3 blocks. Each block has its own normalization layer.
- # 1. Self-Attn
- if self.use_ada_layer_norm:
- self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
- elif self.use_ada_layer_norm_zero:
- self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
- else:
- self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
-
- self.attn1 = Attention(
- query_dim=dim,
- heads=num_attention_heads,
- dim_head=attention_head_dim,
- dropout=dropout,
- bias=attention_bias,
- cross_attention_dim=cross_attention_dim if only_cross_attention else None,
- upcast_attention=upcast_attention,
- attention_mode=attention_mode,
- use_rope=use_rope,
- rope_scaling=rope_scaling,
- compress_kv_factor=compress_kv_factor,
- )
-
- # # 2. Cross-Attn
- # if cross_attention_dim is not None or double_self_attention:
- # # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
- # # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
- # # the second cross attention block.
- # self.norm2 = (
- # AdaLayerNorm(dim, num_embeds_ada_norm)
- # if self.use_ada_layer_norm
- # else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
- # )
- # self.attn2 = Attention(
- # query_dim=dim,
- # cross_attention_dim=cross_attention_dim if not double_self_attention else None,
- # heads=num_attention_heads,
- # dim_head=attention_head_dim,
- # dropout=dropout,
- # bias=attention_bias,
- # upcast_attention=upcast_attention,
- # ) # is self-attn if encoder_hidden_states is none
- # else:
- # self.norm2 = None
- # self.attn2 = None
-
- # 3. Feed-forward
- # if not self.use_ada_layer_norm_single:
- # self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
- self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
-
- self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
-
- # 4. Fuser
- if attention_type == "gated" or attention_type == "gated-text-image":
- self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
-
- # 5. Scale-shift for PixArt-Alpha.
- if self.use_ada_layer_norm_single:
- self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim ** 0.5)
-
- # let chunk size default to None
- self._chunk_size = None
- self._chunk_dim = 0
-
- def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
- # Sets chunk feed-forward
- self._chunk_size = chunk_size
- self._chunk_dim = dim
-
- def forward(
- self,
- hidden_states: torch.FloatTensor,
- attention_mask: Optional[torch.FloatTensor] = None,
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
- encoder_attention_mask: Optional[torch.FloatTensor] = None,
- timestep: Optional[torch.LongTensor] = None,
- cross_attention_kwargs: Dict[str, Any] = None,
- class_labels: Optional[torch.LongTensor] = None,
- position_q: Optional[torch.LongTensor] = None,
- position_k: Optional[torch.LongTensor] = None,
- frame: int = None,
- ) -> torch.FloatTensor:
- # Notice that normalization is always applied before the real computation in the following blocks.
- # 0. Self-Attention
- batch_size = hidden_states.shape[0]
-
- if self.use_ada_layer_norm:
- norm_hidden_states = self.norm1(hidden_states, timestep)
- elif self.use_ada_layer_norm_zero:
- norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
- hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
- )
- elif self.use_layer_norm:
- norm_hidden_states = self.norm1(hidden_states)
- elif self.use_ada_layer_norm_single:
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
- self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
- ).chunk(6, dim=1)
- norm_hidden_states = self.norm1(hidden_states)
- norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
- norm_hidden_states = norm_hidden_states.squeeze(1)
- else:
- raise ValueError("Incorrect norm used")
-
- if self.pos_embed is not None:
- norm_hidden_states = self.pos_embed(norm_hidden_states)
-
- # 1. Retrieve lora scale.
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
- # 2. Prepare GLIGEN inputs
- cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
- gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
-
- attn_output = self.attn1(
- norm_hidden_states,
- encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
- attention_mask=attention_mask,
- position_q=position_q,
- position_k=position_k,
- last_shape=frame,
- **cross_attention_kwargs,
- )
- if self.use_ada_layer_norm_zero:
- attn_output = gate_msa.unsqueeze(1) * attn_output
- elif self.use_ada_layer_norm_single:
- attn_output = gate_msa * attn_output
-
- hidden_states = attn_output + hidden_states
- if hidden_states.ndim == 4:
- hidden_states = hidden_states.squeeze(1)
-
- # 2.5 GLIGEN Control
- if gligen_kwargs is not None:
- hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
-
- # # 3. Cross-Attention
- # if self.attn2 is not None:
- # if self.use_ada_layer_norm:
- # norm_hidden_states = self.norm2(hidden_states, timestep)
- # elif self.use_ada_layer_norm_zero or self.use_layer_norm:
- # norm_hidden_states = self.norm2(hidden_states)
- # elif self.use_ada_layer_norm_single:
- # # For PixArt norm2 isn't applied here:
- # # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
- # norm_hidden_states = hidden_states
- # else:
- # raise ValueError("Incorrect norm")
-
- # if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
- # norm_hidden_states = self.pos_embed(norm_hidden_states)
-
- # attn_output = self.attn2(
- # norm_hidden_states,
- # encoder_hidden_states=encoder_hidden_states,
- # attention_mask=encoder_attention_mask,
- # **cross_attention_kwargs,
- # )
- # hidden_states = attn_output + hidden_states
-
- # 4. Feed-forward
- # if not self.use_ada_layer_norm_single:
- # norm_hidden_states = self.norm3(hidden_states)
-
- if self.use_ada_layer_norm_zero:
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-
- if self.use_ada_layer_norm_single:
- # norm_hidden_states = self.norm2(hidden_states)
- norm_hidden_states = self.norm3(hidden_states)
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
-
- if self._chunk_size is not None:
- # "feed_forward_chunk_size" can be used to save memory
- if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
- raise ValueError(
- f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
- )
-
- num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
- ff_output = torch.cat(
- [
- self.ff(hid_slice, scale=lora_scale)
- for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)
- ],
- dim=self._chunk_dim,
- )
- else:
- ff_output = self.ff(norm_hidden_states, scale=lora_scale)
-
- if self.use_ada_layer_norm_zero:
- ff_output = gate_mlp.unsqueeze(1) * ff_output
- elif self.use_ada_layer_norm_single:
- ff_output = gate_mlp * ff_output
-
- hidden_states = ff_output + hidden_states
- if hidden_states.ndim == 4:
- hidden_states = hidden_states.squeeze(1)
-
- return hidden_states
-
-
-@maybe_allow_in_graph
-class BasicTransformerBlock(nn.Module):
- r"""
- A basic Transformer block.
-
- Parameters:
- dim (`int`): The number of channels in the input and output.
- num_attention_heads (`int`): The number of heads to use for multi-head attention.
- attention_head_dim (`int`): The number of channels in each head.
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
- cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
- num_embeds_ada_norm (:
- obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
- attention_bias (:
- obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
- only_cross_attention (`bool`, *optional*):
- Whether to use only cross-attention layers. In this case two cross attention layers are used.
- double_self_attention (`bool`, *optional*):
- Whether to use two self-attention layers. In this case no cross attention layers are used.
- upcast_attention (`bool`, *optional*):
- Whether to upcast the attention computation to float32. This is useful for mixed precision training.
- norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
- Whether to use learnable elementwise affine parameters for normalization.
- norm_type (`str`, *optional*, defaults to `"layer_norm"`):
- The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
- final_dropout (`bool` *optional*, defaults to False):
- Whether to apply a final dropout after the last feed-forward layer.
- attention_type (`str`, *optional*, defaults to `"default"`):
- The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
- positional_embeddings (`str`, *optional*, defaults to `None`):
- The type of positional embeddings to apply to.
- num_positional_embeddings (`int`, *optional*, defaults to `None`):
- The maximum number of positional embeddings to apply.
- """
-
- def __init__(
- self,
- dim: int,
- num_attention_heads: int,
- attention_head_dim: int,
- dropout=0.0,
- cross_attention_dim: Optional[int] = None,
- activation_fn: str = "geglu",
- num_embeds_ada_norm: Optional[int] = None,
- attention_bias: bool = False,
- only_cross_attention: bool = False,
- double_self_attention: bool = False,
- upcast_attention: bool = False,
- norm_elementwise_affine: bool = True,
- norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
- norm_eps: float = 1e-5,
- final_dropout: bool = False,
- attention_type: str = "default",
- positional_embeddings: Optional[str] = None,
- num_positional_embeddings: Optional[int] = None,
- attention_mode: str = "xformers",
- use_rope: bool = False,
- rope_scaling: Optional[Dict] = None,
- compress_kv_factor: Optional[Tuple] = None,
- ):
- super().__init__()
- self.only_cross_attention = only_cross_attention
-
- self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
- self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
- self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
- self.use_layer_norm = norm_type == "layer_norm"
-
- if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
- raise ValueError(
- f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
- f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
- )
-
- if positional_embeddings and (num_positional_embeddings is None):
- raise ValueError(
- "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
- )
-
- if positional_embeddings == "sinusoidal":
- self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
- else:
- self.pos_embed = None
-
- # Define 3 blocks. Each block has its own normalization layer.
- # 1. Self-Attn
- if self.use_ada_layer_norm:
- self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
- elif self.use_ada_layer_norm_zero:
- self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
- else:
- self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
-
- self.attn1 = Attention(
- query_dim=dim,
- heads=num_attention_heads,
- dim_head=attention_head_dim,
- dropout=dropout,
- bias=attention_bias,
- cross_attention_dim=cross_attention_dim if only_cross_attention else None,
- upcast_attention=upcast_attention,
- attention_mode=attention_mode,
- use_rope=use_rope,
- rope_scaling=rope_scaling,
- compress_kv_factor=compress_kv_factor,
- )
-
- # 2. Cross-Attn
- if cross_attention_dim is not None or double_self_attention:
- # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
- # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
- # the second cross attention block.
- self.norm2 = (
- AdaLayerNorm(dim, num_embeds_ada_norm)
- if self.use_ada_layer_norm
- else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
- )
- self.attn2 = Attention(
- query_dim=dim,
- cross_attention_dim=cross_attention_dim if not double_self_attention else None,
- heads=num_attention_heads,
- dim_head=attention_head_dim,
- dropout=dropout,
- bias=attention_bias,
- upcast_attention=upcast_attention,
- attention_mode=attention_mode, # only xformers support attention_mask
- use_rope=False, # do not position in cross attention
- compress_kv_factor=None,
- ) # is self-attn if encoder_hidden_states is none
- else:
- self.norm2 = None
- self.attn2 = None
-
- # 3. Feed-forward
- if not self.use_ada_layer_norm_single:
- self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
-
- self.ff = FeedForward(
- dim,
- dropout=dropout,
- activation_fn=activation_fn,
- final_dropout=final_dropout,
- )
-
- # 4. Fuser
- if attention_type == "gated" or attention_type == "gated-text-image":
- self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
-
- # 5. Scale-shift for PixArt-Alpha.
- if self.use_ada_layer_norm_single:
- self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
-
- # let chunk size default to None
- self._chunk_size = None
- self._chunk_dim = 0
-
- def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
- # Sets chunk feed-forward
- self._chunk_size = chunk_size
- self._chunk_dim = dim
-
- def forward(
- self,
- hidden_states: torch.FloatTensor,
- attention_mask: Optional[torch.FloatTensor] = None,
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
- encoder_attention_mask: Optional[torch.FloatTensor] = None,
- timestep: Optional[torch.LongTensor] = None,
- cross_attention_kwargs: Dict[str, Any] = None,
- class_labels: Optional[torch.LongTensor] = None,
- position_q: Optional[torch.LongTensor] = None,
- position_k: Optional[torch.LongTensor] = None,
- hw: Tuple[int, int] = None,
- ) -> torch.FloatTensor:
- # Notice that normalization is always applied before the real computation in the following blocks.
- # 0. Self-Attention
- batch_size = hidden_states.shape[0]
-
- if self.use_ada_layer_norm:
- norm_hidden_states = self.norm1(hidden_states, timestep)
- elif self.use_ada_layer_norm_zero:
- norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
- hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
- )
- elif self.use_layer_norm:
- norm_hidden_states = self.norm1(hidden_states)
- elif self.use_ada_layer_norm_single:
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
- self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
- ).chunk(6, dim=1)
- norm_hidden_states = self.norm1(hidden_states)
- norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
- norm_hidden_states = norm_hidden_states.squeeze(1)
- else:
- raise ValueError("Incorrect norm used")
-
- if self.pos_embed is not None:
- norm_hidden_states = self.pos_embed(norm_hidden_states)
-
- # 1. Retrieve lora scale.
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
- # 2. Prepare GLIGEN inputs
- cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
- gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
-
- attn_output = self.attn1(
- norm_hidden_states,
- encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
- attention_mask=attention_mask,
- position_q=position_q,
- position_k=position_k,
- last_shape=hw,
- **cross_attention_kwargs,
- )
- if self.use_ada_layer_norm_zero:
- attn_output = gate_msa.unsqueeze(1) * attn_output
- elif self.use_ada_layer_norm_single:
- attn_output = gate_msa * attn_output
-
- hidden_states = attn_output + hidden_states
- if hidden_states.ndim == 4:
- hidden_states = hidden_states.squeeze(1)
-
- # 2.5 GLIGEN Control
- if gligen_kwargs is not None:
- hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
-
- # 3. Cross-Attention
- if self.attn2 is not None:
- if self.use_ada_layer_norm:
- norm_hidden_states = self.norm2(hidden_states, timestep)
- elif self.use_ada_layer_norm_zero or self.use_layer_norm:
- norm_hidden_states = self.norm2(hidden_states)
- elif self.use_ada_layer_norm_single:
- # For PixArt norm2 isn't applied here:
- # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
- norm_hidden_states = hidden_states
- else:
- raise ValueError("Incorrect norm")
-
- if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
- norm_hidden_states = self.pos_embed(norm_hidden_states)
-
- attn_output = self.attn2(
- norm_hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=encoder_attention_mask,
- position_q=None, # cross attn do not need relative position
- position_k=None,
- last_shape=None,
- **cross_attention_kwargs,
- )
- hidden_states = attn_output + hidden_states
-
- # 4. Feed-forward
- if not self.use_ada_layer_norm_single:
- norm_hidden_states = self.norm3(hidden_states)
-
- if self.use_ada_layer_norm_zero:
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-
- if self.use_ada_layer_norm_single:
- norm_hidden_states = self.norm2(hidden_states)
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
-
- if self._chunk_size is not None:
- # "feed_forward_chunk_size" can be used to save memory
- ff_output = _chunked_feed_forward(
- self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
- )
- else:
- ff_output = self.ff(norm_hidden_states, scale=lora_scale)
-
- if self.use_ada_layer_norm_zero:
- ff_output = gate_mlp.unsqueeze(1) * ff_output
- elif self.use_ada_layer_norm_single:
- ff_output = gate_mlp * ff_output
-
- hidden_states = ff_output + hidden_states
- if hidden_states.ndim == 4:
- hidden_states = hidden_states.squeeze(1)
-
- return hidden_states
-
-class AdaLayerNormSingle(nn.Module):
- r"""
- Norm layer adaptive layer norm single (adaLN-single).
-
- As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
-
- Parameters:
- embedding_dim (`int`): The size of each embedding vector.
- use_additional_conditions (`bool`): To use additional conditions for normalization or not.
- """
-
- def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
- super().__init__()
-
- self.emb = CombinedTimestepSizeEmbeddings(
- embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
- )
-
- self.silu = nn.SiLU()
- self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
-
- def forward(
- self,
- timestep: torch.Tensor,
- added_cond_kwargs: Dict[str, torch.Tensor] = None,
- batch_size: int = None,
- hidden_dtype: Optional[torch.dtype] = None,
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
- # No modulation happening here.
- embedded_timestep = self.emb(timestep, batch_size=batch_size, hidden_dtype=hidden_dtype, resolution=None,
- aspect_ratio=None)
- return self.linear(self.silu(embedded_timestep)), embedded_timestep
-
-
-@dataclass
-class Transformer3DModelOutput(BaseOutput):
- """
- The output of [`Transformer2DModel`].
-
- Args:
- sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
- The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
- distributions for the unnoised latent pixels.
- """
-
- sample: torch.FloatTensor
diff --git a/opensora/models/diffusion/opensora/__init__.py b/opensora/models/diffusion/opensora/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/models/diffusion/opensora/modeling_opensora.py b/opensora/models/diffusion/opensora/modeling_opensora.py
new file mode 100644
index 000000000..f5fc0ca2b
--- /dev/null
+++ b/opensora/models/diffusion/opensora/modeling_opensora.py
@@ -0,0 +1,843 @@
+import os
+import numpy as np
+from torch import nn
+import torch
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from torch.nn import functional as F
+from diffusers.models.transformer_2d import Transformer2DModelOutput
+from diffusers.utils import is_torch_version, deprecate, logging
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+from diffusers.models.embeddings import PixArtAlphaTextProjection
+from opensora.models.diffusion.opensora.modules import OverlapPatchEmbed3D, OverlapPatchEmbed2D, PatchEmbed2D, BasicTransformerBlock
+from opensora.utils.utils import to_2tuple
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+except ImportError:
+ torch_npu = None
+ npu_config = None
+    from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
+
+logger = logging.get_logger(__name__)
+
+class OpenSoraT2V(ModelMixin, ConfigMixin):
+ """
+    A Transformer model for patched video and image latents (text-to-video).
+
+ Parameters:
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+ in_channels (`int`, *optional*):
+ The number of channels in the input and output (specify if the input is **continuous**).
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        sample_size (`int` or `Tuple[int, int]`, *optional*): The spatial size (height, width) of the latent frames.
+            This is fixed during training since it is used to learn a number of position embeddings.
+ num_vector_embeds (`int`, *optional*):
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+ Includes the class for the masked latent pixel.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+ num_embeds_ada_norm ( `int`, *optional*):
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+ added to the hidden states.
+
+ During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
+ attention_bias (`bool`, *optional*):
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
+ """
+
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ sample_size_t: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ patch_size: Optional[int] = None,
+ patch_size_t: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_elementwise_affine: bool = True,
+ norm_eps: float = 1e-5,
+ attention_type: str = "default",
+ caption_channels: int = None,
+ interpolation_scale_h: float = None,
+ interpolation_scale_w: float = None,
+ interpolation_scale_t: float = None,
+ use_additional_conditions: Optional[bool] = None,
+ attention_mode: str = 'xformers',
+ downsampler: str = None,
+ use_rope: bool = False,
+ use_stable_fp32: bool = False,
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ use_motion: bool = False,
+ ):
+ super().__init__()
+
+ # Validate inputs.
+ if patch_size is not None:
+ if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single"]:
+ raise NotImplementedError(
+ f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+ )
+ elif norm_type in ["ada_norm", "ada_norm_zero"] and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+ )
+
+ # Set some common variables used across the board.
+ self.use_motion = use_motion
+ self.sparse1d = sparse1d
+ self.sparse2d = sparse2d
+ self.sparse_n = sparse_n
+ self.use_rope = use_rope
+ self.use_linear_projection = use_linear_projection
+ self.interpolation_scale_t = interpolation_scale_t
+ self.interpolation_scale_h = interpolation_scale_h
+ self.interpolation_scale_w = interpolation_scale_w
+ self.downsampler = downsampler
+ self.caption_channels = caption_channels
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.gradient_checkpointing = False
+ self.config.hidden_size = self.inner_dim
+ use_additional_conditions = False
+ # if use_additional_conditions is None:
+ # if norm_type == "ada_norm_single" and sample_size == 128:
+ # use_additional_conditions = True
+ # else:
+ # use_additional_conditions = False
+ self.use_additional_conditions = use_additional_conditions
+
+        # 1. OpenSoraT2V only supports continuous patched inputs, so both
+        # `in_channels` and `patch_size` must be provided.
+        assert in_channels is not None and patch_size is not None
+
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+ deprecation_message = (
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+ )
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+ norm_type = "ada_norm"
+
+ # 2. Initialize the right blocks.
+ # Initialize the output blocks and other projection blocks when necessary.
+ self._init_patched_inputs(norm_type=norm_type)
+
+ def _init_patched_inputs(self, norm_type):
+ assert self.config.sample_size_t is not None, "OpenSoraT2V over patched input must provide sample_size_t"
+ assert self.config.sample_size is not None, "OpenSoraT2V over patched input must provide sample_size"
+        #assert not (self.config.sample_size_t == 1 and self.config.patch_size_t == 2), "Images do not need patchify in t-dim"
+
+ self.num_frames = self.config.sample_size_t
+ self.config.sample_size = to_2tuple(self.config.sample_size)
+ self.height = self.config.sample_size[0]
+ self.width = self.config.sample_size[1]
+ self.patch_size_t = self.config.patch_size_t
+ self.patch_size = self.config.patch_size
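+        # Default positional-embedding interpolation scales: the temporal scale is
+        # roughly sample_size_t / 16 (rounded up for odd frame counts) and the
+        # spatial scales normalize height/width against a 30x40 base latent grid.
+        # Explicit interpolation_scale_* config values take precedence below.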
+ interpolation_scale_t = ((self.config.sample_size_t - 1) // 16 + 1) if self.config.sample_size_t % 2 == 1 else self.config.sample_size_t / 16
+ interpolation_scale_t = (
+ self.config.interpolation_scale_t if self.config.interpolation_scale_t is not None else interpolation_scale_t
+ )
+ interpolation_scale = (
+ self.config.interpolation_scale_h if self.config.interpolation_scale_h is not None else self.config.sample_size[0] / 30,
+ self.config.interpolation_scale_w if self.config.interpolation_scale_w is not None else self.config.sample_size[1] / 40,
+ )
+ # if self.config.sample_size_t > 1:
+ # self.pos_embed = PatchEmbed3D(
+ # num_frames=self.config.sample_size_t,
+ # height=self.config.sample_size[0],
+ # width=self.config.sample_size[1],
+ # patch_size_t=self.config.patch_size_t,
+ # patch_size=self.config.patch_size,
+ # in_channels=self.in_channels,
+ # embed_dim=self.inner_dim,
+ # interpolation_scale=interpolation_scale,
+ # interpolation_scale_t=interpolation_scale_t,
+ # )
+ # else:
+ if self.config.downsampler is not None and len(self.config.downsampler) == 9:
+ self.pos_embed = OverlapPatchEmbed3D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+ elif self.config.downsampler is not None and len(self.config.downsampler) == 7:
+ self.pos_embed = OverlapPatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+
+ else:
+ self.pos_embed = PatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+ interpolation_scale_thw = (interpolation_scale_t, *interpolation_scale)
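+        # Transformer stack; when sparse attention is enabled, odd-indexed blocks
+        # use the shifted sparse grouping (sparse_group=True) so the two groupings
+        # alternate through the depth of the network.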
+ self.transformer_blocks = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.config.cross_attention_dim,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ downsampler=self.config.downsampler,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=self.sparse1d,
+ sparse2d=self.sparse2d,
+ sparse_n=self.sparse_n,
+ sparse_group=i % 2 == 1,
+ )
+ for i in range(self.config.num_layers)
+ ]
+ )
+
+ if self.config.norm_type != "ada_norm_single":
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+ self.proj_out_2 = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+ elif self.config.norm_type == "ada_norm_single":
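+            # PixArt-style adaLN-single output head: a learned (shift, scale) table
+            # is added to the embedded timestep and modulates the final norm output.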
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+ self.proj_out = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+
+ # PixArt-Alpha blocks.
+ self.adaln_single = None
+ if self.config.norm_type == "ada_norm_single":
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+ # additional conditions until we find better name
+ self.adaln_single = AdaLayerNormSingle(
+ self.inner_dim, use_additional_conditions=self.use_additional_conditions
+ )
+
+ self.caption_projection = None
+ if self.caption_channels is not None:
+ self.caption_projection = PixArtAlphaTextProjection(
+ in_features=self.caption_channels, hidden_size=self.inner_dim
+ )
+
+ @property
+ def attn_processors(self):
+ r"""
+ Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model,
+            indexed by their weight names.
+ """
+ # set recursively
+ processors = {}
+
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
+ if hasattr(module, "get_processor"):
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+ for sub_name, child in module.named_children():
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+ return processors
+
+ for name, module in self.named_children():
+ fn_recursive_add_processors(name, module, processors)
+
+ return processors
+
+ def set_attn_processor(self, processor):
+ r"""
+ Sets the attention processor to use to compute attention.
+
+ Parameters:
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
+ for **all** `Attention` layers.
+
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+ processor. This is strongly recommended when setting trainable attention processors.
+
+ """
+ count = len(self.attn_processors.keys())
+
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ timestep: Optional[torch.LongTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ use_image_num: Optional[int] = 0,
+ return_dict: bool = True,
+ ):
+ """
+        The [`OpenSoraT2V`] forward method.
+
+ Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, frame, height, width)`):
+                Input `hidden_states` (video and/or image latents).
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+ self-attention.
+ timestep ( `torch.LongTensor`, *optional*):
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerNormZero`.
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` applied to the latent self-attention. If `1` the mask
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+ negative values to the attention scores corresponding to "discard" tokens.
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+ above. This bias will be added to the cross-attention scores.
+ return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+ tuple.
+
+ Returns:
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+ `tuple` where the first element is the sample tensor.
+ """
+ batch_size, c, frame, h, w = hidden_states.shape
+ # print('hidden_states.shape', hidden_states.shape)
+ frame = frame - use_image_num # 21-4=17
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+ attention_mask_vid, attention_mask_img = None, None
+ if attention_mask is not None and attention_mask.ndim == 4:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
+ # b, frame+use_image_num, h, w -> a video with images
+ # b, 1, h, w -> only images
+ attention_mask = attention_mask.to(self.dtype)
+ if get_sequence_parallel_state():
+ if npu_config is not None:
+ attention_mask_vid = attention_mask[:, :frame * hccl_info.world_size] # b, frame, h, w
+ attention_mask_img = attention_mask[:, frame * hccl_info.world_size:] # b, use_image_num, h, w
+ else:
+ # print('before attention_mask.shape', attention_mask.shape)
+ attention_mask_vid = attention_mask[:, :frame * nccl_info.world_size] # b, frame, h, w
+ attention_mask_img = attention_mask[:, frame * nccl_info.world_size:] # b, use_image_num, h, w
+ # print('after attention_mask.shape', attention_mask_vid.shape)
+ else:
+ attention_mask_vid = attention_mask[:, :frame] # b, frame, h, w
+ attention_mask_img = attention_mask[:, frame:] # b, use_image_num, h, w
+
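+            # Pad the video mask with copies of its first frame so the length is
+            # divisible by patch_size_t, then max-pool both masks down to patch
+            # resolution: a patch is kept if any latent inside it is kept.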
+ if attention_mask_vid.numel() > 0:
+ attention_mask_vid_first_frame = attention_mask_vid[:, :1].repeat(1, self.patch_size_t-1, 1, 1)
+ attention_mask_vid = torch.cat([attention_mask_vid_first_frame, attention_mask_vid], dim=1)
+ attention_mask_vid = attention_mask_vid.unsqueeze(1) # b 1 t h w
+ attention_mask_vid = F.max_pool3d(attention_mask_vid, kernel_size=(self.patch_size_t, self.patch_size, self.patch_size),
+ stride=(self.patch_size_t, self.patch_size, self.patch_size))
+ attention_mask_vid = rearrange(attention_mask_vid, 'b 1 t h w -> (b 1) 1 (t h w)')
+ if attention_mask_img.numel() > 0:
+ attention_mask_img = F.max_pool2d(attention_mask_img, kernel_size=(self.patch_size, self.patch_size), stride=(self.patch_size, self.patch_size))
+ attention_mask_img = rearrange(attention_mask_img, 'b i h w -> (b i) 1 (h w)')
+
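+            # Convert keep/discard masks into additive attention biases:
+            # keep -> 0.0, discard -> -10000.0 (a large negative value before softmax).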
+ attention_mask_vid = (1 - attention_mask_vid.bool().to(self.dtype)) * -10000.0 if attention_mask_vid.numel() > 0 else None
+ attention_mask_img = (1 - attention_mask_img.bool().to(self.dtype)) * -10000.0 if attention_mask_img.numel() > 0 else None
+
+ if frame == 1 and use_image_num == 0 and not get_sequence_parallel_state():
+ attention_mask_img = attention_mask_vid
+ attention_mask_vid = None
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ # import ipdb;ipdb.set_trace()
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 3:
+ # b, 1+use_image_num, l -> a video with images
+ # b, 1, l -> only images
+ encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
+ in_t = encoder_attention_mask.shape[1]
+ encoder_attention_mask_vid = encoder_attention_mask[:, :in_t-use_image_num] # b, 1, l
+ encoder_attention_mask_vid = rearrange(encoder_attention_mask_vid, 'b 1 l -> (b 1) 1 l') if encoder_attention_mask_vid.numel() > 0 else None
+
+ encoder_attention_mask_img = encoder_attention_mask[:, in_t-use_image_num:] # b, use_image_num, l
+ encoder_attention_mask_img = rearrange(encoder_attention_mask_img, 'b i l -> (b i) 1 l') if encoder_attention_mask_img.numel() > 0 else None
+
+ if frame == 1 and use_image_num == 0 and not get_sequence_parallel_state():
+ encoder_attention_mask_img = encoder_attention_mask_vid
+ encoder_attention_mask_vid = None
+
+ if npu_config is not None and attention_mask_vid is not None:
+ attention_mask_vid = npu_config.get_attention_mask(attention_mask_vid, attention_mask_vid.shape[-1])
+ encoder_attention_mask_vid = npu_config.get_attention_mask(encoder_attention_mask_vid,
+ attention_mask_vid.shape[-2])
+ if npu_config is not None and attention_mask_img is not None:
+ attention_mask_img = npu_config.get_attention_mask(attention_mask_img, attention_mask_img.shape[-1])
+ encoder_attention_mask_img = npu_config.get_attention_mask(encoder_attention_mask_img,
+ attention_mask_img.shape[-2])
+
+
+ # 1. Input
+        frame = ((frame - 1) // self.patch_size_t + 1) if frame % 2 == 1 else frame // self.patch_size_t # patchify t-dim
+ # print('frame', frame)
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+ hidden_states_vid, hidden_states_img, encoder_hidden_states_vid, encoder_hidden_states_img, \
+ timestep_vid, timestep_img, embedded_timestep_vid, embedded_timestep_img = self._operate_on_patched_inputs(
+ hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, batch_size, frame, use_image_num
+ )
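+        # The patched inputs are split into a video stream and an image stream; both
+        # share the same transformer blocks but carry their own masks and timesteps.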
+ # 2. Blocks
+ # import ipdb;ipdb.set_trace()
+ if get_sequence_parallel_state():
+ if hidden_states_vid is not None:
+ hidden_states_vid = rearrange(hidden_states_vid, 'b s h -> s b h', b=batch_size).contiguous()
+ encoder_hidden_states_vid = rearrange(encoder_hidden_states_vid, 'b s h -> s b h',
+ b=batch_size).contiguous()
+ timestep_vid = timestep_vid.view(batch_size, 6, -1).transpose(0, 1).contiguous()
+ # print('timestep_vid', timestep_vid.shape)
+
+
+ for block in self.transformer_blocks:
+ if self.training and self.gradient_checkpointing:
+
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ # import ipdb;ipdb.set_trace()
+ if hidden_states_vid is not None:
+ hidden_states_vid = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states_vid,
+ attention_mask_vid,
+ encoder_hidden_states_vid,
+ encoder_attention_mask_vid,
+ timestep_vid,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ # import ipdb;ipdb.set_trace()
+ if hidden_states_img is not None:
+ hidden_states_img = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states_img,
+ attention_mask_img,
+ encoder_hidden_states_img,
+ encoder_attention_mask_img,
+ timestep_img,
+ cross_attention_kwargs,
+ class_labels,
+ 1,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ if hidden_states_vid is not None:
+ hidden_states_vid = block(
+ hidden_states_vid,
+ attention_mask=attention_mask_vid,
+ encoder_hidden_states=encoder_hidden_states_vid,
+ encoder_attention_mask=encoder_attention_mask_vid,
+ timestep=timestep_vid,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+ if hidden_states_img is not None:
+ hidden_states_img = block(
+ hidden_states_img,
+ attention_mask=attention_mask_img,
+ encoder_hidden_states=encoder_hidden_states_img,
+ encoder_attention_mask=encoder_attention_mask_img,
+ timestep=timestep_img,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=1,
+ height=height,
+ width=width,
+ )
+
+ if get_sequence_parallel_state():
+ if hidden_states_vid is not None:
+ hidden_states_vid = rearrange(hidden_states_vid, 's b h -> b s h', b=batch_size).contiguous()
+
+ # 3. Output
+ output_vid, output_img = None, None
+ if hidden_states_vid is not None:
+ output_vid = self._get_output_for_patched_inputs(
+ hidden_states=hidden_states_vid,
+ timestep=timestep_vid,
+ class_labels=class_labels,
+ embedded_timestep=embedded_timestep_vid,
+ num_frames=frame,
+ height=height,
+ width=width,
+ ) # b c t h w
+ if hidden_states_img is not None:
+ output_img = self._get_output_for_patched_inputs(
+ hidden_states=hidden_states_img,
+ timestep=timestep_img,
+ class_labels=class_labels,
+ embedded_timestep=embedded_timestep_img,
+ num_frames=1,
+ height=height,
+ width=width,
+ ) # b c 1 h w
+ if use_image_num != 0:
+ output_img = rearrange(output_img, '(b i) c 1 h w -> b c i h w', i=use_image_num)
+
+ if output_vid is not None and output_img is not None:
+ output = torch.cat([output_vid, output_img], dim=2)
+ elif output_vid is not None:
+ output = output_vid
+ elif output_img is not None:
+ output = output_img
+
+ if not return_dict:
+ return (output,)
+
+ return Transformer2DModelOutput(sample=output)
+
+
+ def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, batch_size, frame, use_image_num):
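+        # Patch-embed the latents and split them into video / image streams, then
+        # broadcast timestep and caption embeddings to whichever streams exist.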
+ # batch_size = hidden_states.shape[0]
+ hidden_states_vid, hidden_states_img = self.pos_embed(hidden_states.to(self.dtype), frame)
+ timestep_vid, timestep_img = None, None
+ embedded_timestep_vid, embedded_timestep_img = None, None
+ encoder_hidden_states_vid, encoder_hidden_states_img = None, None
+
+ if self.adaln_single is not None:
+ if self.use_additional_conditions and added_cond_kwargs is None:
+ raise ValueError(
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+ )
+ timestep, embedded_timestep = self.adaln_single(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+ if hidden_states_vid is None:
+ timestep_img = timestep
+ embedded_timestep_img = embedded_timestep
+ else:
+ timestep_vid = timestep
+ embedded_timestep_vid = embedded_timestep
+ if hidden_states_img is not None:
+ timestep_img = repeat(timestep, 'b d -> (b i) d', i=use_image_num).contiguous()
+ embedded_timestep_img = repeat(embedded_timestep, 'b d -> (b i) d', i=use_image_num).contiguous()
+
+ if self.caption_projection is not None:
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ if hidden_states_vid is None:
+ encoder_hidden_states_img = rearrange(encoder_hidden_states, 'b 1 l d -> (b 1) l d')
+ else:
+ encoder_hidden_states_vid = rearrange(encoder_hidden_states[:, :1], 'b 1 l d -> (b 1) l d')
+ if hidden_states_img is not None:
+ encoder_hidden_states_img = rearrange(encoder_hidden_states[:, 1:], 'b i l d -> (b i) l d')
+
+
+ return hidden_states_vid, hidden_states_img, encoder_hidden_states_vid, encoder_hidden_states_img, timestep_vid, timestep_img, embedded_timestep_vid, embedded_timestep_img
+
+
+
+ def _get_output_for_patched_inputs(
+ self, hidden_states, timestep, class_labels, embedded_timestep, num_frames, height=None, width=None
+ ):
+ # import ipdb;ipdb.set_trace()
+ if self.config.norm_type != "ada_norm_single":
+ conditioning = self.transformer_blocks[0].norm1.emb(
+ timestep, class_labels, hidden_dtype=self.dtype
+ )
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+ hidden_states = self.proj_out_2(hidden_states)
+ elif self.config.norm_type == "ada_norm_single":
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states)
+ # Modulation
+ hidden_states = hidden_states * (1 + scale) + shift
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states.squeeze(1)
+
+ # unpatchify
+ if self.adaln_single is None:
+ height = width = int(hidden_states.shape[1] ** 0.5)
+ hidden_states = hidden_states.reshape(
+ shape=(-1, num_frames, height, width, self.patch_size_t, self.patch_size, self.patch_size, self.out_channels)
+ )
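+        # nthwopqc -> nctohpwq: channels first, then interleave each patch dim
+        # (o=patch_size_t, p=q=patch_size) with its t/h/w grid position so the
+        # final reshape recovers (b, c, t*pt, h*p, w*p).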
+ hidden_states = torch.einsum("nthwopqc->nctohpwq", hidden_states)
+ output = hidden_states.reshape(
+ shape=(-1, self.out_channels, num_frames * self.patch_size_t, height * self.patch_size, width * self.patch_size)
+ )
+ # import ipdb;ipdb.set_trace()
+ # if output.shape[2] % 2 == 0:
+ # output = output[:, :, 1:]
+ return output
+
+def OpenSoraT2V_S_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=8, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=768, **kwargs)
+
+def OpenSoraT2V_B_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=16, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=1536, **kwargs)
+
+def OpenSoraT2V_L_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=24, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=2304, **kwargs)
+
+OpenSora_models = {
+ "OpenSoraT2V-S/122": OpenSoraT2V_S_122, # 0.3B
+ "OpenSoraT2V-B/122": OpenSoraT2V_B_122, # 1.2B
+ "OpenSoraT2V-L/122": OpenSoraT2V_L_122, # 2.7B
+}
+
+OpenSora_models_class = {
+ "OpenSoraT2V-S/122": OpenSoraT2V,
+ "OpenSoraT2V-B/122": OpenSoraT2V,
+ "OpenSoraT2V-L/122": OpenSoraT2V,
+}
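+
+# A minimal usage sketch (assumptions, not a shipped config: 4-channel latents from
+# a 4x8x8-stride VAE at 480x640 and 29 frames, i.e. sample_size=(60, 80),
+# sample_size_t=8; adjust to your own latent shapes):
+#
+#     model = OpenSora_models["OpenSoraT2V-S/122"](
+#         in_channels=4, out_channels=4,
+#         sample_size=(60, 80), sample_size_t=8,
+#     )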
+
+if __name__ == '__main__':
+ from opensora.models.causalvideovae import ae_stride_config, ae_channel_config
+ from opensora.models.causalvideovae import ae_norm, ae_denorm
+ from opensora.models import CausalVAEModelWrapper
+
+ args = type('args', (),
+ {
+ 'ae': 'CausalVAEModel_D8_4x8x8',
+ 'attention_mode': 'xformers',
+ 'use_rope': True,
+ 'model_max_length': 300,
+ 'max_height': 480,
+ 'max_width': 640,
+ 'num_frames': 29,
+ 'use_image_num': 0,
+ 'compress_kv_factor': 1,
+ 'interpolation_scale_t': 1,
+ 'interpolation_scale_h': 1,
+ 'interpolation_scale_w': 1,
+ "sparse1d": True,
+ "sparse2d": False,
+ "sparse_n": 4,
+ "rank": 64,
+ }
+ )
+ b = 16
+ c = 4
+ cond_c = 4096
+ num_timesteps = 1000
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ num_frames = (args.num_frames - 1) // ae_stride_t + 1
+
+ # device = torch.device('cuda:0')
+ # model = OpenSoraT2V_L_122(in_channels=c,
+ # out_channels=c,
+ # sample_size=latent_size,
+ # sample_size_t=num_frames,
+ # activation_fn="gelu-approximate",
+ # attention_bias=True,
+ # attention_type="default",
+ # double_self_attention=False,
+ # norm_elementwise_affine=False,
+ # norm_eps=1e-06,
+ # norm_num_groups=32,
+ # num_vector_embeds=None,
+ # only_cross_attention=False,
+ # upcast_attention=False,
+ # use_linear_projection=False,
+ # use_additional_conditions=False,
+ # downsampler=None,
+ # interpolation_scale_t=args.interpolation_scale_t,
+ # interpolation_scale_h=args.interpolation_scale_h,
+ # interpolation_scale_w=args.interpolation_scale_w,
+ # use_rope=args.use_rope,
+ # sparse1d=args.sparse1d,
+ # sparse2d=args.sparse2d,
+ # sparse_n=args.sparse_n
+ # ).to(device)
+
+ # try:
+ # path = "/storage/dataset/Open-Sora-Plan-v1.2.0/29x720p/diffusion_pytorch_model.safetensors"
+ # ckpt = torch.load(path, map_location="cpu")
+ # msg = model.load_state_dict(ckpt, strict=True)
+ # print(msg)
+ # except Exception as e:
+ # print(e)
+ # print(model)
+ # print(f'{sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B')
+ # # import sys;sys.exit()
+ # x = torch.randn(b, c, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w).to(device)
+ # cond = torch.randn(b, 1+args.use_image_num, args.model_max_length, cond_c).to(device)
+ # attn_mask = torch.randint(0, 2, (b, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w)).to(device) # B L or B 1+num_images L
+ # cond_mask = torch.randint(0, 2, (b, 1+args.use_image_num, args.model_max_length)).to(device) # B L or B 1+num_images L
+ # timestep = torch.randint(0, 1000, (b,), device=device)
+ # model_kwargs = dict(hidden_states=x, encoder_hidden_states=cond, attention_mask=attn_mask,
+ # encoder_attention_mask=cond_mask, use_image_num=args.use_image_num, timestep=timestep)
+ # with torch.no_grad():
+ # output = model(**model_kwargs)
+ # print(output[0].shape)
+
+
+ from peft import LoraConfig, PeftModel, get_peft_model
+ from opensora.utils.ema_utils import EMAModel
+ lora_config = LoraConfig(
+ r=args.rank,
+ lora_alpha=args.rank,
+ init_lora_weights="gaussian",
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ )
+ # model_merge = model.merge_and_unload()
+ # model = get_peft_model(model, lora_config)
+
+ # import ipdb;ipdb.set_trace()
+ def check_ema_allclose(model_base, lora_path, ema_path):
+ ema_model = EMAModel.from_pretrained(ema_path, OpenSoraT2V, lora_config, model_base)
+ base_model = OpenSoraT2V.from_pretrained(model_base)
+ model_lora = PeftModel.from_pretrained(base_model, lora_path)
+ for p, p_ in zip(model_lora.parameters(), ema_model.shadow_params):
+ assert torch.allclose(p, p_)
+ merge_lora = model_lora.merge_and_unload()
+ res = 0
+ for p, p_ in zip(merge_lora.parameters(), base_model.parameters()):
+ res += int(torch.allclose(p, p_))
+ print(f'total {len(list(merge_lora.parameters()))}, allclose {res}')
+ def check_allclose(model_base, lora_path, model_path):
+        model = OpenSoraT2V.from_pretrained(model_path) # model with merged weights
+        base_model = OpenSoraT2V.from_pretrained(model_base) # base model
+        model_lora = PeftModel.from_pretrained(base_model, lora_path) # base model + LoRA
+ merge_lora = model_lora.merge_and_unload()
+ res = 0
+ for p, p_ in zip(model.parameters(), merge_lora.parameters()):
+ res += int(torch.allclose(p, p_))
+ print(f'total {len(list(merge_lora.parameters()))}, allclose {res}')
+ res = 0
+ for p, p_ in zip(model.parameters(), base_model.parameters()):
+ res += int(torch.allclose(p, p_))
+ print(f'total {len(list(merge_lora.parameters()))}, allclose {res}')
+ import ipdb;ipdb.set_trace()
+
+ model_base = "/storage/dataset/Open-Sora-Plan-v1.2.0/29x720p"
+ lora_path = "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/debug_lora/checkpoint-20/model"
+ model_path = "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/debug_lora/checkpoint-20/model"
+ ema_path = "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/debug_lora/checkpoint-20/model_ema"
+ check_allclose(model_base, lora_path, model_path)
+ import ipdb;ipdb.set_trace()
+
+ print()
diff --git a/opensora/models/diffusion/opensora/modules.py b/opensora/models/diffusion/opensora/modules.py
new file mode 100644
index 000000000..9fd6a1f5d
--- /dev/null
+++ b/opensora/models/diffusion/opensora/modules.py
@@ -0,0 +1,1548 @@
+import re
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+
+import diffusers
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.attention import FeedForward, GatedSelfAttentionDense
+from diffusers.models.attention_processor import Attention as Attention_
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding
+from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+from .rope import PositionGetter3D, RoPE3D
+try:
+    import torch_npu
+    from opensora.npu_config import npu_config, set_run_dtype
+    from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+    from opensora.acceleration.communications import all_to_all_SBH
+except ImportError:
+    torch_npu = None
+    npu_config = None
+    set_run_dtype = None
+    from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
+    from opensora.utils.communications import all_to_all_SBH
+logger = logging.get_logger(__name__)
+
+def get_3d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+    grid_size: (t, h, w) tuple of grid sizes. return: pos_embed: [t*h*w, embed_dim] or
+    [extra_tokens + t*h*w, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+ grid_t = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_h = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid_w = np.arange(grid_size[2], dtype=np.float32) / (grid_size[2] / base_size[2]) / interpolation_scale[2]
+ grid = np.meshgrid(grid_w, grid_h, grid_t) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([3, 1, grid_size[2], grid_size[1], grid_size[0]])
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
+ # import ipdb;ipdb.set_trace()
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 3 != 0:
+ raise ValueError("embed_dim must be divisible by 3")
+
+ # import ipdb;ipdb.set_trace()
+ # use 1/3 of dimensions to encode grid_t/h/w
+ emb_t = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0]) # (T*H*W, D/3)
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1]) # (T*H*W, D/3)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2]) # (T*H*W, D/3)
+
+ emb = np.concatenate([emb_t, emb_h, emb_w], axis=1) # (T*H*W, D)
+ return emb
+
+
+def get_2d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+    grid_size: (h, w) tuple of grid sizes. return: pos_embed: [h*w, embed_dim] or
+    [extra_tokens + h*w, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+    # use half of the dimensions to encode grid_h/w
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+def get_1d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+ grid_size: int of the grid return: pos_embed: [grid_size, embed_dim] or
+ [1+grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid = np.arange(grid_size, dtype=np.float32) / (grid_size / base_size) / interpolation_scale
+ pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, grid) # (H*W, D/2)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
+ """
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
+
+class PatchEmbed2D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ # assert num_frames == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv2d(
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
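+ # Odd frame counts use ceil division so the leading frame keeps its own temporal
+ # patch; even counts divide exactly by patch_size_t.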
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = self.num_frames
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # latent: (b, c, t, h, w)
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+ if self.num_frames != num_frames:
+ if get_sequence_parallel_state():
+ # the hccl (NPU) and nccl (GPU) paths differ only in which comm-info object
+ # supplies rank and world_size
+ comm_info = hccl_info if npu_config is not None else nccl_info
+ sp_size = comm_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = comm_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+
+ else:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent, image_latent = latent[:, :num_frames], latent[:, num_frames:]
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype) if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = (image_latent + temp_pos_embed[:, :1]).to(image_latent.dtype) if image_latent is not None and image_latent.numel() > 0 else None
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c') if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = rearrange(image_latent, 'b t n c -> (b t) n c') if image_latent is not None and image_latent.numel() > 0 else None
+
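+ # With a single frame and no sequence parallelism, treat the whole batch as images.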
+ if num_frames == 1 and image_latent is None and not get_sequence_parallel_state():
+ image_latent = video_latent
+ video_latent = None
+ return video_latent, image_latent
+
+
+
+class OverlapPatchEmbed3D(nn.Module):
+ """3D video-to-patch embedding (Conv3d) with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv3d(
+ in_channels, embed_dim, kernel_size=(patch_size_t, patch_size, patch_size), stride=(patch_size_t, patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = self.num_frames
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # latent: (b, c, t, h, w)
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = rearrange(latent, 'b c t h w -> (b t) (h w) c') # B C T H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+ if self.num_frames != num_frames:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent, image_latent = latent[:, :num_frames], latent[:, num_frames:]
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype) if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = (image_latent + temp_pos_embed[:, :1]).to(image_latent.dtype) if image_latent is not None and image_latent.numel() > 0 else None
+
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c') if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = rearrange(image_latent, 'b t n c -> (b t) n c') if image_latent is not None and image_latent.numel() > 0 else None
+
+ if num_frames == 1 and image_latent is None:
+ image_latent = video_latent
+ video_latent = None
+ return video_latent, image_latent
+
+
+
+class OverlapPatchEmbed2D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ assert patch_size_t == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv2d(
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = self.num_frames
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # latent: (b, c, t, h, w)
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+ if self.num_frames != num_frames:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent, image_latent = latent[:, :num_frames], latent[:, num_frames:]
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype) if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = (image_latent + temp_pos_embed[:, :1]).to(image_latent.dtype) if image_latent is not None and image_latent.numel() > 0 else None
+
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c') if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = rearrange(image_latent, 'b t n c -> (b t) n c') if image_latent is not None and image_latent.numel() > 0 else None
+
+ if num_frames == 1 and image_latent is None:
+ image_latent = video_latent
+ video_latent = None
+ return video_latent, image_latent
+
+class Attention(Attention_):
+ def __init__(self, downsampler, attention_mode, use_rope, interpolation_scale_thw,
+ sparse1d, sparse2d, sparse_n, sparse_group, is_cross_attn, **kwargs):
+ processor = AttnProcessor2_0(attention_mode=attention_mode, use_rope=use_rope, interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d, sparse2d=sparse2d, sparse_n=sparse_n, sparse_group=sparse_group, is_cross_attn=is_cross_attn)
+ super().__init__(processor=processor, **kwargs)
+ self.downsampler = None # accepted for config compatibility but disabled here
+
+ def prepare_attention_mask(
+ self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
+ ) -> torch.Tensor:
+ r"""
+ Prepare the attention mask for the attention computation.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ The attention mask to prepare.
+ target_length (`int`):
+ The target length of the attention mask. This is the length of the attention mask after padding.
+ batch_size (`int`):
+ The batch size, which is used to repeat the attention mask.
+ out_dim (`int`, *optional*, defaults to `3`):
+ The output dimension of the attention mask. Can be either `3` or `4`.
+
+ Returns:
+ `torch.Tensor`: The prepared attention mask.
+ """
+ head_size = self.heads
+ if get_sequence_parallel_state():
+ head_size = head_size // nccl_info.world_size
+ if attention_mask is None:
+ return attention_mask
+
+ current_length: int = attention_mask.shape[-1]
+ if current_length != target_length:
+ if attention_mask.device.type == "mps":
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+ # Instead, we can manually construct the padding tensor.
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
+ else:
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
+ # remaining_length: int = target_length - current_length
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+ if out_dim == 3:
+ if attention_mask.shape[0] < batch_size * head_size:
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+ elif out_dim == 4:
+ attention_mask = attention_mask.unsqueeze(1)
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+ return attention_mask
+
+class DownSampler3d(nn.Module):
+ def __init__(self, *args, **kwargs):
+ '''Required kwargs: down_factor, down_shortcut'''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv3d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ if npu_config is None:
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.layer, x, x_dtype) + (x if self.down_shortcut else 0)
+
+ self.t = t//self.down_factor[0]
+ self.h = h//self.down_factor[1]
+ self.w = w//self.down_factor[2]
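+ # Space-time-to-batch: each (dt, dh, dw) offset becomes its own batch entry, so
+ # attention runs on a token grid subsampled by dt*dh*dw.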
+ x = rearrange(x, 'b d (t dt) (h dh) (w dw) -> (b dt dh dw) (t h w) d',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> b 1 t h w', t=t, h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (t dt) (h dh) (w dw) -> (b dt dh dw) 1 (t h w)',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b dt dh dw) (t h w) d -> b (t dt h dh w dw) d',
+ t=t, h=h, w=w,
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x
+
+
+class DownSampler2d(nn.Module):
+ def __init__(self, *args, **kwargs):
+ '''Required kwargs: down_factor, down_shortcut'''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv2d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+
+ self.t = 1
+ self.h = h//self.down_factor[0]
+ self.w = w//self.down_factor[1]
+
+ x = rearrange(x, 'b d (h dh) (w dw) -> (b dh dw) (h w) d',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> (b t) 1 h w', h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (h dh) (w dw) -> (b dh dw) 1 (h w)',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b t dh dw) (h w) d -> b (t h dh w dw) d',
+ t=t, h=h, w=w,
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x
+
+class AttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+ """
+
+ def __init__(self, attention_mode='xformers', use_rope=False, interpolation_scale_thw=(1, 1, 1),
+ sparse1d=False, sparse2d=False, sparse_n=2, sparse_group=False, is_cross_attn=True):
+ self.sparse1d = sparse1d
+ self.sparse2d = sparse2d
+ self.sparse_n = sparse_n
+ self.sparse_group = sparse_group
+ self.is_cross_attn = is_cross_attn
+ self.use_rope = use_rope
+ self.interpolation_scale_thw = interpolation_scale_thw
+ if self.use_rope:
+ self._init_rope(interpolation_scale_thw)
+ self.attention_mode = attention_mode
+ if not hasattr(F, "scaled_dot_product_attention"):
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+ assert not (self.sparse1d and self.sparse2d)
+
+ def _init_rope(self, interpolation_scale_thw):
+ self.rope = RoPE3D(interpolation_scale_thw=interpolation_scale_thw)
+ self.position_getter = PositionGetter3D()
+
+ def _sparse_1d(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ attention_mask: b nheads 1 thw
+ """
+ l = x.shape[-2]
+ assert l == frame*height*width
+ assert attention_mask is None or attention_mask.shape[2] == 1
+ # pad to a multiple of sparse_n * sparse_n (no padding when already divisible)
+ pad_len = -l % (self.sparse_n * self.sparse_n)
+ if pad_len != 0:
+ x = F.pad(x, (0, 0, 0, pad_len))
+ if attention_mask is not None and not self.is_cross_attn:
+ # padded tokens are hidden with a large negative bias
+ attention_mask = F.pad(attention_mask, (0, pad_len, 0, 0), value=-9980.0)
+ if not self.sparse_group:
+ x = rearrange(x, 'b h (g k) d -> (b k) h g d', k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (g k) -> (b k) h 1 g', k=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b h (n m k) d -> (b m) h (n k) d', m=self.sparse_n, k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (n m k) -> (b m) h 1 (n k)', m=self.sparse_n, k=self.sparse_n)
+ if self.is_cross_attn:
+ # tokens were rearranged batch-major '(b k)', so the shared text mask is
+ # repeat_interleaved along batch to line up with each sparse_n replica
+ attention_mask = torch.repeat_interleave(attention_mask, self.sparse_n, dim=0)
+ return x, attention_mask, pad_len
+
+ def _reverse_sparse_1d(self, x, frame, height, width, pad_len):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ assert x.shape[2] == (frame*height*width+pad_len) // self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(b k) h g d -> b h (g k) d', k=self.sparse_n)
+ else:
+ x = rearrange(x, '(b m) h (n k) d -> b h (n m k) d', m=self.sparse_n, k=self.sparse_n)
+ x = x[:, :, :frame*height*width, :]
+ return x
+
+ def _sparse_1d_kv(self, x):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ x = torch.repeat_interleave(x, self.sparse_n, dim=0)
+ return x
+
+ def _sparse_2d(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ attention_mask: b nheads 1 thw
+ """
+ d = x.shape[-1]
+ x = rearrange(x, 'b h (T H W) d -> b h T H W d', T=frame, H=height, W=width)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (T H W) -> b h T H W', T=frame, H=height, W=width)
+ # pad H and W to multiples of sparse_n * sparse_n (no padding when already divisible)
+ pad_height = -height % (self.sparse_n * self.sparse_n)
+ pad_width = -width % (self.sparse_n * self.sparse_n)
+ if pad_height != 0 or pad_width != 0:
+ x = rearrange(x, 'b h T H W d -> b (h d) T H W')
+ x = F.pad(x, (0, pad_width, 0, pad_height, 0, 0))
+ x = rearrange(x, 'b (h d) T H W -> b h T H W d', d=d)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = F.pad(attention_mask, (0, pad_width, 0, pad_height, 0, 0), value=-9500.0)
+
+ if not self.sparse_group:
+ x = rearrange(x, 'b h t (g1 k1) (g2 k2) d -> (k1 k2 b) h (t g1 g2) d',
+ k1=self.sparse_n, k2=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h t (g1 k1) (g2 k2) -> (k1 k2 b) h 1 (t g1 g2)',
+ k1=self.sparse_n, k2=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b h t (n1 m1 k1) (n2 m2 k2) d -> (m1 m2 b) h (t n1 n2 k1 k2) d',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h t (n1 m1 k1) (n2 m2 k2) -> (m1 m2 b) h 1 (t n1 n2 k1 k2)',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n)
+
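+ # For cross-attn the text mask is tiled k-major to match the '(k1 k2 b)' replica order.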
+ if self.is_cross_attn:
+ attention_mask = attention_mask.repeat(self.sparse_n*self.sparse_n, 1, 1, 1)
+ return x, attention_mask, pad_height, pad_width
+
+ def _reverse_sparse_2d(self, x, frame, height, width, pad_height, pad_width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ assert x.shape[2] == frame*(height+pad_height)*(width+pad_width)//self.sparse_n//self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(k1 k2 b) h (t g1 g2) d -> b h t (g1 k1) (g2 k2) d',
+ k1=self.sparse_n, k2=self.sparse_n,
+ g1=(height+pad_height)//self.sparse_n, g2=(width+pad_width)//self.sparse_n)
+ else:
+ x = rearrange(x, '(m1 m2 b) h (t n1 n2 k1 k2) d -> b h t (n1 m1 k1) (n2 m2 k2) d',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n,
+ n1=(height+pad_height)//self.sparse_n//self.sparse_n, n2=(width+pad_width)//self.sparse_n//self.sparse_n)
+ x = x[:, :, :, :height, :width, :]
+ x = rearrange(x, 'b h T H W d -> b h (T H W) d')
+ return x
+
+
+ def _sparse_2d_kv(self, x):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ x = repeat(x, 'b h s d -> (k1 k2 b) h s d', k1=self.sparse_n, k2=self.sparse_n)
+ return x
+
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ frame: int = 8,
+ height: int = 16,
+ width: int = 16,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
+ if attn.downsampler is not None:
+ hidden_states, attention_mask = attn.downsampler(hidden_states, attention_mask, t=frame, h=height, w=width)
+ frame, height, width = attn.downsampler.t, attn.downsampler.h, attn.downsampler.w
+
+ residual = hidden_states
+
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+ if get_sequence_parallel_state():
+ # sequence-parallel tensors are laid out (seq, batch, dim)
+ sequence_length, batch_size, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ else:
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ if attention_mask is not None:
+ if npu_config is None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length * nccl_info.world_size, batch_size)
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ if get_sequence_parallel_state():
+ attention_mask = attention_mask.view(batch_size, attn.heads // nccl_info.world_size, -1, attention_mask.shape[-1])
+ else:
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+ else:
+ attention_mask = attention_mask.view(batch_size, 1, -1, attention_mask.shape[-1])
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ if npu_config is not None and npu_config.on_npu:
+ if get_sequence_parallel_state():
+ query = query.view(-1, attn.heads, head_dim) # [s // sp, b, h * d] -> [s // sp * b, h, d]
+ key = key.view(-1, attn.heads, head_dim)
+ value = value.view(-1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ h_size = attn.heads * head_dim
+ sp_size = hccl_info.world_size
+ h_size_sp = h_size // sp_size
+ # apply all_to_all to gather sequence and split attention heads [s // sp * b, h, d] -> [s * b, h // sp, d]
+ query = all_to_all_SBH(query, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ key = all_to_all_SBH(key, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ value = all_to_all_SBH(value, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ if self.use_rope:
+ query = query.view(-1, batch_size, attn.heads // sp_size, head_dim)
+ key = key.view(-1, batch_size, attn.heads // sp_size, head_dim)
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame * sp_size, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ query = query.view(-1, batch_size, h_size_sp)
+ key = key.view(-1, batch_size, h_size_sp)
+ value = value.view(-1, batch_size, h_size_sp)
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "SBH",
+ head_dim, attn.heads // sp_size)
+
+ hidden_states = hidden_states.view(-1, attn.heads // sp_size, head_dim)
+
+ # [s * b, h // sp, d] -> [s // sp * b, h, d] -> [s // sp, b, h * d]
+ hidden_states = all_to_all_SBH(hidden_states, scatter_dim=0, gather_dim=1).view(-1, batch_size, h_size)
+ else:
+ if npu_config.enable_FA and query.dtype == torch.float32:
+ dtype = torch.bfloat16
+ else:
+ dtype = None
+
+ query = query.view(batch_size, -1, attn.heads, head_dim)
+ key = key.view(batch_size, -1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ query = query.view(batch_size, -1, attn.heads * head_dim)
+ key = key.view(batch_size, -1, attn.heads * head_dim)
+
+ with set_run_dtype(query, dtype):
+ query, key, value = npu_config.set_current_run_dtype([query, key, value])
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "BSH",
+ head_dim, attn.heads)
+
+ hidden_states = npu_config.restore_dtype(hidden_states)
+ else:
+ if get_sequence_parallel_state():
+ query = query.reshape(-1, attn.heads, head_dim) # [s // sp, b, h * d] -> [s // sp * b, h, d]
+ key = key.reshape(-1, attn.heads, head_dim)
+ value = value.reshape(-1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ h_size = attn.heads * head_dim
+ sp_size = nccl_info.world_size
+ h_size_sp = h_size // sp_size
+ # apply all_to_all to gather sequence and split attention heads [s // sp * b, h, d] -> [s * b, h // sp, d]
+ query = all_to_all_SBH(query, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ key = all_to_all_SBH(key, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ value = all_to_all_SBH(value, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ query = query.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ key = key.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ value = value.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame * sp_size, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ query = rearrange(query, 's b h d -> b h s d')
+ key = rearrange(key, 's b h d -> b h s d')
+ value = rearrange(value, 's b h d -> b h s d')
+
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d(query, attention_mask, frame * sp_size, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv(key)
+ value = self._sparse_1d_kv(value)
+ else:
+ key, _, pad_len = self._sparse_1d(key, None, frame * sp_size, height, width)
+ value, _, pad_len = self._sparse_1d(value, None, frame * sp_size, height, width)
+
+ elif self.sparse2d:
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame * sp_size, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame * sp_size, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame * sp_size, height, width)
+
+ # Mask entries are 0 (visible) or a large negative bias (hidden); if every entry
+ # is 0 the mask is all-visible, so drop it and run dense SDPA.
+ if attention_mask is None or not torch.any(attention_mask.bool()): # 0 means visible
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ if self.attention_mode == 'flash':
+ assert attention_mask is None, 'flash attention does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Found attention_mode: {self.attention_mode}')
+
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d(hidden_states, frame * sp_size, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame * sp_size, height, width, pad_height, pad_width)
+
+ hidden_states = rearrange(hidden_states, 'b h s d -> s b h d')
+
+ hidden_states = hidden_states.reshape(-1, attn.heads // sp_size, head_dim)
+ hidden_states = hidden_states.contiguous()
+ # [s * b, h // sp, d] -> [s // sp * b, h, d] -> [s // sp, b, h * d]
+ hidden_states = all_to_all_SBH(hidden_states, scatter_dim=0, gather_dim=1).reshape(-1, batch_size, h_size)
+ else:
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ # qk norm
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d(query, attention_mask, frame, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv(key)
+ value = self._sparse_1d_kv(value)
+ else:
+ key, _, pad_len = self._sparse_1d(key, None, frame, height, width)
+ value, _, pad_len = self._sparse_1d(value, None, frame, height, width)
+
+ elif self.sparse2d:
+ # import ipdb;ipdb.set_trace()
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame, height, width)
+ # Mask entries are 0 (visible) or a large negative bias (hidden); if every entry
+ # is 0 the mask is all-visible, so drop it and run dense SDPA.
+ if attention_mask is None or not torch.any(attention_mask.bool()): # 0 means visible
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ if self.attention_mode == 'flash':
+ assert attention_mask is None, 'flash attention does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Found attention_mode: {self.attention_mode}')
+
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d(hidden_states, frame, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame, height, width, pad_height, pad_width)
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+ hidden_states = hidden_states.to(query.dtype)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ if attn.downsampler is not None:
+ hidden_states = attn.downsampler.reverse(hidden_states, t=frame, h=height, w=width)
+ return hidden_states
+
+
+
+class FeedForward_Conv3d(nn.Module):
+ def __init__(self, downsampler, dim, hidden_features, bias=True):
+ super().__init__()
+
+ self.bias = bias
+
+ self.project_in = nn.Linear(dim, hidden_features, bias=bias)
+
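+ # Three depthwise 3D convs (5^3, 3^3, 1^3 kernels); forward() sums their outputs
+ # to mix local context at multiple scales.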
+ self.dwconv = nn.ModuleList([
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(5, 5, 5), stride=1, padding=(2, 2, 2), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(3, 3, 3), stride=1, padding=(1, 1, 1), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(1, 1, 1), stride=1, padding=(0, 0, 0), dilation=1,
+ groups=hidden_features, bias=bias)
+ ])
+
+ self.project_out = nn.Linear(hidden_features, dim, bias=bias)
+
+
+ def forward(self, x, t, h, w):
+ if npu_config is None:
+ x = self.project_in(x)
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + module(x)
+ out = rearrange(out, 'b d t h w -> b (t h w) d', t=t, h=h, w=w)
+ x = self.project_out(out)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.project_in, x, npu_config.replaced_type)
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + npu_config.run_conv3d(module, x, npu_config.replaced_type)
+ out = rearrange(out, 'b d t h w -> b (t h w) d', t=t, h=h, w=w)
+ x = npu_config.run_conv3d(self.project_out, out, x_dtype)
+ return x
+
+
+class FeedForward_Conv2d(nn.Module):
+ def __init__(self, downsampler, dim, hidden_features, bias=True):
+ super().__init__()
+
+ self.bias = bias
+
+ self.project_in = nn.Linear(dim, hidden_features, bias=bias)
+
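+ # Three depthwise 2D convs (5x5, 3x3, 1x1 kernels); forward() sums their outputs
+ # to mix local context at multiple scales.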
+ self.dwconv = nn.ModuleList([
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(5, 5), stride=1, padding=(2, 2), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(3, 3), stride=1, padding=(1, 1), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(1, 1), stride=1, padding=(0, 0), dilation=1,
+ groups=hidden_features, bias=bias)
+ ])
+
+ self.project_out = nn.Linear(hidden_features, dim, bias=bias)
+
+
+ def forward(self, x, t, h, w):
+ x = self.project_in(x)
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + module(x)
+ out = rearrange(out, '(b t) d h w -> b (t h w) d', t=t, h=h, w=w)
+ x = self.project_out(out)
+ return x
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+ r"""
+ A basic Transformer block.
+
+ Parameters:
+ dim (`int`): The number of channels in the input and output.
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm (`int`, *optional*):
+ The number of diffusion steps used during training. See `Transformer2DModel`.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Configure if the attentions should contain a bias parameter.
+ only_cross_attention (`bool`, *optional*):
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
+ double_self_attention (`bool`, *optional*):
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
+ upcast_attention (`bool`, *optional*):
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+ Whether to use learnable elementwise affine parameters for normalization.
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+ final_dropout (`bool` *optional*, defaults to False):
+ Whether to apply a final dropout after the last feed-forward layer.
+ attention_type (`str`, *optional*, defaults to `"default"`):
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+ positional_embeddings (`str`, *optional*, defaults to `None`):
+ The type of positional embeddings to apply to.
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
+ The maximum number of positional embeddings to apply.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ attention_bias: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_elementwise_affine: bool = True,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_eps: float = 1e-5,
+ final_dropout: bool = False,
+ attention_type: str = "default",
+ positional_embeddings: Optional[str] = None,
+ num_positional_embeddings: Optional[int] = None,
+ ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+ ada_norm_bias: Optional[int] = None,
+ ff_inner_dim: Optional[int] = None,
+ ff_bias: bool = True,
+ attention_out_bias: bool = True,
+ attention_mode: str = "xformers",
+ downsampler: str = None,
+ use_rope: bool = False,
+ interpolation_scale_thw: Tuple[int] = (1, 1, 1),
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ sparse_group: bool = False,
+ ):
+ super().__init__()
+ self.only_cross_attention = only_cross_attention
+ self.downsampler = downsampler
+
+ # We keep these boolean flags for backward-compatibility.
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+ self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+ self.use_layer_norm = norm_type == "layer_norm"
+ self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+ )
+
+ self.norm_type = norm_type
+ self.num_embeds_ada_norm = num_embeds_ada_norm
+
+ if positional_embeddings and (num_positional_embeddings is None):
+ raise ValueError(
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+ )
+
+ if positional_embeddings == "sinusoidal":
+ self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+ else:
+ self.pos_embed = None
+
+ # Define 3 blocks. Each block has its own normalization layer.
+ # 1. Self-Attn
+ if norm_type == "ada_norm":
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_zero":
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm1 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+ self.attn1 = Attention(
+ query_dim=dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ attention_mode=attention_mode,
+ downsampler=downsampler,
+ use_rope=use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ sparse_group=sparse_group,
+ is_cross_attn=False,
+ )
+
+ # 2. Cross-Attn
+ if cross_attention_dim is not None or double_self_attention:
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+ # the second cross attention block.
+ if norm_type == "ada_norm":
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm2 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+ self.attn2 = Attention(
+ query_dim=dim,
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ attention_mode=attention_mode,
+ downsampler=False,
+ use_rope=False,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ sparse_group=sparse_group,
+ is_cross_attn=True,
+ ) # is self-attn if encoder_hidden_states is none
+ else:
+ self.norm2 = None
+ self.attn2 = None
+
+ # 3. Feed-forward
+ if norm_type == "ada_norm_continuous":
+ self.norm3 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "layer_norm",
+ )
+
+ elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]:
+ self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+ elif norm_type == "layer_norm_i2vgen":
+ self.norm3 = None
+
+ if downsampler:
+ # e.g. 'k122' -> ['1', '2', '2']; only the 2D feed-forward path is enabled here
+ downsampler_ker_size = list(re.search(r'k(\d{2,3})', downsampler).group(1))
+ self.ff = FeedForward_Conv2d(
+ downsampler,
+ dim,
+ 2 * dim,
+ bias=ff_bias,
+ )
+ else:
+ self.ff = FeedForward(
+ dim,
+ dropout=dropout,
+ activation_fn=activation_fn,
+ final_dropout=final_dropout,
+ inner_dim=ff_inner_dim,
+ bias=ff_bias,
+ )
+
+ # 4. Fuser
+ if attention_type == "gated" or attention_type == "gated-text-image":
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+ # 5. Scale-shift for PixArt-Alpha.
+ if norm_type == "ada_norm_single":
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+ # let chunk size default to None
+ self._chunk_size = None
+ self._chunk_dim = 0
+
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+ # Sets chunk feed-forward
+ self._chunk_size = chunk_size
+ self._chunk_dim = dim
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ frame: int = None,
+ height: int = None,
+ width: int = None,
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+ ) -> torch.FloatTensor:
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+ # Notice that normalization is always applied before the real computation in the following blocks.
+ # 0. Self-Attention
+ batch_size = hidden_states.shape[0]
+
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm1(hidden_states, timestep)
+ elif self.norm_type == "ada_norm_zero":
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm1(hidden_states)
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif self.norm_type == "ada_norm_single":
+ if get_sequence_parallel_state():
+ # under sequence parallelism hidden_states is (seq, batch, dim)
+ batch_size = hidden_states.shape[1]
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[:, None] + timestep.reshape(6, batch_size, -1)
+ ).chunk(6, dim=0)
+ else:
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+ ).chunk(6, dim=1)
+ norm_hidden_states = self.norm1(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+ else:
+ raise ValueError("Incorrect norm used")
+
+ if self.pos_embed is not None:
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ # 1. Prepare GLIGEN inputs
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ if self.norm_type == "ada_norm_zero":
+ attn_output = gate_msa.unsqueeze(1) * attn_output
+ elif self.norm_type == "ada_norm_single":
+ attn_output = gate_msa * attn_output
+
+ hidden_states = attn_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ # 1.2 GLIGEN Control
+ if gligen_kwargs is not None:
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+ # 3. Cross-Attention
+ if self.attn2 is not None:
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm2(hidden_states, timestep)
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm2(hidden_states)
+ elif self.norm_type == "ada_norm_single":
+ # For PixArt norm2 isn't applied here:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+ norm_hidden_states = hidden_states
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ else:
+ raise ValueError("Incorrect norm")
+
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ attn_output = self.attn2(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ hidden_states = attn_output + hidden_states
+
+ # 4. Feed-forward
+ # i2vgen doesn't have this norm
+ if self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif not self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm3(hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+ if self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm2(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+ # NOTE: chunked feed-forward (see set_chunk_feed_forward) is currently disabled here.
+
+ if self.downsampler:
+ ff_output = self.ff(norm_hidden_states, t=frame, h=height, w=width)
+ else:
+ ff_output = self.ff(norm_hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
+ elif self.norm_type == "ada_norm_single":
+ ff_output = gate_mlp * ff_output
+
+ hidden_states = ff_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ return hidden_states
diff --git a/opensora/models/diffusion/opensora/rope.py b/opensora/models/diffusion/opensora/rope.py
new file mode 100644
index 000000000..c127727a0
--- /dev/null
+++ b/opensora/models/diffusion/opensora/rope.py
@@ -0,0 +1,98 @@
+import torch
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config, set_run_dtype
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ from opensora.utils.parallel_states import get_sequence_parallel_state
+
+class PositionGetter3D(object):
+ """Return (t, y, x) positions of patches, cached per (b, t, h, w)."""
+
+ def __init__(self):
+ self.cache_positions = {}
+
+ def __call__(self, b, t, h, w, device):
+ if (b, t, h, w) not in self.cache_positions:
+ x = torch.arange(w, device=device)
+ y = torch.arange(h, device=device)
+ z = torch.arange(t, device=device)
+ pos = torch.cartesian_prod(z, y, x)
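+ # cartesian_prod enumerates (t, y, x) in t-major, then y, then x order,
+ # matching the flattened (t h w) token layout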
+ if get_sequence_parallel_state():
+ pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, -1, 1).contiguous().expand(3, -1, b).clone()
+ else:
+ pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, 1, -1).contiguous().expand(3, b, -1).clone()
+ poses = (pos[0].contiguous(), pos[1].contiguous(), pos[2].contiguous())
+ max_poses = (int(poses[0].max()), int(poses[1].max()), int(poses[2].max()))
+
+ self.cache_positions[b, t, h, w] = (poses, max_poses)
+ pos = self.cache_positions[b, t, h, w]
+
+ return pos
+
+
+class RoPE3D(torch.nn.Module):
+
+ def __init__(self, freq=10000.0, F0=1.0, interpolation_scale_thw=(1, 1, 1)):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.interpolation_scale_t = interpolation_scale_thw[0]
+ self.interpolation_scale_h = interpolation_scale_thw[1]
+ self.interpolation_scale_w = interpolation_scale_thw[2]
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype, interpolation_scale=1):
+ if (D, seq_len, device, dtype) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) / interpolation_scale
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D, seq_len, device, dtype] = (cos, sin)
+ return self.cache[D, seq_len, device, dtype]
+
+ @staticmethod
+ def rotate_half(x):
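+        # Editorial note: e.g. x = [a, b, c, d] -> [-c, -d, a, b]; each feature
+        # is paired with its half-offset partner, so the cos/sin modulation in
+        # apply_rope1d acts as a 2D rotation on every pair.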
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim == 2
+ if not get_sequence_parallel_state():
+ # for (batch_size x nheads x ntokens x dim)
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ else:
+ # for (batch_size x ntokens x nheads x dim)
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, :, None, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, :, None, :]
+
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 3 (t, y and x position of each token)
+ output:
+            * tokens after applying RoPE3D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3) % 3 == 0, "number of dimensions should be a multiple of three"
+ D = tokens.size(3) // 3
+ poses, max_poses = positions
+        assert len(poses) == 3 and poses[0].ndim == 2  # three axes, each (batch, ntokens)
+ cos_t, sin_t = self.get_cos_sin(D, max_poses[0] + 1, tokens.device, tokens.dtype, self.interpolation_scale_t)
+ cos_y, sin_y = self.get_cos_sin(D, max_poses[1] + 1, tokens.device, tokens.dtype, self.interpolation_scale_h)
+ cos_x, sin_x = self.get_cos_sin(D, max_poses[2] + 1, tokens.device, tokens.dtype, self.interpolation_scale_w)
+ # split features into three along the feature dimension, and apply rope1d on each half
+ t, y, x = tokens.chunk(3, dim=-1)
+ t = self.apply_rope1d(t, poses[0], cos_t, sin_t)
+ y = self.apply_rope1d(y, poses[1], cos_y, sin_y)
+ x = self.apply_rope1d(x, poses[2], cos_x, sin_x)
+ tokens = torch.cat((t, y, x), dim=-1)
+ return tokens
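+
+if __name__ == "__main__":
+    # Hedged sanity check (an editorial sketch, not part of the training code):
+    # assumes a single-device run where get_sequence_parallel_state() is False.
+    # The per-head dim must be divisible by 3 (one third per t/y/x axis) and
+    # each third must be even so rotate_half can split it in half.
+    pos_getter = PositionGetter3D()
+    rope = RoPE3D(interpolation_scale_thw=(1, 1, 1))
+    b, heads, t, h, w, d = 2, 8, 4, 6, 6, 72  # 72 = 3 * 24
+    q = torch.randn(b, heads, t * h * w, d)
+    positions = pos_getter(b, t, h, w, q.device)
+    q_rot = rope(q, positions)
+    assert q_rot.shape == q.shape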
\ No newline at end of file
diff --git a/opensora/models/diffusion/opensora1/__init__.py b/opensora/models/diffusion/opensora1/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/models/diffusion/opensora1/modeling_opensora.py b/opensora/models/diffusion/opensora1/modeling_opensora.py
new file mode 100644
index 000000000..67eb08f9a
--- /dev/null
+++ b/opensora/models/diffusion/opensora1/modeling_opensora.py
@@ -0,0 +1,742 @@
+import os
+import numpy as np
+from torch import nn
+import torch
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from torch.nn import functional as F
+from diffusers.models.transformer_2d import Transformer2DModelOutput
+from diffusers.utils import is_torch_version, deprecate, logging
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+from diffusers.models.embeddings import PixArtAlphaTextProjection
+from opensora.models.diffusion.opensora1.modules import OverlapPatchEmbed3D, OverlapPatchEmbed2D, PatchEmbed2D, BasicTransformerBlock
+from opensora.utils.utils import to_2tuple
+try:
+    import torch_npu
+    from opensora.npu_config import npu_config
+    from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+except ImportError:
+    torch_npu = None
+    npu_config = None
+    from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
+
+logger = logging.get_logger(__name__)
+
+class OpenSoraT2V(ModelMixin, ConfigMixin):
+ """
+    A Transformer model for patched video and image latent data (text-to-video).
+
+ Parameters:
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+ in_channels (`int`, *optional*):
+ The number of channels in the input and output (specify if the input is **continuous**).
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+ This is fixed during training since it is used to learn a number of position embeddings.
+ num_vector_embeds (`int`, *optional*):
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+ Includes the class for the masked latent pixel.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+ num_embeds_ada_norm ( `int`, *optional*):
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+ added to the hidden states.
+
+            During inference, you can denoise for up to but not more than `num_embeds_ada_norm` steps.
+ attention_bias (`bool`, *optional*):
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
+ """
+
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ sample_size_t: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ patch_size: Optional[int] = None,
+ patch_size_t: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_elementwise_affine: bool = True,
+ norm_eps: float = 1e-5,
+ attention_type: str = "default",
+ caption_channels: int = None,
+ interpolation_scale_h: float = None,
+ interpolation_scale_w: float = None,
+ interpolation_scale_t: float = None,
+ use_additional_conditions: Optional[bool] = None,
+ attention_mode: str = 'xformers',
+ downsampler: str = None,
+ use_rope: bool = False,
+ use_stable_fp32: bool = False,
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ use_motion: bool = False,
+ ):
+ super().__init__()
+
+ # Validate inputs.
+ if patch_size is not None:
+ if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single"]:
+ raise NotImplementedError(
+ f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+ )
+ elif norm_type in ["ada_norm", "ada_norm_zero"] and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+ )
+
+ # Set some common variables used across the board.
+ self.use_motion = use_motion
+ self.sparse1d = sparse1d
+ self.sparse2d = sparse2d
+ self.sparse_n = sparse_n
+ self.use_rope = use_rope
+ self.use_linear_projection = use_linear_projection
+ self.interpolation_scale_t = interpolation_scale_t
+ self.interpolation_scale_h = interpolation_scale_h
+ self.interpolation_scale_w = interpolation_scale_w
+ self.downsampler = downsampler
+ self.caption_channels = caption_channels
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.gradient_checkpointing = False
+ self.config.hidden_size = self.inner_dim
+ use_additional_conditions = False
+ # if use_additional_conditions is None:
+ # if norm_type == "ada_norm_single" and sample_size == 128:
+ # use_additional_conditions = True
+ # else:
+ # use_additional_conditions = False
+ self.use_additional_conditions = use_additional_conditions
+
+        # 1. OpenSoraT2V only supports patched, continuous inputs of shape
+        #    `(batch_size, num_channels, frame, height, width)`.
+        assert in_channels is not None and patch_size is not None
+
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+ deprecation_message = (
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+            " Please make sure to update the config accordingly, as leaving `norm_type` unset might lead to incorrect"
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+ )
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+ norm_type = "ada_norm"
+
+ # 2. Initialize the right blocks.
+ # Initialize the output blocks and other projection blocks when necessary.
+ self._init_patched_inputs(norm_type=norm_type)
+
+ def _init_patched_inputs(self, norm_type):
+ assert self.config.sample_size_t is not None, "OpenSoraT2V over patched input must provide sample_size_t"
+ assert self.config.sample_size is not None, "OpenSoraT2V over patched input must provide sample_size"
+        # assert not (self.config.sample_size_t == 1 and self.config.patch_size_t == 2), "Images do not need t-dim patchifying"
+
+ self.num_frames = self.config.sample_size_t
+ self.config.sample_size = to_2tuple(self.config.sample_size)
+ self.height = self.config.sample_size[0]
+ self.width = self.config.sample_size[1]
+ self.patch_size_t = self.config.patch_size_t
+ self.patch_size = self.config.patch_size
+ interpolation_scale_t = ((self.config.sample_size_t - 1) // 16 + 1) if self.config.sample_size_t % 2 == 1 else self.config.sample_size_t / 16
+ interpolation_scale_t = (
+ self.config.interpolation_scale_t if self.config.interpolation_scale_t is not None else interpolation_scale_t
+ )
+ interpolation_scale = (
+ self.config.interpolation_scale_h if self.config.interpolation_scale_h is not None else self.config.sample_size[0] / 30,
+ self.config.interpolation_scale_w if self.config.interpolation_scale_w is not None else self.config.sample_size[1] / 40,
+ )
+ # if self.config.sample_size_t > 1:
+ # self.pos_embed = PatchEmbed3D(
+ # num_frames=self.config.sample_size_t,
+ # height=self.config.sample_size[0],
+ # width=self.config.sample_size[1],
+ # patch_size_t=self.config.patch_size_t,
+ # patch_size=self.config.patch_size,
+ # in_channels=self.in_channels,
+ # embed_dim=self.inner_dim,
+ # interpolation_scale=interpolation_scale,
+ # interpolation_scale_t=interpolation_scale_t,
+ # )
+ # else:
+ if self.config.downsampler is not None and len(self.config.downsampler) == 9:
+ self.pos_embed = OverlapPatchEmbed3D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+ elif self.config.downsampler is not None and len(self.config.downsampler) == 7:
+ self.pos_embed = OverlapPatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+
+ else:
+ self.pos_embed = PatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+ interpolation_scale_thw = (interpolation_scale_t, *interpolation_scale)
+ self.transformer_blocks = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.config.cross_attention_dim,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ downsampler=self.config.downsampler,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=self.sparse1d if i > 1 and i < 30 else False,
+ sparse2d=self.sparse2d if i > 1 and i < 30 else False,
+ sparse_n=self.sparse_n,
+ sparse_group=i % 2 == 1,
+ )
+ for i in range(self.config.num_layers)
+ ]
+ )
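+        # Editorial note: when sparse attention is enabled it is only active in
+        # the middle blocks (indices 2..29), and `sparse_group` alternates with
+        # layer parity so neighbouring layers cover complementary token groups.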
+
+ if self.config.norm_type != "ada_norm_single":
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+ self.proj_out_2 = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+ elif self.config.norm_type == "ada_norm_single":
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+ self.proj_out = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+
+ # PixArt-Alpha blocks.
+ self.adaln_single = None
+ if self.config.norm_type == "ada_norm_single":
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+ # additional conditions until we find better name
+ self.adaln_single = AdaLayerNormSingle(
+ self.inner_dim, use_additional_conditions=self.use_additional_conditions
+ )
+
+ self.caption_projection = None
+ if self.caption_channels is not None:
+ self.caption_projection = PixArtAlphaTextProjection(
+ in_features=self.caption_channels, hidden_size=self.inner_dim
+ )
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ timestep: Optional[torch.LongTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ use_image_num: Optional[int] = 0,
+ return_dict: bool = True,
+ ):
+ """
+        The [`OpenSoraT2V`] forward method.
+
+ Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, frame, height, width)`):
+ Input `hidden_states`.
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+ self-attention.
+ timestep ( `torch.LongTensor`, *optional*):
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerNormZero`.
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ attention_mask ( `torch.Tensor`, *optional*):
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+ negative values to the attention scores corresponding to "discard" tokens.
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+ above. This bias will be added to the cross-attention scores.
+ return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+ Returns:
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+ `tuple` where the first element is the sample tensor.
+ """
+ batch_size, c, frame, h, w = hidden_states.shape
+        frame = frame - use_image_num  # e.g. 21 - 4 = 17 video frames when 4 images are appended
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+ attention_mask_vid, attention_mask_img = None, None
+ if attention_mask is not None and attention_mask.ndim == 4:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
+ # b, frame+use_image_num, h, w -> a video with images
+ # b, 1, h, w -> only images
+ attention_mask = attention_mask.to(self.dtype)
+ if get_sequence_parallel_state():
+ if npu_config is not None:
+ attention_mask_vid = attention_mask[:, :frame * hccl_info.world_size] # b, frame, h, w
+ attention_mask_img = attention_mask[:, frame * hccl_info.world_size:] # b, use_image_num, h, w
+ else:
+ attention_mask_vid = attention_mask[:, :frame * nccl_info.world_size] # b, frame, h, w
+ attention_mask_img = attention_mask[:, frame * nccl_info.world_size:] # b, use_image_num, h, w
+ else:
+ attention_mask_vid = attention_mask[:, :frame] # b, frame, h, w
+ attention_mask_img = attention_mask[:, frame:] # b, use_image_num, h, w
+
+ if attention_mask_vid.numel() > 0:
+ attention_mask_vid_first_frame = attention_mask_vid[:, :1].repeat(1, self.patch_size_t-1, 1, 1)
+ attention_mask_vid = torch.cat([attention_mask_vid_first_frame, attention_mask_vid], dim=1)
+ attention_mask_vid = attention_mask_vid.unsqueeze(1) # b 1 t h w
+ attention_mask_vid = F.max_pool3d(attention_mask_vid, kernel_size=(self.patch_size_t, self.patch_size, self.patch_size),
+ stride=(self.patch_size_t, self.patch_size, self.patch_size))
+ attention_mask_vid = rearrange(attention_mask_vid, 'b 1 t h w -> (b 1) 1 (t h w)')
+ if attention_mask_img.numel() > 0:
+ attention_mask_img = F.max_pool2d(attention_mask_img, kernel_size=(self.patch_size, self.patch_size), stride=(self.patch_size, self.patch_size))
+ attention_mask_img = rearrange(attention_mask_img, 'b i h w -> (b i) 1 (h w)')
+
+ attention_mask_vid = (1 - attention_mask_vid.bool().to(self.dtype)) * -10000.0 if attention_mask_vid.numel() > 0 else None
+ attention_mask_img = (1 - attention_mask_img.bool().to(self.dtype)) * -10000.0 if attention_mask_img.numel() > 0 else None
+
+ if frame == 1 and use_image_num == 0 and not get_sequence_parallel_state():
+ attention_mask_img = attention_mask_vid
+ attention_mask_vid = None
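+            # Editorial sketch of the bias convention above: a keep/discard
+            # mask [1, 1, 0] becomes the additive bias [0, 0, -10000.0], which
+            # drives the discarded key's attention logit toward -inf before the
+            # softmax.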
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 3:
+ # b, 1+use_image_num, l -> a video with images
+ # b, 1, l -> only images
+ encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
+ in_t = encoder_attention_mask.shape[1]
+ encoder_attention_mask_vid = encoder_attention_mask[:, :in_t-use_image_num] # b, 1, l
+ encoder_attention_mask_vid = rearrange(encoder_attention_mask_vid, 'b 1 l -> (b 1) 1 l') if encoder_attention_mask_vid.numel() > 0 else None
+
+ encoder_attention_mask_img = encoder_attention_mask[:, in_t-use_image_num:] # b, use_image_num, l
+ encoder_attention_mask_img = rearrange(encoder_attention_mask_img, 'b i l -> (b i) 1 l') if encoder_attention_mask_img.numel() > 0 else None
+
+ if frame == 1 and use_image_num == 0 and not get_sequence_parallel_state():
+ encoder_attention_mask_img = encoder_attention_mask_vid
+ encoder_attention_mask_vid = None
+
+ if npu_config is not None and attention_mask_vid is not None:
+ attention_mask_vid = npu_config.get_attention_mask(attention_mask_vid, attention_mask_vid.shape[-1])
+ encoder_attention_mask_vid = npu_config.get_attention_mask(encoder_attention_mask_vid,
+ attention_mask_vid.shape[-2])
+ if npu_config is not None and attention_mask_img is not None:
+ attention_mask_img = npu_config.get_attention_mask(attention_mask_img, attention_mask_img.shape[-1])
+ encoder_attention_mask_img = npu_config.get_attention_mask(encoder_attention_mask_img,
+ attention_mask_img.shape[-2])
+
+
+ # 1. Input
+        frame = ((frame - 1) // self.patch_size_t + 1) if frame % 2 == 1 else frame // self.patch_size_t  # patchify along t
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+ hidden_states_vid, hidden_states_img, encoder_hidden_states_vid, encoder_hidden_states_img, \
+ timestep_vid, timestep_img, embedded_timestep_vid, embedded_timestep_img = self._operate_on_patched_inputs(
+ hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, batch_size, frame, use_image_num
+ )
+ # 2. Blocks
+ if get_sequence_parallel_state():
+ if hidden_states_vid is not None:
+ hidden_states_vid = rearrange(hidden_states_vid, 'b s h -> s b h', b=batch_size).contiguous()
+ encoder_hidden_states_vid = rearrange(encoder_hidden_states_vid, 'b s h -> s b h',
+ b=batch_size).contiguous()
+ timestep_vid = timestep_vid.view(batch_size, 6, -1).transpose(0, 1).contiguous()
+
+        for block in self.transformer_blocks:
+ if self.training and self.gradient_checkpointing:
+
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ if hidden_states_vid is not None:
+ hidden_states_vid = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states_vid,
+ attention_mask_vid,
+ encoder_hidden_states_vid,
+ encoder_attention_mask_vid,
+ timestep_vid,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ if hidden_states_img is not None:
+ hidden_states_img = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states_img,
+ attention_mask_img,
+ encoder_hidden_states_img,
+ encoder_attention_mask_img,
+ timestep_img,
+ cross_attention_kwargs,
+ class_labels,
+ 1,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ if hidden_states_vid is not None:
+ hidden_states_vid = block(
+ hidden_states_vid,
+ attention_mask=attention_mask_vid,
+ encoder_hidden_states=encoder_hidden_states_vid,
+ encoder_attention_mask=encoder_attention_mask_vid,
+ timestep=timestep_vid,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+ if hidden_states_img is not None:
+ hidden_states_img = block(
+ hidden_states_img,
+ attention_mask=attention_mask_img,
+ encoder_hidden_states=encoder_hidden_states_img,
+ encoder_attention_mask=encoder_attention_mask_img,
+ timestep=timestep_img,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=1,
+ height=height,
+ width=width,
+ )
+
+ if get_sequence_parallel_state():
+ if hidden_states_vid is not None:
+ hidden_states_vid = rearrange(hidden_states_vid, 's b h -> b s h', b=batch_size).contiguous()
+
+ # 3. Output
+ output_vid, output_img = None, None
+ if hidden_states_vid is not None:
+ output_vid = self._get_output_for_patched_inputs(
+ hidden_states=hidden_states_vid,
+ timestep=timestep_vid,
+ class_labels=class_labels,
+ embedded_timestep=embedded_timestep_vid,
+ num_frames=frame,
+ height=height,
+ width=width,
+ ) # b c t h w
+ if hidden_states_img is not None:
+ output_img = self._get_output_for_patched_inputs(
+ hidden_states=hidden_states_img,
+ timestep=timestep_img,
+ class_labels=class_labels,
+ embedded_timestep=embedded_timestep_img,
+ num_frames=1,
+ height=height,
+ width=width,
+ ) # b c 1 h w
+ if use_image_num != 0:
+ output_img = rearrange(output_img, '(b i) c 1 h w -> b c i h w', i=use_image_num)
+
+ if output_vid is not None and output_img is not None:
+ output = torch.cat([output_vid, output_img], dim=2)
+ elif output_vid is not None:
+ output = output_vid
+ elif output_img is not None:
+ output = output_img
+
+ if not return_dict:
+ return (output,)
+
+ return Transformer2DModelOutput(sample=output)
+
+
+ def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, batch_size, frame, use_image_num):
+ # batch_size = hidden_states.shape[0]
+ hidden_states_vid, hidden_states_img = self.pos_embed(hidden_states.to(self.dtype), frame)
+ timestep_vid, timestep_img = None, None
+ embedded_timestep_vid, embedded_timestep_img = None, None
+ encoder_hidden_states_vid, encoder_hidden_states_img = None, None
+
+ if self.adaln_single is not None:
+ if self.use_additional_conditions and added_cond_kwargs is None:
+ raise ValueError(
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+ )
+ timestep, embedded_timestep = self.adaln_single(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+ if hidden_states_vid is None:
+ timestep_img = timestep
+ embedded_timestep_img = embedded_timestep
+ else:
+ timestep_vid = timestep
+ embedded_timestep_vid = embedded_timestep
+ if hidden_states_img is not None:
+ timestep_img = repeat(timestep, 'b d -> (b i) d', i=use_image_num).contiguous()
+ embedded_timestep_img = repeat(embedded_timestep, 'b d -> (b i) d', i=use_image_num).contiguous()
+
+ if self.caption_projection is not None:
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ if hidden_states_vid is None:
+ encoder_hidden_states_img = rearrange(encoder_hidden_states, 'b 1 l d -> (b 1) l d')
+ else:
+ encoder_hidden_states_vid = rearrange(encoder_hidden_states[:, :1], 'b 1 l d -> (b 1) l d')
+ if hidden_states_img is not None:
+ encoder_hidden_states_img = rearrange(encoder_hidden_states[:, 1:], 'b i l d -> (b i) l d')
+
+
+ return hidden_states_vid, hidden_states_img, encoder_hidden_states_vid, encoder_hidden_states_img, timestep_vid, timestep_img, embedded_timestep_vid, embedded_timestep_img
+
+
+
+ def _get_output_for_patched_inputs(
+ self, hidden_states, timestep, class_labels, embedded_timestep, num_frames, height=None, width=None
+ ):
+ if self.config.norm_type != "ada_norm_single":
+ conditioning = self.transformer_blocks[0].norm1.emb(
+ timestep, class_labels, hidden_dtype=self.dtype
+ )
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+ hidden_states = self.proj_out_2(hidden_states)
+ elif self.config.norm_type == "ada_norm_single":
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states)
+ # Modulation
+ hidden_states = hidden_states * (1 + scale) + shift
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states.squeeze(1)
+
+ # unpatchify
+ if self.adaln_single is None:
+ height = width = int(hidden_states.shape[1] ** 0.5)
+ hidden_states = hidden_states.reshape(
+ shape=(-1, num_frames, height, width, self.patch_size_t, self.patch_size, self.patch_size, self.out_channels)
+ )
+ hidden_states = torch.einsum("nthwopqc->nctohpwq", hidden_states)
+ output = hidden_states.reshape(
+ shape=(-1, self.out_channels, num_frames * self.patch_size_t, height * self.patch_size, width * self.patch_size)
+ )
+ # if output.shape[2] % 2 == 0:
+ # output = output[:, :, 1:]
+ return output
+
+def OpenSoraT2V_S_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=8, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=768, **kwargs)
+
+def OpenSoraT2V_B_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=16, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=1536, **kwargs)
+
+def OpenSoraT2V_L_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=24, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=2304, **kwargs)
+
+OpenSora1_models = {
+ "OpenSoraT2V1-S/122": OpenSoraT2V_S_122, # 0.3B
+ "OpenSoraT2V1-B/122": OpenSoraT2V_B_122, # 1.2B
+ "OpenSoraT2V1-L/122": OpenSoraT2V_L_122, # 2.7B
+}
+
+OpenSora1_models_class = {
+ "OpenSoraT2V1-S/122": OpenSoraT2V,
+ "OpenSoraT2V1-B/122": OpenSoraT2V,
+ "OpenSoraT2V1-L/122": OpenSoraT2V,
+}
+
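+# Hedged construction sketch (an editorial example; the sizes are assumptions
+# that mirror the smoke test below): the registry above maps a model tag to its
+# constructor, so a 480x640, 29-frame setup with a 4x8x8 VAE stride runs on
+# latents of size (60, 80) spatially and 8 temporally.
+#
+#   model = OpenSora1_models["OpenSoraT2V1-S/122"](
+#       in_channels=4, out_channels=4,
+#       sample_size=(60, 80), sample_size_t=8,
+#       activation_fn="gelu-approximate", attention_bias=True,
+#       norm_elementwise_affine=False, norm_eps=1e-6, use_rope=True,
+#   )
+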
+if __name__ == '__main__':
+ from opensora.models.causalvideovae import ae_stride_config, ae_channel_config
+ from opensora.models.causalvideovae import ae_norm, ae_denorm
+ from opensora.models import CausalVAEModelWrapper
+
+ args = type('args', (),
+ {
+ 'ae': 'CausalVAEModel_D8_4x8x8',
+ 'attention_mode': 'xformers',
+ 'use_rope': True,
+ 'model_max_length': 300,
+ 'max_height': 480,
+ 'max_width': 640,
+ 'num_frames': 29,
+ 'use_image_num': 0,
+ 'compress_kv_factor': 1,
+ 'interpolation_scale_t': 1,
+ 'interpolation_scale_h': 1,
+ 'interpolation_scale_w': 1,
+ "sparse1d": True,
+ "sparse2d": False,
+ "sparse_n": 4,
+ "rank": 64,
+ }
+ )
+ b = 16
+ c = 4
+ cond_c = 4096
+ num_timesteps = 1000
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ num_frames = (args.num_frames - 1) // ae_stride_t + 1
+
+ # device = torch.device('cuda:0')
+ # model = OpenSoraT2V_L_122(in_channels=c,
+ # out_channels=c,
+ # sample_size=latent_size,
+ # sample_size_t=num_frames,
+ # activation_fn="gelu-approximate",
+ # attention_bias=True,
+ # attention_type="default",
+ # double_self_attention=False,
+ # norm_elementwise_affine=False,
+ # norm_eps=1e-06,
+ # norm_num_groups=32,
+ # num_vector_embeds=None,
+ # only_cross_attention=False,
+ # upcast_attention=False,
+ # use_linear_projection=False,
+ # use_additional_conditions=False,
+ # downsampler=None,
+ # interpolation_scale_t=args.interpolation_scale_t,
+ # interpolation_scale_h=args.interpolation_scale_h,
+ # interpolation_scale_w=args.interpolation_scale_w,
+ # use_rope=args.use_rope,
+ # sparse1d=args.sparse1d,
+ # sparse2d=args.sparse2d,
+ # sparse_n=args.sparse_n
+ # ).to(device)
+
+ # try:
+ # path = "/storage/dataset/Open-Sora-Plan-v1.2.0/29x720p/diffusion_pytorch_model.safetensors"
+ # ckpt = torch.load(path, map_location="cpu")
+ # msg = model.load_state_dict(ckpt, strict=True)
+ # print(msg)
+ # except Exception as e:
+ # print(e)
+ # print(model)
+ # print(f'{sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B')
+ # # import sys;sys.exit()
+ # x = torch.randn(b, c, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w).to(device)
+ # cond = torch.randn(b, 1+args.use_image_num, args.model_max_length, cond_c).to(device)
+ # attn_mask = torch.randint(0, 2, (b, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w)).to(device) # B L or B 1+num_images L
+ # cond_mask = torch.randint(0, 2, (b, 1+args.use_image_num, args.model_max_length)).to(device) # B L or B 1+num_images L
+ # timestep = torch.randint(0, 1000, (b,), device=device)
+ # model_kwargs = dict(hidden_states=x, encoder_hidden_states=cond, attention_mask=attn_mask,
+ # encoder_attention_mask=cond_mask, use_image_num=args.use_image_num, timestep=timestep)
+ # with torch.no_grad():
+ # output = model(**model_kwargs)
+ # print(output[0].shape)
+
+ from diffusers.training_utils import EMAModel
+ load_model = EMAModel.from_pretrained("/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x8_vae8_anyx320x320_lr5e-5_snr5_noioff0.02_ema9999_sparse1d4_newdit_l_122_rope_mt5xxl_mj/checkpoint-219000/model_ema", OpenSoraT2V)
diff --git a/opensora/models/diffusion/opensora1/modules.py b/opensora/models/diffusion/opensora1/modules.py
new file mode 100644
index 000000000..83a319eff
--- /dev/null
+++ b/opensora/models/diffusion/opensora1/modules.py
@@ -0,0 +1,1551 @@
+from einops import rearrange
+from torch import nn
+import torch
+import numpy as np
+
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from typing import Any, Dict, Optional
+import re
+import torch
+import torch.nn.functional as F
+from torch import nn
+import diffusers
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.attention import FeedForward, GatedSelfAttentionDense
+from diffusers.models.attention_processor import Attention as Attention_
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding
+from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+from .rope import PositionGetter3D, RoPE3D
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config, set_run_dtype
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+ from opensora.acceleration.communications import all_to_all_SBH
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ set_run_dtype = None
+ from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
+ from opensora.utils.communications import all_to_all_SBH
+logger = logging.get_logger(__name__)
+
+def get_3d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+    grid_size: (t, h, w) sizes of the grid.
+    return: pos_embed: [grid_t*grid_h*grid_w, embed_dim] or
+    [extra_tokens+grid_t*grid_h*grid_w, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+ grid_t = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_h = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid_w = np.arange(grid_size[2], dtype=np.float32) / (grid_size[2] / base_size[2]) / interpolation_scale[2]
+ grid = np.meshgrid(grid_w, grid_h, grid_t) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([3, 1, grid_size[2], grid_size[1], grid_size[0]])
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 3 != 0:
+ raise ValueError("embed_dim must be divisible by 3")
+
+ # use 1/3 of dimensions to encode grid_t/h/w
+ emb_t = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0]) # (T*H*W, D/3)
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1]) # (T*H*W, D/3)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2]) # (T*H*W, D/3)
+
+ emb = np.concatenate([emb_t, emb_h, emb_w], axis=1) # (T*H*W, D)
+ return emb
+
+
+def get_2d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+    grid_size: (height, width) sizes of the grid.
+    return: pos_embed: [grid_h*grid_w, embed_dim] or
+    [extra_tokens+grid_h*grid_w, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+    # use half of the dimensions to encode grid_h/w
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+def get_1d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+    grid_size: int length of the grid.
+    return: pos_embed: [grid_size, embed_dim] or
+    [extra_tokens+grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid = np.arange(grid_size, dtype=np.float32) / (grid_size / base_size) / interpolation_scale
+ pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, grid) # (H*W, D/2)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+    embed_dim: output dimension for each position.
+    pos: a list of positions to be encoded, of size (M,).
+    out: (M, D)
+ """
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
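+# Hedged worked example (editorial, not used by the model): with embed_dim=4,
+# omega = [1, 0.01], so a position p encodes as
+# [sin(p), sin(0.01*p), cos(p), cos(0.01*p)] -- sines first, then cosines, e.g.
+#
+#   get_1d_sincos_pos_embed_from_grid(4, np.array([1.0]))
+#   # -> [[0.8415, 0.0100, 0.5403, 1.0000]] (rounded)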
+
+class PatchEmbed2D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ # assert num_frames == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv2d(
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # b c 1 h w
+ # assert latent.shape[-3] == 1 and num_frames == 1
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+
+ if self.num_frames != num_frames:
+ # raise NotImplementedError
+ if get_sequence_parallel_state():
+ if npu_config is not None:
+ sp_size = hccl_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = hccl_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+ else:
+ sp_size = nccl_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = nccl_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+
+ else:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent, image_latent = latent[:, :num_frames], latent[:, num_frames:]
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype) if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = (image_latent + temp_pos_embed[:, :1]).to(image_latent.dtype) if image_latent is not None and image_latent.numel() > 0 else None
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c') if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = rearrange(image_latent, 'b t n c -> (b t) n c') if image_latent is not None and image_latent.numel() > 0 else None
+
+ if num_frames == 1 and image_latent is None and not get_sequence_parallel_state():
+ image_latent = video_latent
+ video_latent = None
+ return video_latent, image_latent
+
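+# Hedged shape walk-through for PatchEmbed2D (an editorial sketch using assumed
+# sizes; patch_size=2, patch_size_t=1): each frame of a (b, c, t, h, w) latent
+# is patchified independently, giving (h/2)*(w/2) tokens per frame, and frames
+# beyond `num_frames` (appended stills) go to the image branch.
+#
+#   embed = PatchEmbed2D(num_frames=8, height=64, width=64, patch_size=2,
+#                        in_channels=4, embed_dim=768)
+#   vid, img = embed(torch.randn(2, 4, 8, 64, 64), num_frames=8)
+#   # vid: (2, 8 * 32 * 32, 768); img: None (no extra image frames appended)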
+
+
+class OverlapPatchEmbed3D(nn.Module):
+    """3D patch embedding (Conv3d) with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ # assert patch_size_t == 1 and patch_size == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv3d(
+ in_channels, embed_dim, kernel_size=(patch_size_t, patch_size, patch_size), stride=(patch_size_t, patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # b c 1 h w
+ # assert latent.shape[-3] == 1 and num_frames == 1
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ # latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ # latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+            latent = rearrange(latent, 'b c t h w -> (b t) (h w) c')
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+
+ if self.num_frames != num_frames:
+ # raise NotImplementedError
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent, image_latent = latent[:, :num_frames], latent[:, num_frames:]
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype) if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = (image_latent + temp_pos_embed[:, :1]).to(image_latent.dtype) if image_latent is not None and image_latent.numel() > 0 else None
+
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c') if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = rearrange(image_latent, 'b t n c -> (b t) n c') if image_latent is not None and image_latent.numel() > 0 else None
+
+ if num_frames == 1 and image_latent is None:
+ image_latent = video_latent
+ video_latent = None
+ return video_latent, image_latent
+
+
+
+class OverlapPatchEmbed2D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ assert patch_size_t == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv2d(
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # b c 1 h w
+ # assert latent.shape[-3] == 1 and num_frames == 1
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+
+ if self.num_frames != num_frames:
+ # raise NotImplementedError
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent, image_latent = latent[:, :num_frames], latent[:, num_frames:]
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype) if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = (image_latent + temp_pos_embed[:, :1]).to(image_latent.dtype) if image_latent is not None and image_latent.numel() > 0 else None
+
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c') if video_latent is not None and video_latent.numel() > 0 else None
+ image_latent = rearrange(image_latent, 'b t n c -> (b t) n c') if image_latent is not None and image_latent.numel() > 0 else None
+
+ if num_frames == 1 and image_latent is None:
+ image_latent = video_latent
+ video_latent = None
+ return video_latent, image_latent
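+
+ # Shape sketch (hypothetical sizes): for latent (b, c, t, h, w), forward()
+ # patchifies each frame, optionally adds 2D spatial and 1D temporal sin-cos
+ # embeddings (use_abs_pos), then splits the first num_frames entries into
+ # video_latent of shape (b, t*h'*w', embed_dim) and any trailing frames into
+ # image_latent of shape (b*extra, h'*w', embed_dim); with num_frames == 1
+ # the single frame is returned as image_latent instead.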
+
+class Attention(Attention_):
+ def __init__(self, downsampler, attention_mode, use_rope, interpolation_scale_thw,
+ sparse1d, sparse2d, sparse_n, sparse_group, is_cross_attn, **kwags):
+ processor = AttnProcessor2_0(attention_mode=attention_mode, use_rope=use_rope, interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d, sparse2d=sparse2d, sparse_n=sparse_n, sparse_group=sparse_group, is_cross_attn=is_cross_attn)
+ super().__init__(processor=processor, **kwags)
+ self.downsampler = None
+
+ def prepare_attention_mask(
+ self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
+ ) -> torch.Tensor:
+ r"""
+ Prepare the attention mask for the attention computation.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ The attention mask to prepare.
+ target_length (`int`):
+ The target length of the attention mask. This is the length of the attention mask after padding.
+ batch_size (`int`):
+ The batch size, which is used to repeat the attention mask.
+ out_dim (`int`, *optional*, defaults to `3`):
+ The output dimension of the attention mask. Can be either `3` or `4`.
+
+ Returns:
+ `torch.Tensor`: The prepared attention mask.
+ """
+ head_size = self.heads
+ if get_sequence_parallel_state():
+ head_size = head_size // nccl_info.world_size
+ if attention_mask is None:
+ return attention_mask
+
+ current_length: int = attention_mask.shape[-1]
+ if current_length != target_length:
+ if attention_mask.device.type == "mps":
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+ # Instead, we can manually construct the padding tensor.
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
+ else:
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
+ # remaining_length: int = target_length - current_length
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+ if out_dim == 3:
+ if attention_mask.shape[0] < batch_size * head_size:
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+ elif out_dim == 4:
+ attention_mask = attention_mask.unsqueeze(1)
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+ return attention_mask
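+
+ # Shape sketch (hypothetical values; attn.heads == 8, no sequence parallelism):
+ # mask = torch.zeros(2, 1, 77) # (batch, 1, src_len), already at target_length
+ # attn.prepare_attention_mask(mask, 77, 2).shape == (16, 1, 77) # out_dim=3
+ # attn.prepare_attention_mask(mask, 77, 2, out_dim=4).shape == (2, 8, 1, 77)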
+
+class DownSampler3d(nn.Module):
+ def __init__(self, *args, **kwargs):
+ ''' Required kwargs: down_factor, down_shortcut '''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv3d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ if npu_config is None:
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.layer, x, x_dtype) + (x if self.down_shortcut else 0)
+
+ self.t = t//self.down_factor[0]
+ self.h = h//self.down_factor[1]
+ self.w = w//self.down_factor[2]
+ x = rearrange(x, 'b d (t dt) (h dh) (w dw) -> (b dt dh dw) (t h w) d',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> b 1 t h w', t=t, h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (t dt) (h dh) (w dw) -> (b dt dh dw) 1 (t h w)',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b dt dh dw) (t h w) d -> b (t dt h dh w dw) d',
+ t=t, h=h, w=w,
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x
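+
+ # Shape sketch (hypothetical): with down_factor=(1, 2, 2), forward() maps
+ # (b, t*h*w, d) to (b*4, t*(h//2)*(w//2), d), i.e. four spatially strided
+ # sub-sequences that attend independently; reverse() re-interleaves them.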
+
+
+class DownSampler2d(nn.Module):
+ def __init__(self, *args, **kwargs):
+ ''' Required kwargs: down_factor, down_shortcut '''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv2d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+
+ self.t = 1
+ self.h = h//self.down_factor[0]
+ self.w = w//self.down_factor[1]
+
+ x = rearrange(x, 'b d (h dh) (w dw) -> (b dh dw) (h w) d',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> (b t) 1 h w', h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (h dh) (w dw) -> (b dh dw) 1 (h w)',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b t dh dw) (h w) d -> b (t h dh w dw) d',
+ t=t, h=h, w=w,
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x
+
+class AttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+ """
+
+ def __init__(self, attention_mode='xformers', use_rope=False, interpolation_scale_thw=(1, 1, 1),
+ sparse1d=False, sparse2d=False, sparse_n=2, sparse_group=False, is_cross_attn=True):
+ self.sparse1d = sparse1d
+ self.sparse2d = sparse2d
+ self.sparse_n = sparse_n
+ self.sparse_group = sparse_group
+ self.is_cross_attn = is_cross_attn
+ self.use_rope = use_rope
+ self.interpolation_scale_thw = interpolation_scale_thw
+ if self.use_rope:
+ self._init_rope(interpolation_scale_thw)
+ self.attention_mode = attention_mode
+ if not hasattr(F, "scaled_dot_product_attention"):
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or newer.")
+ assert not (self.sparse1d and self.sparse2d)
+
+ def _init_rope(self, interpolation_scale_thw):
+ self.rope = RoPE3D(interpolation_scale_thw=interpolation_scale_thw)
+ self.position_getter = PositionGetter3D()
+
+ def _sparse_1d(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ attention_mask: b nheads 1 thw
+ """
+ l = x.shape[-2]
+ assert l == frame*height*width
+ assert attention_mask is None or attention_mask.shape[2] == 1
+ pad_len = 0
+ if l % (self.sparse_n * self.sparse_n) != 0:
+ pad_len = self.sparse_n * self.sparse_n - l % (self.sparse_n * self.sparse_n)
+ if pad_len != 0:
+ x = F.pad(x, (0, 0, 0, pad_len))
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = F.pad(attention_mask, (0, pad_len, 0, 0), value=-9980.0)
+ if not self.sparse_group:
+ x = rearrange(x, 'b h (g k) d -> (k b) h g d', k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (g k) -> (k b) h 1 g', k=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b h (n m k) d -> (m b) h (n k) d', m=self.sparse_n, k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (n m k) -> (m b) h 1 (n k)', m=self.sparse_n, k=self.sparse_n)
+ if self.is_cross_attn:
+ attention_mask = attention_mask.repeat(self.sparse_n, 1, 1, 1)
+ return x, attention_mask, pad_len
+
+ def _reverse_sparse_1d(self, x, frame, height, width, pad_len):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ assert x.shape[2] == (frame*height*width+pad_len) // self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(k b) h g d -> b h (g k) d', k=self.sparse_n)
+ else:
+ x = rearrange(x, '(m b) h (n k) d -> b h (n m k) d', m=self.sparse_n, k=self.sparse_n)
+ x = x[:, :, :frame*height*width, :]
+ # x = x.contiguous()
+ return x
+
+ def _sparse_1d_kv(self, x):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ x = repeat(x, 'b h s d -> (k b) h s d', k=self.sparse_n)
+ return x
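+
+ # A minimal sketch of the 1d sparsification on dummy tensors (hypothetical
+ # values; sparse_n=2, sparse_group=False, self-attention so masks pass through):
+ # proc = AttnProcessor2_0(sparse1d=True, sparse_n=2, is_cross_attn=False)
+ # x = torch.randn(1, 8, 12, 64) # (b, heads, t*h*w, dim); 12 % 2**2 == 0
+ # y, _, pad = proc._sparse_1d(x, None, frame=3, height=2, width=2)
+ # y.shape == (2, 8, 6, 64) and pad == 0 # sparse_n interleaved groups
+ # proc._reverse_sparse_1d(y, 3, 2, 2, pad).shape == (1, 8, 12, 64)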
+
+ def _sparse_2d(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ attention_mask: b nheads 1 thw
+ """
+ d = x.shape[-1]
+ x = rearrange(x, 'b h (T H W) d -> b h T H W d', T=frame, H=height, W=width)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (T H W) -> b h T H W', T=frame, H=height, W=width)
+ pad_height = -height % (self.sparse_n * self.sparse_n) # 0 when already divisible
+ pad_width = -width % (self.sparse_n * self.sparse_n)
+ if pad_height != 0 or pad_width != 0:
+ x = rearrange(x, 'b h T H W d -> b (h d) T H W')
+ x = F.pad(x, (0, pad_width, 0, pad_height, 0, 0))
+ x = rearrange(x, 'b (h d) T H W -> b h T H W d', d=d)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = F.pad(attention_mask, (0, pad_width, 0, pad_height, 0, 0), value=-9500.0)
+
+ if not self.sparse_group:
+ x = rearrange(x, 'b h t (g1 k1) (g2 k2) d -> (k1 k2 b) h (t g1 g2) d',
+ k1=self.sparse_n, k2=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h t (g1 k1) (g2 k2) -> (k1 k2 b) h 1 (t g1 g2)',
+ k1=self.sparse_n, k2=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b h t (n1 m1 k1) (n2 m2 k2) d -> (m1 m2 b) h (t n1 n2 k1 k2) d',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h t (n1 m1 k1) (n2 m2 k2) -> (m1 m2 b) h 1 (t n1 n2 k1 k2)',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n)
+
+ if self.is_cross_attn:
+ attention_mask = attention_mask.repeat(self.sparse_n*self.sparse_n, 1, 1, 1)
+ return x, attention_mask, pad_height, pad_width
+
+ def _reverse_sparse_2d(self, x, frame, height, width, pad_height, pad_width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ assert x.shape[2] == frame*(height+pad_height)*(width+pad_width)//self.sparse_n//self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(k1 k2 b) h (t g1 g2) d -> b h t (g1 k1) (g2 k2) d',
+ k1=self.sparse_n, k2=self.sparse_n,
+ g1=(height+pad_height)//self.sparse_n, g2=(width+pad_width)//self.sparse_n)
+ else:
+ x = rearrange(x, '(m1 m2 b) h (t n1 n2 k1 k2) d -> b h t (n1 m1 k1) (n2 m2 k2) d',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n,
+ n1=(height+pad_height)//self.sparse_n//self.sparse_n, n2=(width+pad_width)//self.sparse_n//self.sparse_n)
+ x = x[:, :, :, :height, :width, :]
+ x = rearrange(x, 'b h T H W d -> b h (T H W) d')
+ # x = x.contiguous()
+ return x
+
+
+ def _sparse_2d_kv(self, x):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ x = repeat(x, 'b h s d -> (k1 k2 b) h s d', k1=self.sparse_n, k2=self.sparse_n)
+ return x
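+
+ # Analogous sketch for the 2d variant (hypothetical values; sparse_n=2,
+ # sparse_group=False, no mask): tokens regroup along strided spatial blocks:
+ # q = torch.randn(1, 8, 16, 64) # (b, heads, T*H*W, d) with T=1, H=W=4
+ # q, _, ph, pw = proc._sparse_2d(q, None, frame=1, height=4, width=4)
+ # q.shape == (4, 8, 4, 64) # sparse_n**2 strided spatial groups
+ # proc._reverse_sparse_2d(q, 1, 4, 4, ph, pw).shape == (1, 8, 16, 64)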
+
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ frame: int = 8,
+ height: int = 16,
+ width: int = 16,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
+ if attn.downsampler is not None:
+ hidden_states, attention_mask = attn.downsampler(hidden_states, attention_mask, t=frame, h=height, w=width)
+ frame, height, width = attn.downsampler.t, attn.downsampler.h, attn.downsampler.w
+
+ residual = hidden_states
+
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+ if get_sequence_parallel_state():
+ # sequence-parallel layout is (seq, batch, dim)
+ sequence_length, batch_size, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ else:
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ if attention_mask is not None:
+ if npu_config is None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length * nccl_info.world_size, batch_size)
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ if get_sequence_parallel_state():
+ attention_mask = attention_mask.view(batch_size, attn.heads // nccl_info.world_size, -1, attention_mask.shape[-1])
+ else:
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+ else:
+ attention_mask = attention_mask.view(batch_size, 1, -1, attention_mask.shape[-1])
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ if npu_config is not None and npu_config.on_npu:
+ if get_sequence_parallel_state():
+ query = query.view(-1, attn.heads, head_dim) # [s // sp, b, h * d] -> [s // sp * b, h, d]
+ key = key.view(-1, attn.heads, head_dim)
+ value = value.view(-1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ h_size = attn.heads * head_dim
+ sp_size = hccl_info.world_size
+ h_size_sp = h_size // sp_size
+ # apply all_to_all to gather sequence and split attention heads [s // sp * b, h, d] -> [s * b, h // sp, d]
+ query = all_to_all_SBH(query, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ key = all_to_all_SBH(key, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ value = all_to_all_SBH(value, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ if self.use_rope:
+ query = query.view(-1, batch_size, attn.heads // sp_size, head_dim)
+ key = key.view(-1, batch_size, attn.heads // sp_size, head_dim)
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame * sp_size, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ query = query.view(-1, batch_size, h_size_sp)
+ key = key.view(-1, batch_size, h_size_sp)
+ value = value.view(-1, batch_size, h_size_sp)
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "SBH",
+ head_dim, attn.heads // sp_size)
+
+ hidden_states = hidden_states.view(-1, attn.heads // sp_size, head_dim)
+
+ # [s * b, h // sp, d] -> [s // sp * b, h, d] -> [s // sp, b, h * d]
+ hidden_states = all_to_all_SBH(hidden_states, scatter_dim=0, gather_dim=1).view(-1, batch_size, h_size)
+ else:
+ if npu_config.enable_FA and query.dtype == torch.float32:
+ dtype = torch.bfloat16
+ else:
+ dtype = None
+
+ query = query.view(batch_size, -1, attn.heads, head_dim)
+ key = key.view(batch_size, -1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ query = query.view(batch_size, -1, attn.heads * head_dim)
+ key = key.view(batch_size, -1, attn.heads * head_dim)
+
+ with set_run_dtype(query, dtype):
+ query, key, value = npu_config.set_current_run_dtype([query, key, value])
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "BSH",
+ head_dim, attn.heads)
+
+ hidden_states = npu_config.restore_dtype(hidden_states)
+ else:
+ if get_sequence_parallel_state():
+ query = query.reshape(-1, attn.heads, head_dim) # [s // sp, b, h * d] -> [s // sp * b, h, d]
+ key = key.reshape(-1, attn.heads, head_dim)
+ value = value.reshape(-1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ h_size = attn.heads * head_dim
+ sp_size = nccl_info.world_size
+ h_size_sp = h_size // sp_size
+ # apply all_to_all to gather sequence and split attention heads [s // sp * b, h, d] -> [s * b, h // sp, d]
+ query = all_to_all_SBH(query, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ key = all_to_all_SBH(key, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ value = all_to_all_SBH(value, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ query = query.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ key = key.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ value = value.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame * sp_size, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ query = rearrange(query, 's b h d -> b h s d')
+ key = rearrange(key, 's b h d -> b h s d')
+ value = rearrange(value, 's b h d -> b h s d')
+
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d(query, attention_mask, frame * sp_size, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv(key)
+ value = self._sparse_1d_kv(value)
+ else:
+ key, _, pad_len = self._sparse_1d(key, None, frame * sp_size, height, width)
+ value, _, pad_len = self._sparse_1d(value, None, frame * sp_size, height, width)
+
+ elif self.sparse2d:
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame * sp_size, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame * sp_size, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame * sp_size, height, width)
+
+
+ # 0, -10000 ->(bool) False, True ->(any) True ->(not) False
+ # 0, 0 ->(bool) False, False ->(any) False ->(not) True
+ if attention_mask is None or not torch.any(attention_mask.bool()): # 0 means visible
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ if self.attention_mode == 'flash':
+ assert attention_mask is None, 'flash-attn does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Unsupported attention_mode: {self.attention_mode}')
+
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d(hidden_states, frame * sp_size, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame * sp_size, height, width, pad_height, pad_width)
+
+ hidden_states = rearrange(hidden_states, 'b h s d -> s b h d')
+
+ hidden_states = hidden_states.reshape(-1, attn.heads // sp_size, head_dim)
+ hidden_states = hidden_states.contiguous()
+ # [s * b, h // sp, d] -> [s // sp * b, h, d] -> [s // sp, b, h * d]
+ hidden_states = all_to_all_SBH(hidden_states, scatter_dim=0, gather_dim=1).reshape(-1, batch_size, h_size)
+ else:
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ # qk norm
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d(query, attention_mask, frame, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv(key)
+ value = self._sparse_1d_kv(value)
+ else:
+ key, _, pad_len = self._sparse_1d(key, None, frame, height, width)
+ value, _, pad_len = self._sparse_1d(value, None, frame, height, width)
+
+ elif self.sparse2d:
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame, height, width)
+ # query, key, value = query.contiguous(), key.contiguous(), value.contiguous()
+ # 0, -10000 ->(bool) False, True ->(any) True ->(not) False
+ # 0, 0 ->(bool) False, False ->(any) False ->(not) True
+ if attention_mask is None or not torch.any(attention_mask.bool()): # 0 means visible
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ if self.attention_mode == 'flash':
+ assert attention_mask is None, 'flash-attn does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Unsupported attention_mode: {self.attention_mode}')
+
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d(hidden_states, frame, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame, height, width, pad_height, pad_width)
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+ hidden_states = hidden_states.to(query.dtype)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ if attn.downsampler is not None:
+ hidden_states = attn.downsampler.reverse(hidden_states, t=frame, h=height, w=width)
+ return hidden_states
+
+
+
+class FeedForward_Conv3d(nn.Module):
+ def __init__(self, downsampler, dim, hidden_features, bias=True):
+ super(FeedForward_Conv3d, self).__init__()
+
+ self.bias = bias
+
+ self.project_in = nn.Linear(dim, hidden_features, bias=bias)
+
+ self.dwconv = nn.ModuleList([
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(5, 5, 5), stride=1, padding=(2, 2, 2), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(3, 3, 3), stride=1, padding=(1, 1, 1), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(1, 1, 1), stride=1, padding=(0, 0, 0), dilation=1,
+ groups=hidden_features, bias=bias)
+ ])
+
+ self.project_out = nn.Linear(hidden_features, dim, bias=bias)
+
+
+ def forward(self, x, t, h, w):
+ if npu_config is None:
+ x = self.project_in(x)
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + module(x)
+ out = rearrange(out, 'b d t h w -> b (t h w) d', t=t, h=h, w=w)
+ x = self.project_out(out)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.project_in, x, npu_config.replaced_type)
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + npu_config.run_conv3d(module, x, npu_config.replaced_type)
+ out = rearrange(out, 'b d t h w -> b (t h w) d', t=t, h=h, w=w)
+ x = npu_config.run_conv3d(self.project_out, out, x_dtype)
+ return x
+
+
+class FeedForward_Conv2d(nn.Module):
+ def __init__(self, downsampler, dim, hidden_features, bias=True):
+ super(FeedForward_Conv2d, self).__init__()
+
+ self.bias = bias
+
+ self.project_in = nn.Linear(dim, hidden_features, bias=bias)
+
+ self.dwconv = nn.ModuleList([
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(5, 5), stride=1, padding=(2, 2), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(3, 3), stride=1, padding=(1, 1), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(1, 1), stride=1, padding=(0, 0), dilation=1,
+ groups=hidden_features, bias=bias)
+ ])
+
+ self.project_out = nn.Linear(hidden_features, dim, bias=bias)
+
+
+ def forward(self, x, t, h, w):
+ x = self.project_in(x)
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + module(x)
+ out = rearrange(out, '(b t) d h w -> b (t h w) d', t=t, h=h, w=w)
+ x = self.project_out(out)
+ return x
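+
+# A minimal usage sketch (hypothetical sizes): three parallel depthwise convs
+# (5x5, 3x3, 1x1) are applied to the gelu-activated projection and summed.
+# ff = FeedForward_Conv2d(None, dim=64, hidden_features=128)
+# x = torch.randn(2, 4 * 8 * 8, 64) # (b, t*h*w, dim)
+# ff(x, t=4, h=8, w=8).shape == (2, 256, 64)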
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+ r"""
+ A basic Transformer block.
+
+ Parameters:
+ dim (`int`): The number of channels in the input and output.
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm (`int`, *optional*):
+ The number of diffusion steps used during training. See `Transformer2DModel`.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Configure if the attentions should contain a bias parameter.
+ only_cross_attention (`bool`, *optional*):
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
+ double_self_attention (`bool`, *optional*):
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
+ upcast_attention (`bool`, *optional*):
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+ Whether to use learnable elementwise affine parameters for normalization.
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+ final_dropout (`bool` *optional*, defaults to False):
+ Whether to apply a final dropout after the last feed-forward layer.
+ attention_type (`str`, *optional*, defaults to `"default"`):
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+ positional_embeddings (`str`, *optional*, defaults to `None`):
+ The type of positional embeddings to apply.
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
+ The maximum number of positional embeddings to apply.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ attention_bias: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_elementwise_affine: bool = True,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_eps: float = 1e-5,
+ final_dropout: bool = False,
+ attention_type: str = "default",
+ positional_embeddings: Optional[str] = None,
+ num_positional_embeddings: Optional[int] = None,
+ ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+ ada_norm_bias: Optional[int] = None,
+ ff_inner_dim: Optional[int] = None,
+ ff_bias: bool = True,
+ attention_out_bias: bool = True,
+ attention_mode: str = "xformers",
+ downsampler: str = None,
+ use_rope: bool = False,
+ interpolation_scale_thw: Tuple[int] = (1, 1, 1),
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ sparse_group: bool = False,
+ ):
+ super().__init__()
+ self.only_cross_attention = only_cross_attention
+ self.downsampler = downsampler
+
+ # We keep these boolean flags for backward-compatibility.
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+ self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+ self.use_layer_norm = norm_type == "layer_norm"
+ self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+ )
+
+ self.norm_type = norm_type
+ self.num_embeds_ada_norm = num_embeds_ada_norm
+
+ if positional_embeddings and (num_positional_embeddings is None):
+ raise ValueError(
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+ )
+
+ if positional_embeddings == "sinusoidal":
+ self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+ else:
+ self.pos_embed = None
+
+ # Define 3 blocks. Each block has its own normalization layer.
+ # 1. Self-Attn
+ if norm_type == "ada_norm":
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_zero":
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm1 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+ self.attn1 = Attention(
+ query_dim=dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ attention_mode=attention_mode,
+ downsampler=downsampler,
+ use_rope=use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ sparse_group=sparse_group,
+ is_cross_attn=False,
+ )
+
+ # 2. Cross-Attn
+ if cross_attention_dim is not None or double_self_attention:
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+ # the second cross attention block.
+ if norm_type == "ada_norm":
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm2 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+ self.attn2 = Attention(
+ query_dim=dim,
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ attention_mode=attention_mode,
+ downsampler=False,
+ use_rope=False,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ sparse_group=sparse_group,
+ is_cross_attn=True,
+ ) # is self-attn if encoder_hidden_states is none
+ else:
+ self.norm2 = None
+ self.attn2 = None
+
+ # 3. Feed-forward
+ if norm_type == "ada_norm_continuous":
+ self.norm3 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "layer_norm",
+ )
+
+ elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]:
+ self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+ elif norm_type == "layer_norm_i2vgen":
+ self.norm3 = None
+
+ if downsampler:
+ downsampler_ker_size = list(re.search(r'k(\d{2,3})', downsampler).group(1)) # 122
+ # if len(downsampler_ker_size) == 3:
+ # self.ff = FeedForward_Conv3d(
+ # downsampler,
+ # dim,
+ # 2 * dim,
+ # bias=ff_bias,
+ # )
+ # elif len(downsampler_ker_size) == 2:
+ self.ff = FeedForward_Conv2d(
+ downsampler,
+ dim,
+ 2 * dim,
+ bias=ff_bias,
+ )
+ else:
+ self.ff = FeedForward(
+ dim,
+ dropout=dropout,
+ activation_fn=activation_fn,
+ final_dropout=final_dropout,
+ inner_dim=ff_inner_dim,
+ bias=ff_bias,
+ )
+
+ # 4. Fuser
+ if attention_type == "gated" or attention_type == "gated-text-image":
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+ # 5. Scale-shift for PixArt-Alpha.
+ if norm_type == "ada_norm_single":
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+ # let chunk size default to None
+ self._chunk_size = None
+ self._chunk_dim = 0
+
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+ # Sets chunk feed-forward
+ self._chunk_size = chunk_size
+ self._chunk_dim = dim
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ frame: int = None,
+ height: int = None,
+ width: int = None,
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+ ) -> torch.FloatTensor:
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+ # Notice that normalization is always applied before the real computation in the following blocks.
+ # 0. Self-Attention
+ batch_size = hidden_states.shape[0]
+
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm1(hidden_states, timestep)
+ elif self.norm_type == "ada_norm_zero":
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm1(hidden_states)
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif self.norm_type == "ada_norm_single":
+ if get_sequence_parallel_state():
+ batch_size = hidden_states.shape[1]
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[:, None] + timestep.reshape(6, batch_size, -1)
+ ).chunk(6, dim=0)
+ else:
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+ ).chunk(6, dim=1)
+ norm_hidden_states = self.norm1(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+ # norm_hidden_states = norm_hidden_states.squeeze(1)
+ else:
+ raise ValueError("Incorrect norm used")
+
+ if self.pos_embed is not None:
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ # 1. Prepare GLIGEN inputs
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ if self.norm_type == "ada_norm_zero":
+ attn_output = gate_msa.unsqueeze(1) * attn_output
+ elif self.norm_type == "ada_norm_single":
+ attn_output = gate_msa * attn_output
+
+ hidden_states = attn_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ # 1.2 GLIGEN Control
+ if gligen_kwargs is not None:
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+ # 3. Cross-Attention
+ if self.attn2 is not None:
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm2(hidden_states, timestep)
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm2(hidden_states)
+ elif self.norm_type == "ada_norm_single":
+ # For PixArt norm2 isn't applied here:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+ norm_hidden_states = hidden_states
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ else:
+ raise ValueError("Incorrect norm")
+
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ attn_output = self.attn2(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ hidden_states = attn_output + hidden_states
+
+ # 4. Feed-forward
+ # i2vgen doesn't have this norm
+ if self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif not self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm3(hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+ if self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm2(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+ # if self._chunk_size is not None:
+ # # "feed_forward_chunk_size" can be used to save memory
+ # ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+ # else:
+
+ if self.downsampler:
+ ff_output = self.ff(norm_hidden_states, t=frame, h=height, w=width)
+ else:
+ ff_output = self.ff(norm_hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
+ elif self.norm_type == "ada_norm_single":
+ ff_output = gate_mlp * ff_output
+
+ hidden_states = ff_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ return hidden_states
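+
+# Note: the block applies, in order, norm1 -> self-attn (attn1) -> optional
+# GLIGEN fuser -> norm2 -> cross-attn (attn2) -> norm3 -> feed-forward, with
+# the ada_norm_single scale/shift/gate terms applied around attn1 and the FF,
+# following the PixArt-Alpha recipe.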
diff --git a/opensora/models/diffusion/opensora1/rope.py b/opensora/models/diffusion/opensora1/rope.py
new file mode 100644
index 000000000..c127727a0
--- /dev/null
+++ b/opensora/models/diffusion/opensora1/rope.py
@@ -0,0 +1,98 @@
+import torch
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config, set_run_dtype
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ from opensora.utils.parallel_states import get_sequence_parallel_state
+
+class PositionGetter3D(object):
+ """ return positions of patches """
+
+ def __init__(self, ):
+ self.cache_positions = {}
+
+ def __call__(self, b, t, h, w, device):
+ if (b, t, h, w) not in self.cache_positions:
+ x = torch.arange(w, device=device)
+ y = torch.arange(h, device=device)
+ z = torch.arange(t, device=device)
+ pos = torch.cartesian_prod(z, y, x)
+ if get_sequence_parallel_state():
+ pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, -1, 1).contiguous().expand(3, -1, b).clone()
+ else:
+ pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, 1, -1).contiguous().expand(3, b, -1).clone()
+ poses = (pos[0].contiguous(), pos[1].contiguous(), pos[2].contiguous())
+ max_poses = (int(poses[0].max()), int(poses[1].max()), int(poses[2].max()))
+
+ self.cache_positions[b, t, h, w] = (poses, max_poses)
+ pos = self.cache_positions[b, t, h, w]
+
+ return pos
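+
+# Example (shapes only, sequence parallelism off):
+# PositionGetter3D()(b=2, t=4, h=8, w=8, device='cpu') returns
+# ((t_pos, y_pos, x_pos), (3, 7, 7)) with each pos tensor of shape (2, 256).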
+
+
+class RoPE3D(torch.nn.Module):
+
+ def __init__(self, freq=10000.0, F0=1.0, interpolation_scale_thw=(1, 1, 1)):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.interpolation_scale_t = interpolation_scale_thw[0]
+ self.interpolation_scale_h = interpolation_scale_thw[1]
+ self.interpolation_scale_w = interpolation_scale_thw[2]
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype, interpolation_scale=1):
+ # include the scale in the cache key so the t/h/w axes cannot collide
+ if (D, seq_len, device, dtype, interpolation_scale) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) / interpolation_scale
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D, seq_len, device, dtype, interpolation_scale] = (cos, sin)
+ return self.cache[D, seq_len, device, dtype, interpolation_scale]
+
+ @staticmethod
+ def rotate_half(x):
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=-1)
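+
+ # e.g. rotate_half([a, b, c, d]) == [-c, -d, a, b] along the last dim,
+ # the standard RoPE half-rotation consumed by apply_rope1d below.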
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim == 2
+ if not get_sequence_parallel_state():
+ # for (batch_size x nheads x ntokens x dim)
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ else:
+ # for (batch_size x ntokens x nheads x dim)
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, :, None, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, :, None, :]
+
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 3 (t, y and x position of each token)
+ output:
+ * tokens after applying RoPE3D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3) % 3 == 0, "number of dimensions should be a multiple of three"
+ D = tokens.size(3) // 3
+ poses, max_poses = positions
+ assert len(poses) == 3 and poses[0].ndim == 2 # each position tensor is (batch, ntokens)
+ cos_t, sin_t = self.get_cos_sin(D, max_poses[0] + 1, tokens.device, tokens.dtype, self.interpolation_scale_t)
+ cos_y, sin_y = self.get_cos_sin(D, max_poses[1] + 1, tokens.device, tokens.dtype, self.interpolation_scale_h)
+ cos_x, sin_x = self.get_cos_sin(D, max_poses[2] + 1, tokens.device, tokens.dtype, self.interpolation_scale_w)
+ # split features into three along the feature dimension, and apply rope1d on each half
+ t, y, x = tokens.chunk(3, dim=-1)
+ t = self.apply_rope1d(t, poses[0], cos_t, sin_t)
+ y = self.apply_rope1d(y, poses[1], cos_y, sin_y)
+ x = self.apply_rope1d(x, poses[2], cos_x, sin_x)
+ tokens = torch.cat((t, y, x), dim=-1)
+ return tokens
\ No newline at end of file
diff --git a/opensora/models/diffusion/opensora2/__init__.py b/opensora/models/diffusion/opensora2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opensora/models/diffusion/opensora2/modeling_inpaint.py b/opensora/models/diffusion/opensora2/modeling_inpaint.py
new file mode 100644
index 000000000..06ce4e4da
--- /dev/null
+++ b/opensora/models/diffusion/opensora2/modeling_inpaint.py
@@ -0,0 +1,269 @@
+import os
+import numpy as np
+from torch import nn
+import torch
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from diffusers.configuration_utils import register_to_config
+from opensora.models.diffusion.opensora2.modules import PatchEmbed2D
+from opensora.utils.utils import to_2tuple
+
+
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V
+
+import glob
+
+def zero_module(module):
+ for p in module.parameters():
+ nn.init.zeros_(p)
+ return module
+
+def reconstitute_checkpoint(pretrained_checkpoint, model_state_dict):
+ pretrained_keys = set(pretrained_checkpoint.keys())
+ model_keys = set(model_state_dict.keys())
+ common_keys = list(pretrained_keys & model_keys)
+ checkpoint = {k: pretrained_checkpoint[k] for k in common_keys if model_state_dict[k].numel() == pretrained_checkpoint[k].numel()}
+ return checkpoint
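+
+# Example: given pretrained {'a': (2, 2), 'b': (4,)} and a model expecting
+# {'a': (2, 2), 'b': (2,)}, only 'a' survives -- keys must exist in both
+# state dicts *and* match in element count, so stale weights are dropped
+# instead of raising inside load_state_dict(..., strict=False).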
+
+
+class OpenSoraInpaint(OpenSoraT2V):
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ sample_size_t: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ patch_size: Optional[int] = None,
+ patch_size_t: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_elementwise_affine: bool = True,
+ norm_eps: float = 1e-5,
+ attention_type: str = "default",
+ caption_channels: int = None,
+ interpolation_scale_h: float = None,
+ interpolation_scale_w: float = None,
+ interpolation_scale_t: float = None,
+ use_additional_conditions: Optional[bool] = None,
+ attention_mode: str = 'xformers',
+ downsampler: str = None,
+ use_rope: bool = False,
+ use_stable_fp32: bool = False,
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ use_motion: bool = False,
+ # inpaint
+ vae_scale_factor_t: int = 4,
+ ):
+ super().__init__(
+ num_attention_heads=num_attention_heads,
+ attention_head_dim=attention_head_dim,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ num_layers=num_layers,
+ dropout=dropout,
+ norm_num_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim,
+ attention_bias=attention_bias,
+ sample_size=sample_size,
+ sample_size_t=sample_size_t,
+ num_vector_embeds=num_vector_embeds,
+ patch_size=patch_size,
+ patch_size_t=patch_size_t,
+ activation_fn=activation_fn,
+ num_embeds_ada_norm=num_embeds_ada_norm,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention,
+ double_self_attention=double_self_attention,
+ upcast_attention=upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=norm_elementwise_affine,
+ norm_eps=norm_eps,
+ attention_type=attention_type,
+ caption_channels=caption_channels,
+ interpolation_scale_h=interpolation_scale_h,
+ interpolation_scale_w=interpolation_scale_w,
+ interpolation_scale_t=interpolation_scale_t,
+ use_additional_conditions=use_additional_conditions,
+ attention_mode=attention_mode,
+ downsampler=downsampler,
+ use_rope=use_rope,
+ use_stable_fp32=use_stable_fp32,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ use_motion=use_motion,
+ )
+
+ self.vae_scale_factor_t = vae_scale_factor_t
+ # init masked_pixel_values and mask conv_in
+ self._init_patched_inputs_for_inpainting()
+
+ def _init_patched_inputs_for_inpainting(self):
+
+ assert self.config.sample_size_t is not None, "OpenSoraInpaint over patched input must provide sample_size_t"
+ assert self.config.sample_size is not None, "OpenSoraInpaint over patched input must provide sample_size"
+ #assert not (self.config.sample_size_t == 1 and self.config.patch_size_t == 2), "Image do not need patchfy in t-dim"
+
+ self.num_frames = self.config.sample_size_t
+ self.config.sample_size = to_2tuple(self.config.sample_size)
+ self.height = self.config.sample_size[0]
+ self.width = self.config.sample_size[1]
+ self.patch_size_t = self.config.patch_size_t
+ self.patch_size = self.config.patch_size
+ interpolation_scale_t = ((self.config.sample_size_t - 1) // 16 + 1) if self.config.sample_size_t % 2 == 1 else self.config.sample_size_t / 16
+ interpolation_scale_t = (
+ self.config.interpolation_scale_t if self.config.interpolation_scale_t is not None else interpolation_scale_t
+ )
+ interpolation_scale = (
+ self.config.interpolation_scale_h if self.config.interpolation_scale_h is not None else self.config.sample_size[0] / 30,
+ self.config.interpolation_scale_w if self.config.interpolation_scale_w is not None else self.config.sample_size[1] / 40,
+ )
+
+ self.pos_embed_mask = nn.ModuleList(
+ [
+ PatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.vae_scale_factor_t, # adapt for mask
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ ),
+ zero_module(nn.Linear(self.inner_dim, self.inner_dim, bias=False)),
+ ]
+ )
+ self.pos_embed_masked_hidden_states = nn.ModuleList(
+ [
+ PatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ ),
+ zero_module(nn.Linear(self.inner_dim, self.inner_dim, bias=False)),
+ ]
+ )
+
+ def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, motion_score, batch_size, frame, use_image_num):
+ # inpaint
+ assert hidden_states.shape[1] == 2 * self.config.in_channels + self.vae_scale_factor_t
+ in_channels = self.config.in_channels
+
+ input_hidden_states, input_masked_hidden_states, input_mask = hidden_states[:, :in_channels], hidden_states[:, in_channels: 2 * in_channels], hidden_states[:, 2 * in_channels:]
+
+ input_hidden_states = self.pos_embed(input_hidden_states.to(self.dtype), frame)
+
+ input_masked_hidden_states = self.pos_embed_masked_hidden_states[0](input_masked_hidden_states.to(self.dtype), frame)
+ input_masked_hidden_states = self.pos_embed_masked_hidden_states[1](input_masked_hidden_states)
+
+ input_mask = self.pos_embed_mask[0](input_mask.to(self.dtype), frame)
+ input_mask = self.pos_embed_mask[1](input_mask)
+
+ hidden_states = input_hidden_states + input_masked_hidden_states + input_mask
+
+ if self.adaln_single is not None:
+ if self.use_additional_conditions and added_cond_kwargs is None:
+ raise ValueError(
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+ )
+ timestep, embedded_timestep = self.adaln_single(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+ if self.motion_projection is not None:
+ assert motion_score is not None
+ motion_embed = self.motion_projection(motion_score, batch_size=batch_size, hidden_dtype=self.dtype) # b 6d
+ timestep = timestep + motion_embed
+
+ if self.caption_projection is not None:
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ assert encoder_hidden_states.shape[1] == 1
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b 1 l d -> (b 1) l d')
+
+ return hidden_states, encoder_hidden_states, timestep, embedded_timestep
+
+ def transformer_model_custom_load_state_dict(self, pretrained_model_path):
+ pattern = os.path.join(pretrained_model_path, 'diffusion_pytorch_model.*')
+ matches = glob.glob(pattern)
+ assert len(matches) > 0, f"Cannot find pretrained model matching {pattern}"
+ pretrained_model_path = matches[0]
+
+ print(f'Loading {self.__class__.__name__} pretrained weights...')
+ print(f'Loading pretrained model from {pretrained_model_path}...')
+ model_state_dict = self.state_dict()
+ if 'safetensors' in pretrained_model_path: # pixart series
+ from safetensors.torch import load_file as safe_load
+ pretrained_checkpoint = safe_load(pretrained_model_path, device="cpu")
+ else: # latest stage training weight
+ pretrained_checkpoint = torch.load(pretrained_model_path, map_location='cpu')
+ if 'model' in pretrained_checkpoint:
+ pretrained_checkpoint = pretrained_checkpoint['model']
+ checkpoint = reconstitute_checkpoint(pretrained_checkpoint, model_state_dict)
+
+ if 'pos_embed_masked_hidden_states.0.proj.weight' not in checkpoint:
+ checkpoint['pos_embed_masked_hidden_states.0.proj.weight'] = checkpoint['pos_embed.proj.weight']
+ checkpoint['pos_embed_masked_hidden_states.0.proj.bias'] = checkpoint['pos_embed.proj.bias']
+
+ missing_keys, unexpected_keys = self.load_state_dict(checkpoint, strict=False)
+ print(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
+        print(f'Successfully loaded {len(self.state_dict()) - len(missing_keys)}/{len(model_state_dict)} keys from {pretrained_model_path}!')
+
+ def custom_load_state_dict(self, pretrained_model_path):
+ assert isinstance(pretrained_model_path, dict), "pretrained_model_path must be a dict"
+
+ pretrained_transformer_model_path = pretrained_model_path.get('transformer_model', None)
+
+ self.transformer_model_custom_load_state_dict(pretrained_transformer_model_path)
+
+def OpenSoraInpaint_S_122(**kwargs):
+ return OpenSoraInpaint(num_layers=32, attention_head_dim=96, num_attention_heads=8, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=768, **kwargs)
+
+def OpenSoraInpaint_B_122(**kwargs):
+ return OpenSoraInpaint(num_layers=32, attention_head_dim=96, num_attention_heads=16, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=1536, **kwargs)
+
+def OpenSoraInpaint_L_122(**kwargs):
+ return OpenSoraInpaint(num_layers=32, attention_head_dim=96, num_attention_heads=24, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=2304, **kwargs)
+
+OpenSoraInpaint_models = {
+ "OpenSoraInpaint-S/122": OpenSoraInpaint_S_122, # 0.3B
+ "OpenSoraInpaint-B/122": OpenSoraInpaint_B_122, # 1.2B
+ "OpenSoraInpaint-L/122": OpenSoraInpaint_L_122, # 2.7B
+}
+
+OpenSoraInpaint_models_class = {
+ "OpenSoraInpaint-S/122": OpenSoraInpaint,
+ "OpenSoraInpaint-B/122": OpenSoraInpaint,
+ "OpenSoraInpaint-L/122": OpenSoraInpaint,
+}
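+
+# Minimal usage sketch (illustrative assumptions only): variants are selected by
+# name from the registry, e.g.
+#   model = OpenSoraInpaint_models["OpenSoraInpaint-S/122"](
+#       in_channels=8, out_channels=8, sample_size=(32, 32), sample_size_t=4)
+# The channel count and sample sizes above are hypothetical placeholders, not
+# trained configurations.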
diff --git a/opensora/models/diffusion/opensora2/modeling_opensora.py b/opensora/models/diffusion/opensora2/modeling_opensora.py
new file mode 100644
index 000000000..a0326bfee
--- /dev/null
+++ b/opensora/models/diffusion/opensora2/modeling_opensora.py
@@ -0,0 +1,595 @@
+import os
+import numpy as np
+from torch import nn
+import torch
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from torch.nn import functional as F
+from diffusers.models.transformer_2d import Transformer2DModelOutput
+from diffusers.utils import is_torch_version, deprecate, logging
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+from diffusers.models.embeddings import PixArtAlphaTextProjection
+from opensora.models.diffusion.opensora2.modules import MotionAdaLayerNormSingle, PatchEmbed2D, BasicTransformerBlock
+from opensora.utils.utils import to_2tuple
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+except ImportError:
+ torch_npu = None
+ npu_config = None
+    from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
+
+logger = logging.get_logger(__name__)
+
+class OpenSoraT2V(ModelMixin, ConfigMixin):
+ """
+    A Transformer over 2D patches with 3D (t, h, w) position embeddings for video-like latent data.
+
+ Parameters:
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+ in_channels (`int`, *optional*):
+ The number of channels in the input and output (specify if the input is **continuous**).
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+ This is fixed during training since it is used to learn a number of position embeddings.
+ num_vector_embeds (`int`, *optional*):
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+ Includes the class for the masked latent pixel.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+ num_embeds_ada_norm ( `int`, *optional*):
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+ added to the hidden states.
+
+            During inference, you can denoise for up to but not more than `num_embeds_ada_norm` steps.
+ attention_bias (`bool`, *optional*):
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
+ """
+
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ sample_size_t: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ patch_size: Optional[int] = None,
+ patch_size_t: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_elementwise_affine: bool = True,
+ norm_eps: float = 1e-5,
+ attention_type: str = "default",
+        caption_channels: Optional[int] = None,
+        interpolation_scale_h: Optional[float] = None,
+        interpolation_scale_w: Optional[float] = None,
+        interpolation_scale_t: Optional[float] = None,
+ use_additional_conditions: Optional[bool] = None,
+ attention_mode: str = 'xformers',
+        downsampler: Optional[str] = None,
+ use_rope: bool = False,
+ use_stable_fp32: bool = False,
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ use_motion: bool = False,
+ ):
+ super().__init__()
+
+ # Validate inputs.
+ if patch_size is not None:
+ if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single"]:
+ raise NotImplementedError(
+ f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+ )
+ elif norm_type in ["ada_norm", "ada_norm_zero"] and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+ )
+
+ # Set some common variables used across the board.
+ self.use_motion = use_motion
+ self.sparse1d = sparse1d
+ self.sparse2d = sparse2d
+ self.sparse_n = sparse_n
+ self.use_rope = use_rope
+ self.use_linear_projection = use_linear_projection
+ self.interpolation_scale_t = interpolation_scale_t
+ self.interpolation_scale_h = interpolation_scale_h
+ self.interpolation_scale_w = interpolation_scale_w
+ self.downsampler = downsampler
+ self.caption_channels = caption_channels
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.gradient_checkpointing = False
+ self.config.hidden_size = self.inner_dim
+ use_additional_conditions = False
+ # if use_additional_conditions is None:
+ # if norm_type == "ada_norm_single" and sample_size == 128:
+ # use_additional_conditions = True
+ # else:
+ # use_additional_conditions = False
+ self.use_additional_conditions = use_additional_conditions
+
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+ # Define whether input is continuous or discrete depending on configuration
+ assert in_channels is not None and patch_size is not None
+
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+ deprecation_message = (
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+ )
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+ norm_type = "ada_norm"
+
+ # 2. Initialize the right blocks.
+ # Initialize the output blocks and other projection blocks when necessary.
+ self._init_patched_inputs(norm_type=norm_type)
+
+ def _init_patched_inputs(self, norm_type):
+ assert self.config.sample_size_t is not None, "OpenSoraT2V over patched input must provide sample_size_t"
+ assert self.config.sample_size is not None, "OpenSoraT2V over patched input must provide sample_size"
+ #assert not (self.config.sample_size_t == 1 and self.config.patch_size_t == 2), "Image do not need patchfy in t-dim"
+
+ self.num_frames = self.config.sample_size_t
+ self.config.sample_size = to_2tuple(self.config.sample_size)
+ self.height = self.config.sample_size[0]
+ self.width = self.config.sample_size[1]
+ self.patch_size_t = self.config.patch_size_t
+ self.patch_size = self.config.patch_size
+ interpolation_scale_t = ((self.config.sample_size_t - 1) // 16 + 1) if self.config.sample_size_t % 2 == 1 else self.config.sample_size_t / 16
+ interpolation_scale_t = (
+ self.config.interpolation_scale_t if self.config.interpolation_scale_t is not None else interpolation_scale_t
+ )
+ interpolation_scale = (
+ self.config.interpolation_scale_h if self.config.interpolation_scale_h is not None else self.config.sample_size[0] / 30,
+ self.config.interpolation_scale_w if self.config.interpolation_scale_w is not None else self.config.sample_size[1] / 40,
+ )
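+        # Default scales normalize the latent grid against a 30x40 base
+        # (presumably 240x320 pixels with an 8x spatial VAE stride).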
+ self.pos_embed = PatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+ interpolation_scale_thw = (interpolation_scale_t, *interpolation_scale)
+ self.transformer_blocks = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.config.cross_attention_dim,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ downsampler=self.config.downsampler,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=self.sparse1d if i > 1 and i < 30 else False,
+ sparse2d=self.sparse2d if i > 1 and i < 30 else False,
+ sparse_n=self.sparse_n,
+ sparse_group=i % 2 == 1,
+ )
+ for i in range(self.config.num_layers)
+ ]
+ )
+
+ if self.config.norm_type != "ada_norm_single":
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+ self.proj_out_2 = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+ elif self.config.norm_type == "ada_norm_single":
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+ self.proj_out = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+
+ # PixArt-Alpha blocks.
+ self.adaln_single = None
+ if self.config.norm_type == "ada_norm_single":
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+ # additional conditions until we find better name
+ self.adaln_single = AdaLayerNormSingle(
+ self.inner_dim, use_additional_conditions=self.use_additional_conditions
+ )
+
+ self.caption_projection = None
+ if self.caption_channels is not None:
+ self.caption_projection = PixArtAlphaTextProjection(
+ in_features=self.caption_channels, hidden_size=self.inner_dim
+ )
+ self.motion_projection = None
+ if self.use_motion:
+ self.motion_projection = MotionAdaLayerNormSingle(self.inner_dim)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ timestep: Optional[torch.LongTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ use_image_num: Optional[int] = 0,
+ motion_score: Optional[torch.FloatTensor] = None,
+ return_dict: bool = True,
+ ):
+ """
+        The [`OpenSoraT2V`] forward method.
+
+ Args:
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+ Input `hidden_states`.
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+ self-attention.
+ timestep ( `torch.LongTensor`, *optional*):
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+ `AdaLayerZeroNorm`.
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ attention_mask ( `torch.Tensor`, *optional*):
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+ negative values to the attention scores corresponding to "discard" tokens.
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+ above. This bias will be added to the cross-attention scores.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+ tuple.
+
+ Returns:
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+ `tuple` where the first element is the sample tensor.
+ """
+ batch_size, c, frame, h, w = hidden_states.shape
+        assert use_image_num == 0
+        frame = frame - use_image_num  # no-op here, since use_image_num is asserted to be 0
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+ if attention_mask is not None and attention_mask.ndim == 4:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
+ # b, frame+use_image_num, h, w -> a video with images
+ # b, 1, h, w -> only images
+ attention_mask = attention_mask.to(self.dtype)
+ if get_sequence_parallel_state():
+ if npu_config is not None:
+ attention_mask = attention_mask[:, :frame * hccl_info.world_size] # b, frame, h, w
+ else:
+ attention_mask = attention_mask[:, :frame * nccl_info.world_size] # b, frame, h, w
+ else:
+ attention_mask = attention_mask[:, :frame] # b, frame, h, w
+
+ attention_mask = attention_mask.unsqueeze(1) # b 1 t h w
+ attention_mask = F.max_pool3d(attention_mask, kernel_size=(self.patch_size_t, self.patch_size, self.patch_size),
+ stride=(self.patch_size_t, self.patch_size, self.patch_size))
+ attention_mask = rearrange(attention_mask, 'b 1 t h w -> (b 1) 1 (t h w)')
+ attention_mask = (1 - attention_mask.bool().to(self.dtype)) * -10000.0
+
+
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ # import ipdb;ipdb.set_trace()
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 3:
+ # b, 1, l -> only images
+ encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
+
+
+ # if npu_config is not None and attention_mask is not None:
+ # attention_mask = npu_config.get_attention_mask(attention_mask, attention_mask.shape[-1])
+ # encoder_attention_mask = npu_config.get_attention_mask(encoder_attention_mask, attention_mask.shape[-2])
+
+
+ # 1. Input
+        frame = ((frame - 1) // self.patch_size_t + 1) if frame % 2 == 1 else frame // self.patch_size_t  # patchify
+ # print('frame', frame)
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+
+        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}  # additional conditions are disabled, so override whatever was passed in
+ hidden_states, encoder_hidden_states, timestep, embedded_timestep = self._operate_on_patched_inputs(
+ hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, motion_score, batch_size, frame, use_image_num
+ )
+ # 2. Blocks
+ # import ipdb;ipdb.set_trace()
+ if get_sequence_parallel_state():
+ hidden_states = rearrange(hidden_states, 'b s h -> s b h', b=batch_size).contiguous()
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b s h -> s b h', b=batch_size).contiguous()
+ timestep = timestep.view(batch_size, 6, -1).transpose(0, 1).contiguous()
+
+ for block in self.transformer_blocks:
+ if self.training and self.gradient_checkpointing:
+
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states,
+ attention_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ timestep,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ hidden_states = block(
+ hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+
+ if get_sequence_parallel_state():
+ hidden_states = rearrange(hidden_states, 's b h -> b s h', b=batch_size).contiguous()
+
+ # 3. Output
+ output = self._get_output_for_patched_inputs(
+ hidden_states=hidden_states,
+ timestep=timestep,
+ class_labels=class_labels,
+ embedded_timestep=embedded_timestep,
+ num_frames=frame,
+ height=height,
+ width=width,
+ ) # b c t h w
+
+ if not return_dict:
+ return (output,)
+
+ return Transformer2DModelOutput(sample=output)
+
+
+ def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, motion_score, batch_size, frame, use_image_num):
+
+ hidden_states = self.pos_embed(hidden_states.to(self.dtype), frame)
+ if self.adaln_single is not None:
+ if self.use_additional_conditions and added_cond_kwargs is None:
+ raise ValueError(
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+ )
+ timestep, embedded_timestep = self.adaln_single(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+ if self.motion_projection is not None:
+ assert motion_score is not None
+ motion_embed = self.motion_projection(motion_score, batch_size=batch_size, hidden_dtype=self.dtype) # b 6d
+ # print('use self.motion_projection, motion_embed:', torch.sum(motion_embed))
+ timestep = timestep + motion_embed
+
+ if self.caption_projection is not None:
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ assert encoder_hidden_states.shape[1] == 1
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b 1 l d -> (b 1) l d')
+
+ return hidden_states, encoder_hidden_states, timestep, embedded_timestep
+
+
+
+ def _get_output_for_patched_inputs(
+ self, hidden_states, timestep, class_labels, embedded_timestep, num_frames, height=None, width=None
+ ):
+ # import ipdb;ipdb.set_trace()
+ if self.config.norm_type != "ada_norm_single":
+ conditioning = self.transformer_blocks[0].norm1.emb(
+ timestep, class_labels, hidden_dtype=self.dtype
+ )
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+ hidden_states = self.proj_out_2(hidden_states)
+ elif self.config.norm_type == "ada_norm_single":
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states)
+ # Modulation
+ hidden_states = hidden_states * (1 + scale) + shift
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states.squeeze(1)
+
+ # unpatchify
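+        # Tokens of shape (N, T*H*W, p_t*p*p*C) are reshaped and the einsum
+        # "nthwopqc->nctohpwq" re-interleaves the per-patch offsets back into the
+        # full (N, C, T*p_t, H*p, W*p) latent grid.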
+ if self.adaln_single is None:
+ height = width = int(hidden_states.shape[1] ** 0.5)
+ hidden_states = hidden_states.reshape(
+ shape=(-1, num_frames, height, width, self.patch_size_t, self.patch_size, self.patch_size, self.out_channels)
+ )
+ hidden_states = torch.einsum("nthwopqc->nctohpwq", hidden_states)
+ output = hidden_states.reshape(
+ shape=(-1, self.out_channels, num_frames * self.patch_size_t, height * self.patch_size, width * self.patch_size)
+ )
+ return output
+
+def OpenSoraT2V_S_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=8, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=768, **kwargs)
+
+def OpenSoraT2V_B_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=16, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=1536, **kwargs)
+
+def OpenSoraT2V_L_122(**kwargs):
+ return OpenSoraT2V(num_layers=32, attention_head_dim=96, num_attention_heads=24, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, cross_attention_dim=2304, **kwargs)
+
+OpenSora2_models = {
+ "OpenSoraT2V2-S/122": OpenSoraT2V_S_122, # 0.3B
+ "OpenSoraT2V2-B/122": OpenSoraT2V_B_122, # 1.2B
+ "OpenSoraT2V2-L/122": OpenSoraT2V_L_122, # 2.7B
+}
+
+OpenSora2_models_class = {
+ "OpenSoraT2V2-S/122": OpenSoraT2V,
+ "OpenSoraT2V2-B/122": OpenSoraT2V,
+ "OpenSoraT2V2-L/122": OpenSoraT2V,
+}
+
+if __name__ == '__main__':
+ from opensora.models.causalvideovae import ae_stride_config, ae_channel_config
+ from opensora.models.causalvideovae import ae_norm, ae_denorm
+ from opensora.models import CausalVAEModelWrapper
+
+ args = type('args', (),
+ {
+ 'ae': 'CausalVAEModel_D8_4x8x8',
+ 'attention_mode': 'xformers',
+ 'use_rope': True,
+ 'model_max_length': 300,
+ 'max_height': 480,
+ 'max_width': 640,
+ 'num_frames': 29,
+ 'use_image_num': 0,
+ 'compress_kv_factor': 1,
+ 'interpolation_scale_t': 1,
+ 'interpolation_scale_h': 1,
+ 'interpolation_scale_w': 1,
+ "sparse1d": True,
+ "sparse2d": False,
+ "sparse_n": 4,
+ "rank": 64,
+ }
+ )
+ b = 16
+ c = 4
+ cond_c = 4096
+ num_timesteps = 1000
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ num_frames = (args.num_frames - 1) // ae_stride_t + 1
+
+ device = torch.device('cuda:0')
+ model = OpenSoraT2V_L_122(in_channels=c,
+ out_channels=c,
+ sample_size=latent_size,
+ sample_size_t=num_frames,
+ activation_fn="gelu-approximate",
+ attention_bias=True,
+ attention_type="default",
+ double_self_attention=False,
+ norm_elementwise_affine=False,
+ norm_eps=1e-06,
+ norm_num_groups=32,
+ num_vector_embeds=None,
+ only_cross_attention=False,
+ upcast_attention=False,
+ use_linear_projection=False,
+ use_additional_conditions=False,
+ downsampler=None,
+ interpolation_scale_t=args.interpolation_scale_t,
+ interpolation_scale_h=args.interpolation_scale_h,
+ interpolation_scale_w=args.interpolation_scale_w,
+ use_rope=args.use_rope,
+ sparse1d=args.sparse1d,
+ sparse2d=args.sparse2d,
+ sparse_n=args.sparse_n
+ ).to(device)
+
+    try:
+        from safetensors.torch import load_file as safe_load
+        path = "/storage/dataset/Open-Sora-Plan-v1.2.0/29x720p/diffusion_pytorch_model.safetensors"
+        ckpt = safe_load(path, device="cpu")  # torch.load cannot read .safetensors files
+        msg = model.load_state_dict(ckpt, strict=True)
+        print(msg)
+    except Exception as e:
+        print(e)
+ print(model)
+ print(f'{sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B')
+ # import sys;sys.exit()
+ x = torch.randn(b, c, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w).to(device)
+ cond = torch.randn(b, 1+args.use_image_num, args.model_max_length, cond_c).to(device)
+    attn_mask = torch.randint(0, 2, (b, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w)).to(device)  # b t h w
+ cond_mask = torch.randint(0, 2, (b, 1+args.use_image_num, args.model_max_length)).to(device) # B L or B 1+num_images L
+ timestep = torch.randint(0, 1000, (b,), device=device)
+ model_kwargs = dict(hidden_states=x, encoder_hidden_states=cond, attention_mask=attn_mask,
+ encoder_attention_mask=cond_mask, use_image_num=args.use_image_num, timestep=timestep)
+ with torch.no_grad():
+ output = model(**model_kwargs)
+ print(output[0].shape)
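+    # Expected: torch.Size([b, c, num_frames, latent_h, latent_w]); the sample
+    # keeps the input latent shape because unpatchify inverts the patch embedding.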
+
diff --git a/opensora/models/diffusion/opensora2/modules.py b/opensora/models/diffusion/opensora2/modules.py
new file mode 100644
index 000000000..1462fb4d4
--- /dev/null
+++ b/opensora/models/diffusion/opensora2/modules.py
@@ -0,0 +1,1472 @@
+from einops import rearrange
+from torch import nn
+import torch
+import numpy as np
+
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from typing import Any, Dict, Optional
+import re
+import torch
+import torch.nn.functional as F
+from torch import nn
+import diffusers
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.attention import FeedForward, GatedSelfAttentionDense
+from diffusers.models.attention_processor import Attention as Attention_
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding, Timesteps, TimestepEmbedding
+from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+from .rope import PositionGetter3D, RoPE3D
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config, set_run_dtype
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+ from opensora.acceleration.communications import all_to_all_SBH
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ set_run_dtype = None
+ from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
+ from opensora.utils.communications import all_to_all_SBH
+logger = logging.get_logger(__name__)
+
+def get_3d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+    """
+    grid_size: (t, h, w) tuple of grid sizes.
+    return: pos_embed: [t*h*w, embed_dim] or [extra_tokens+t*h*w, embed_dim] (w/ or w/o cls_token)
+    """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+ grid_t = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_h = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid_w = np.arange(grid_size[2], dtype=np.float32) / (grid_size[2] / base_size[2]) / interpolation_scale[2]
+ grid = np.meshgrid(grid_w, grid_h, grid_t) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([3, 1, grid_size[2], grid_size[1], grid_size[0]])
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
+ # import ipdb;ipdb.set_trace()
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 3 != 0:
+ raise ValueError("embed_dim must be divisible by 3")
+
+ # import ipdb;ipdb.set_trace()
+ # use 1/3 of dimensions to encode grid_t/h/w
+ emb_t = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0]) # (T*H*W, D/3)
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1]) # (T*H*W, D/3)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2]) # (T*H*W, D/3)
+
+ emb = np.concatenate([emb_t, emb_h, emb_w], axis=1) # (T*H*W, D)
+ return emb
+
+
+def get_2d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+    """
+    grid_size: (h, w) tuple of grid sizes.
+    return: pos_embed: [h*w, embed_dim] or [extra_tokens+h*w, embed_dim] (w/ or w/o cls_token)
+    """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+    # use half of the dimensions to encode grid_h, the other half for grid_w
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+def get_1d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+    """
+    grid_size: int length of the grid.
+    return: pos_embed: [grid_size, embed_dim] or [extra_tokens+grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid = np.arange(grid_size, dtype=np.float32) / (grid_size / base_size) / interpolation_scale
+ pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, grid) # (H*W, D/2)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position.
+    pos: a list of positions to be encoded, size (M,).
+    out: (M, D)
+    """
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
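+
+# Quick shape check for the sin/cos helpers (illustrative): embedding 8 positions
+# into 16 dims yields an (8, 16) array whose halves are the sin and cos terms.
+# >>> get_1d_sincos_pos_embed_from_grid(16, np.arange(8, dtype=np.float32)).shape
+# (8, 16)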
+
+
+
+class MotionEmbeddings(nn.Module):
+ """
+ From PixArt-Alpha.
+
+ Reference:
+ https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
+ """
+
+ def __init__(self, embedding_dim):
+ super().__init__()
+
+ self.motion_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+ self.motion_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+
+ def forward(self, motion_score, hidden_dtype):
+ motions_proj = self.motion_proj(motion_score)
+ motions_emb = self.motion_embedder(motions_proj.to(dtype=hidden_dtype)) # (N, D)
+ return motions_emb
+
+class MotionAdaLayerNormSingle(nn.Module):
+ r"""
+ Norm layer adaptive layer norm single (adaLN-single).
+
+ As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
+
+ Parameters:
+ embedding_dim (`int`): The size of each embedding vector.
+ use_additional_conditions (`bool`): To use additional conditions for normalization or not.
+ """
+
+ def __init__(self, embedding_dim: int):
+ super().__init__()
+
+ self.emb = MotionEmbeddings(embedding_dim)
+
+ self.silu = nn.SiLU()
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+
+ self.linear.weight.data.zero_()
+ if self.linear.bias is not None:
+ self.linear.bias.data.zero_()
+
+ def forward(
+ self,
+ motion_score: torch.Tensor,
+ batch_size: int,
+ hidden_dtype: Optional[torch.dtype] = None,
+    ) -> torch.Tensor:
+        if isinstance(motion_score, (float, int)):
+            motion_score = torch.tensor([motion_score], device=self.linear.weight.device)
+        assert motion_score.ndim == 1
+        if motion_score.shape[0] != batch_size:
+            motion_score = motion_score.repeat(batch_size // motion_score.shape[0])
+ assert motion_score.shape[0] == batch_size
+ # No modulation happening here.
+ embedded_motion = self.emb(motion_score, hidden_dtype=hidden_dtype)
+ return self.linear(self.silu(embedded_motion))
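+
+# Shape sketch (illustrative): MotionAdaLayerNormSingle(D) maps a scalar motion
+# score to a (batch_size, 6*D) conditioning vector added to the adaLN-single
+# timestep embedding, e.g.
+# >>> MotionAdaLayerNormSingle(32)(0.5, batch_size=2).shape
+# torch.Size([2, 192])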
+
+class PatchEmbed2D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ # assert num_frames == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv2d(
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
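+        # Both positional tables are non-persistent buffers: they are rebuilt from
+        # config on load, so checkpoints stay independent of training resolution.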
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # b c 1 h w
+ # assert latent.shape[-3] == 1 and num_frames == 1
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ # raise NotImplementedError
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+
+ if self.num_frames != num_frames:
+ # import ipdb;ipdb.set_trace()
+ # raise NotImplementedError
+ if get_sequence_parallel_state():
+ if npu_config is not None:
+ sp_size = hccl_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = hccl_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+ else:
+ sp_size = nccl_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = nccl_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+
+ else:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ assert latent.shape[1] == num_frames
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ latent = (latent + temp_pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, 'b t n c -> b (t n) c')
+ return latent
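+
+# Shape sketch (illustrative): PatchEmbed2D maps a (B, C, T, H, W) latent to
+# (B, T * H/p * W/p, embed_dim) tokens, adding 2D spatial and 1D temporal
+# sin/cos position embeddings when use_abs_pos is True.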
+
+
+class Attention(Attention_):
+ def __init__(self, downsampler, attention_mode, use_rope, interpolation_scale_thw,
+                 sparse1d, sparse2d, sparse_n, sparse_group, is_cross_attn, **kwargs):
+ processor = AttnProcessor2_0(attention_mode=attention_mode, use_rope=use_rope, interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d, sparse2d=sparse2d, sparse_n=sparse_n, sparse_group=sparse_group, is_cross_attn=is_cross_attn)
+        super().__init__(processor=processor, **kwargs)
+ self.downsampler = None
+
+ def prepare_attention_mask(
+ self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
+ ) -> torch.Tensor:
+ r"""
+ Prepare the attention mask for the attention computation.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ The attention mask to prepare.
+ target_length (`int`):
+ The target length of the attention mask. This is the length of the attention mask after padding.
+ batch_size (`int`):
+ The batch size, which is used to repeat the attention mask.
+ out_dim (`int`, *optional*, defaults to `3`):
+ The output dimension of the attention mask. Can be either `3` or `4`.
+
+ Returns:
+ `torch.Tensor`: The prepared attention mask.
+ """
+ head_size = self.heads
+        if get_sequence_parallel_state():
+            # hccl_info is imported on NPU, nccl_info otherwise
+            world_size = hccl_info.world_size if npu_config is not None else nccl_info.world_size
+            head_size = head_size // world_size
+ if attention_mask is None:
+ return attention_mask
+
+ current_length: int = attention_mask.shape[-1]
+ if current_length != target_length:
+ if attention_mask.device.type == "mps":
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+ # Instead, we can manually construct the padding tensor.
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
+ else:
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
+ # remaining_length: int = target_length - current_length
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+ if out_dim == 3:
+ if attention_mask.shape[0] < batch_size * head_size:
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+ elif out_dim == 4:
+ attention_mask = attention_mask.unsqueeze(1)
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+ return attention_mask
+
+class DownSampler3d(nn.Module):
+ def __init__(self, *args, **kwargs):
+        ''' Required kwargs: down_factor, down_shortcut '''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv3d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ if npu_config is None:
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.layer, x, x_dtype) + (x if self.down_shortcut else 0)
+
+ self.t = t//self.down_factor[0]
+ self.h = h//self.down_factor[1]
+ self.w = w//self.down_factor[2]
+ x = rearrange(x, 'b d (t dt) (h dh) (w dw) -> (b dt dh dw) (t h w) d',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> b 1 t h w', t=t, h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (t dt) (h dh) (w dw) -> (b dt dh dw) 1 (t h w)',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b dt dh dw) (t h w) d -> b (t dt h dh w dw) d',
+ t=t, h=h, w=w,
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x
+
+
+class DownSampler2d(nn.Module):
+ def __init__(self, *args, **kwargs):
+        ''' Required kwargs: down_factor, down_shortcut '''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv2d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+
+ self.t = 1
+ self.h = h//self.down_factor[0]
+ self.w = w//self.down_factor[1]
+
+ x = rearrange(x, 'b d (h dh) (w dw) -> (b dh dw) (h w) d',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> (b t) 1 h w', h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (h dh) (w dw) -> (b dh dw) 1 (h w)',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b t dh dw) (h w) d -> b (t h dh w dw) d',
+ t=t, h=h, w=w,
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x
+
+class AttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+ """
+
+ def __init__(self, attention_mode='xformers', use_rope=False, interpolation_scale_thw=(1, 1, 1),
+ sparse1d=False, sparse2d=False, sparse_n=2, sparse_group=False, is_cross_attn=True):
+ self.sparse1d = sparse1d
+ self.sparse2d = sparse2d
+ self.sparse_n = sparse_n
+ self.sparse_group = sparse_group
+ self.is_cross_attn = is_cross_attn
+ self.use_rope = use_rope
+ self.interpolation_scale_thw = interpolation_scale_thw
+ if self.use_rope:
+ self._init_rope(interpolation_scale_thw)
+ self.attention_mode = attention_mode
+ if not hasattr(F, "scaled_dot_product_attention"):
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+ assert not (self.sparse1d and self.sparse2d)
+
+ def _init_rope(self, interpolation_scale_thw):
+ self.rope = RoPE3D(interpolation_scale_thw=interpolation_scale_thw)
+ self.position_getter = PositionGetter3D()
+
+ def _sparse_1d(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ attention_mask: b nheads 1 thw
+ """
+ l = x.shape[-2]
+ assert l == frame*height*width
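+        # Two token layouts: with sparse_group False, every sparse_n-th token is
+        # gathered into one group ('(g k)'); with sparse_group True, contiguous
+        # chunks of sparse_n tokens are regrouped ('(n m k)'). Padding first makes
+        # l divisible by sparse_n**2.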
+ # import ipdb;ipdb.set_trace()
+ if torch_npu is not None and attention_mask is not None:
+ assert attention_mask.ndim == 3 and attention_mask.shape[1] == 1
+ attention_mask = attention_mask.unsqueeze(1)
+ assert attention_mask is None or attention_mask.shape[2] == 1
+ pad_len = 0
+ if l % (self.sparse_n * self.sparse_n) != 0:
+ pad_len = self.sparse_n * self.sparse_n - l % (self.sparse_n * self.sparse_n)
+ if pad_len != 0:
+ x = F.pad(x, (0, 0, 0, pad_len))
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = F.pad(attention_mask, (0, pad_len, 0, 0), value=-9980.0)
+ if not self.sparse_group:
+ x = rearrange(x, 'b h (g k) d -> (k b) h g d', k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (g k) -> (k b) h 1 g', k=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b h (n m k) d -> (m b) h (n k) d', m=self.sparse_n, k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (n m k) -> (m b) h 1 (n k)', m=self.sparse_n, k=self.sparse_n)
+ if self.is_cross_attn:
+ attention_mask = attention_mask.repeat(self.sparse_n, 1, 1, 1)
+ return x, attention_mask, pad_len
+
+ def _sparse_1d_on_npu(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x ntokens x nheads x dim)
+ attention_mask: b nheads 1 thw
+ """
+ l = x.shape[1]
+ assert l == frame*height*width
+ # import ipdb;ipdb.set_trace()
+ if torch_npu is not None and attention_mask is not None:
+ assert attention_mask.ndim == 3 and attention_mask.shape[1] == 1
+ attention_mask = attention_mask.unsqueeze(1)
+ assert attention_mask is None or attention_mask.shape[2] == 1
+ pad_len = 0
+ if l % (self.sparse_n * self.sparse_n) != 0:
+ pad_len = self.sparse_n * self.sparse_n - l % (self.sparse_n * self.sparse_n)
+ if pad_len != 0:
+ x = F.pad(x, (0, 0, 0, 0, 0, pad_len))
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = F.pad(attention_mask, (0, pad_len, 0, 0), value=-9980.0)
+ if not self.sparse_group:
+ x = rearrange(x, 'b (g k) h d -> (b k) g h d', k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (g k) -> (b k) h 1 g', k=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b (n m k) h d -> (b m) (n k) h d', m=self.sparse_n, k=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (n m k) -> (b m) h 1 (n k)', m=self.sparse_n, k=self.sparse_n)
+ if self.is_cross_attn:
+ attention_mask = repeat(attention_mask, 'b h 1 s -> (b k) h 1 s', k=self.sparse_n)
+ return x, attention_mask, pad_len
+
+ def _reverse_sparse_1d(self, x, frame, height, width, pad_len):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ # import ipdb;ipdb.set_trace()
+ assert x.shape[2] == (frame*height*width+pad_len) // self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(k b) h g d -> b h (g k) d', k=self.sparse_n)
+ else:
+ x = rearrange(x, '(m b) h (n k) d -> b h (n m k) d', m=self.sparse_n, k=self.sparse_n)
+ x = x[:, :, :frame*height*width, :]
+ # x = x.contiguous()
+ return x
+
+ def _reverse_sparse_1d_on_npu(self, x, frame, height, width, pad_len):
+ """
+ require the shape of (batch_size x ntokens x nheads x dim)
+ """
+ assert x.shape[1] == (frame * height * width + pad_len) // self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(b k) g h d -> b (g k) h d', k=self.sparse_n)
+ else:
+ x = rearrange(x, '(b m) (n k) h d -> b (n m k) h d', m=self.sparse_n, k=self.sparse_n)
+ x = x[:, :frame*height*width, :, :]
+ # x = x.contiguous()
+ return x
+
+ def _sparse_1d_kv(self, x):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ x = repeat(x, 'b h s d -> (k b) h s d', k=self.sparse_n)
+ return x
+
+ def _sparse_1d_kv_on_npu(self, x):
+ """
+ require the shape of (batch_size x ntokens x nheads x dim)
+ """
+ x = repeat(x, 'b h s d -> (b k) h s d', k=self.sparse_n)
+ return x
+
+ def _sparse_2d(self, x, attention_mask, frame, height, width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ attention_mask: b nheads 1 thw
+ """
+ d = x.shape[-1]
+ x = rearrange(x, 'b h (T H W) d -> b h T H W d', T=frame, H=height, W=width)
+ if torch_npu is not None and attention_mask is not None:
+ assert attention_mask.ndim == 3 and attention_mask.shape[1] == 1
+ attention_mask = attention_mask.unsqueeze(1)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h 1 (T H W) -> b h T H W', T=frame, H=height, W=width)
+        # pad H and W up to multiples of sparse_n**2 (0 when already divisible)
+        pad_height = -height % (self.sparse_n * self.sparse_n)
+        pad_width = -width % (self.sparse_n * self.sparse_n)
+ if pad_height != 0 or pad_width != 0:
+ x = rearrange(x, 'b h T H W d -> b (h d) T H W')
+ x = F.pad(x, (0, pad_width, 0, pad_height, 0, 0))
+ x = rearrange(x, 'b (h d) T H W -> b h T H W d', d=d)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = F.pad(attention_mask, (0, pad_width, 0, pad_height, 0, 0), value=-9500.0)
+
+ if not self.sparse_group:
+ x = rearrange(x, 'b h t (g1 k1) (g2 k2) d -> (k1 k2 b) h (t g1 g2) d',
+ k1=self.sparse_n, k2=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h t (g1 k1) (g2 k2) -> (k1 k2 b) h 1 (t g1 g2)',
+ k1=self.sparse_n, k2=self.sparse_n).contiguous()
+ else:
+ x = rearrange(x, 'b h t (n1 m1 k1) (n2 m2 k2) d -> (m1 m2 b) h (t n1 n2 k1 k2) d',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n)
+ if attention_mask is not None and not self.is_cross_attn:
+ attention_mask = rearrange(attention_mask, 'b h t (n1 m1 k1) (n2 m2 k2) -> (m1 m2 b) h 1 (t n1 n2 k1 k2)',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n)
+
+ if self.is_cross_attn:
+ attention_mask = attention_mask.repeat(self.sparse_n*self.sparse_n, 1, 1, 1)
+ return x, attention_mask, pad_height, pad_width
+
+ def _reverse_sparse_2d(self, x, frame, height, width, pad_height, pad_width):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ assert x.shape[2] == frame*(height+pad_height)*(width+pad_width)//self.sparse_n//self.sparse_n
+ if not self.sparse_group:
+ x = rearrange(x, '(k1 k2 b) h (t g1 g2) d -> b h t (g1 k1) (g2 k2) d',
+ k1=self.sparse_n, k2=self.sparse_n,
+ g1=(height+pad_height)//self.sparse_n, g2=(width+pad_width)//self.sparse_n)
+ else:
+ x = rearrange(x, '(m1 m2 b) h (t n1 n2 k1 k2) d -> b h t (n1 m1 k1) (n2 m2 k2) d',
+ m1=self.sparse_n, k1=self.sparse_n, m2=self.sparse_n, k2=self.sparse_n,
+ n1=(height+pad_height)//self.sparse_n//self.sparse_n, n2=(width+pad_width)//self.sparse_n//self.sparse_n)
+ x = x[:, :, :, :height, :width, :]
+ x = rearrange(x, 'b h T H W d -> b h (T H W) d')
+ # x = x.contiguous()
+ return x
+
+
+ def _sparse_2d_kv(self, x):
+ """
+ require the shape of (batch_size x nheads x ntokens x dim)
+ """
+ x = repeat(x, 'b h s d -> (k1 k2 b) h s d', k1=self.sparse_n, k2=self.sparse_n)
+ return x
+
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ frame: int = 8,
+ height: int = 16,
+ width: int = 16,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
+ if attn.downsampler is not None:
+ hidden_states, attention_mask = attn.downsampler(hidden_states, attention_mask, t=frame, h=height, w=width)
+ frame, height, width = attn.downsampler.t, attn.downsampler.h, attn.downsampler.w
+
+ residual = hidden_states
+
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        if get_sequence_parallel_state():
+            # sequence-parallel layout is (s, b, h) rather than (b, s, h)
+            sequence_length, batch_size, _ = (
+                hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+            )
+        else:
+            batch_size, sequence_length, _ = (
+                hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+            )
+
+ if attention_mask is not None:
+ if npu_config is None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length * nccl_info.world_size, batch_size)
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ if get_sequence_parallel_state():
+ attention_mask = attention_mask.view(batch_size, attn.heads // nccl_info.world_size, -1, attention_mask.shape[-1])
+ else:
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ if npu_config is not None and npu_config.on_npu:
+ if get_sequence_parallel_state():
+ query = query.view(-1, attn.heads, head_dim) # [s // sp, b, h * d] -> [s // sp * b, h, d]
+ key = key.view(-1, attn.heads, head_dim)
+ value = value.view(-1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ h_size = attn.heads * head_dim
+ sp_size = hccl_info.world_size
+ h_size_sp = h_size // sp_size
+ # apply all_to_all to gather sequence and split attention heads [s // sp * b, h, d] -> [s * b, h // sp, d]
+ query = all_to_all_SBH(query, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ key = all_to_all_SBH(key, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ value = all_to_all_SBH(value, scatter_dim=1, gather_dim=0).view(-1, batch_size, h_size_sp)
+ if self.use_rope:
+ query = query.view(-1, batch_size, attn.heads // sp_size, head_dim)
+ key = key.view(-1, batch_size, attn.heads // sp_size, head_dim)
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame * sp_size, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ query = query.view(-1, batch_size, h_size_sp)
+ key = key.view(-1, batch_size, h_size_sp)
+ value = value.view(-1, batch_size, h_size_sp)
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "SBH",
+ head_dim, attn.heads // sp_size)
+
+ hidden_states = hidden_states.view(-1, attn.heads // sp_size, head_dim)
+
+ # [s * b, h // sp, d] -> [s // sp * b, h, d] -> [s // sp, b, h * d]
+ hidden_states = all_to_all_SBH(hidden_states, scatter_dim=0, gather_dim=1).view(-1, batch_size, h_size)
+ else:
+ if npu_config.enable_FA and query.dtype == torch.float32:
+ dtype = torch.bfloat16
+ else:
+ dtype = None
+
+ query = query.reshape(batch_size, -1, attn.heads, head_dim)
+ key = key.reshape(batch_size, -1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ if self.use_rope:
+ # require the shape of (batch_size x ntokens x nheads x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ value = value.reshape(batch_size, -1, attn.heads, head_dim)
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d_on_npu(query, attention_mask, frame, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv_on_npu(key)
+ value = self._sparse_1d_kv_on_npu(value)
+ else:
+ key, _, pad_len = self._sparse_1d_on_npu(key, None, frame, height, width)
+ value, _, pad_len = self._sparse_1d_on_npu(value, None, frame, height, width)
+
+ elif self.sparse2d:
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame, height, width)
+
+ query = query.reshape(query.shape[0], query.shape[1], -1)
+ key = key.reshape(key.shape[0], key.shape[1], -1)
+ value = value.reshape(value.shape[0], value.shape[1], -1)
+
+ if npu_config is not None and attention_mask is not None:
+ # b h(1) 1 l
+ if self.sparse1d or self.sparse2d:
+ assert attention_mask.shape[1] == 1 and attention_mask.shape[2] == 1 and attention_mask.ndim == 4
+ attention_mask = attention_mask.squeeze(1) # b 1 l
+ else:
+ assert attention_mask.shape[1] == 1 and attention_mask.ndim == 3
+ if self.is_cross_attn:
+ attention_mask = npu_config.get_attention_mask(attention_mask, query.shape[1])
+ attention_mask = attention_mask.reshape(attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
+ else:
+ attention_mask = npu_config.get_attention_mask(attention_mask, attention_mask.shape[-1])
+ attention_mask = attention_mask.reshape(attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
+ with set_run_dtype(query, dtype):
+ query, key, value = npu_config.set_current_run_dtype([query, key, value])
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "BSH",
+ head_dim, attn.heads)
+
+ hidden_states = npu_config.restore_dtype(hidden_states)
+
+ hidden_states = hidden_states.reshape(hidden_states.shape[0], -1, attn.heads, head_dim)
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d_on_npu(hidden_states, frame, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame, height, width, pad_height, pad_width)
+
+ hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim)
+
+ else:
+ if get_sequence_parallel_state():
+ query = query.reshape(-1, attn.heads, head_dim) # [s // sp, b, h * d] -> [s // sp * b, h, d]
+ key = key.reshape(-1, attn.heads, head_dim)
+ value = value.reshape(-1, attn.heads, head_dim)
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+ h_size = attn.heads * head_dim
+ sp_size = nccl_info.world_size
+ h_size_sp = h_size // sp_size
+ # apply all_to_all to gather sequence and split attention heads [s // sp * b, h, d] -> [s * b, h // sp, d]
+ query = all_to_all_SBH(query, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ key = all_to_all_SBH(key, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ value = all_to_all_SBH(value, scatter_dim=1, gather_dim=0).reshape(-1, batch_size, h_size_sp)
+ query = query.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ key = key.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ value = value.reshape(-1, batch_size, attn.heads // sp_size, head_dim)
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame * sp_size, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ query = rearrange(query, 's b h d -> b h s d')
+ key = rearrange(key, 's b h d -> b h s d')
+ value = rearrange(value, 's b h d -> b h s d')
+
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d(query, attention_mask, frame * sp_size, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv(key)
+ value = self._sparse_1d_kv(value)
+ else:
+ key, _, pad_len = self._sparse_1d(key, None, frame * sp_size, height, width)
+ value, _, pad_len = self._sparse_1d(value, None, frame * sp_size, height, width)
+
+ elif self.sparse2d:
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame * sp_size, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame * sp_size, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame * sp_size, height, width)
+
+ # 0, -10000 ->(bool) False, True ->(any) True ->(not) False
+ # 0, 0 ->(bool) False, False ->(any) False ->(not) True
+ if attention_mask is None or not torch.any(attention_mask.bool()): # 0 means visible
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ if self.attention_mode == 'flash':
+ assert attention_mask is None, 'flash attention does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Found attention_mode: {self.attention_mode}')
+
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d(hidden_states, frame * sp_size, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame * sp_size, height, width, pad_height, pad_width)
+
+ hidden_states = rearrange(hidden_states, 'b h s d -> s b h d')
+
+ hidden_states = hidden_states.reshape(-1, attn.heads // sp_size, head_dim)
+ hidden_states = hidden_states.contiguous()
+ # [s * b, h // sp, d] -> [s // sp * b, h, d] -> [s // sp, b, h * d]
+ hidden_states = all_to_all_SBH(hidden_states, scatter_dim=0, gather_dim=1).reshape(-1, batch_size, h_size)
+ else:
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ # qk norm
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ if self.sparse1d:
+ query, attention_mask, pad_len = self._sparse_1d(query, attention_mask, frame, height, width)
+
+ if self.is_cross_attn:
+ key = self._sparse_1d_kv(key)
+ value = self._sparse_1d_kv(value)
+ else:
+ key, _, pad_len = self._sparse_1d(key, None, frame, height, width)
+ value, _, pad_len = self._sparse_1d(value, None, frame, height, width)
+
+ elif self.sparse2d:
+ query, attention_mask, pad_height, pad_width = self._sparse_2d(query, attention_mask, frame, height, width)
+ if self.is_cross_attn:
+ key = self._sparse_2d_kv(key)
+ value = self._sparse_2d_kv(value)
+ else:
+ key, _, pad_height, pad_width = self._sparse_2d(key, None, frame, height, width)
+ value, _, pad_height, pad_width = self._sparse_2d(value, None, frame, height, width)
+ # 0, -10000 ->(bool) False, True ->(any) True ->(not) False
+ # 0, 0 ->(bool) False, False ->(any) False ->(not) True
+ if attention_mask is None or not torch.any(attention_mask.bool()): # 0 means visible
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ if self.attention_mode == 'flash':
+ assert attention_mask is None, 'flash attention does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Found attention_mode: {self.attention_mode}')
+
+ if self.sparse1d:
+ hidden_states = self._reverse_sparse_1d(hidden_states, frame, height, width, pad_len)
+ elif self.sparse2d:
+ hidden_states = self._reverse_sparse_2d(hidden_states, frame, height, width, pad_height, pad_width)
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+ hidden_states = hidden_states.to(query.dtype)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ if attn.downsampler is not None:
+ hidden_states = attn.downsampler.reverse(hidden_states, t=frame, h=height, w=width)
+ return hidden_states
+
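+# The processor above dispatches to one of four execution paths: NPU with sequence
+# parallelism (all_to_all over the SBH layout, then npu_config.run_attention with "SBH"),
+# NPU without it (npu_config.run_attention with "BSH"), GPU with sequence parallelism
+# (all_to_all plus F.scaled_dot_product_attention), and plain GPU
+# (F.scaled_dot_product_attention). When sparse attention is enabled, the
+# _sparse_1d/_sparse_2d rearrangements wrap the attention call symmetrically:
+# _sparse_* before the call, _reverse_sparse_* after it.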
+
+class FeedForward_Conv3d(nn.Module):
+ def __init__(self, downsampler, dim, hidden_features, bias=True):
+ super().__init__()
+
+ self.bias = bias
+
+ self.project_in = nn.Linear(dim, hidden_features, bias=bias)
+
+ self.dwconv = nn.ModuleList([
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(5, 5, 5), stride=1, padding=(2, 2, 2), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(3, 3, 3), stride=1, padding=(1, 1, 1), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv3d(hidden_features, hidden_features, kernel_size=(1, 1, 1), stride=1, padding=(0, 0, 0), dilation=1,
+ groups=hidden_features, bias=bias)
+ ])
+
+ self.project_out = nn.Linear(hidden_features, dim, bias=bias)
+
+
+ def forward(self, x, t, h, w):
+ if npu_config is None:
+ x = self.project_in(x)
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + module(x)
+ out = rearrange(out, 'b d t h w -> b (t h w) d', t=t, h=h, w=w)
+ x = self.project_out(out)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.project_in, x, npu_config.replaced_type)
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + npu_config.run_conv3d(module, x, npu_config.replaced_type)
+ out = rearrange(out, 'b d t h w -> b (t h w) d', t=t, h=h, w=w)
+ x = npu_config.run_conv3d(self.project_out, out, x_dtype)
+ return x
+
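+# A minimal usage sketch for FeedForward_Conv3d (shapes are illustrative, not from a
+# real config): the module adds three parallel depthwise Conv3d branches (5/3/1
+# kernels) to the activation, so the input must carry its (t, h, w) layout:
+#
+#   ff = FeedForward_Conv3d(downsampler=None, dim=1152, hidden_features=2 * 1152)
+#   x = torch.randn(2, 8 * 16 * 16, 1152)   # b, (t h w), d
+#   y = ff(x, t=8, h=16, w=16)               # same shape as x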
+
+class FeedForward_Conv2d(nn.Module):
+ def __init__(self, downsampler, dim, hidden_features, bias=True):
+ super().__init__()
+
+ self.bias = bias
+
+ self.project_in = nn.Linear(dim, hidden_features, bias=bias)
+
+ self.dwconv = nn.ModuleList([
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(5, 5), stride=1, padding=(2, 2), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(3, 3), stride=1, padding=(1, 1), dilation=1,
+ groups=hidden_features, bias=bias),
+ nn.Conv2d(hidden_features, hidden_features, kernel_size=(1, 1), stride=1, padding=(0, 0), dilation=1,
+ groups=hidden_features, bias=bias)
+ ])
+
+ self.project_out = nn.Linear(hidden_features, dim, bias=bias)
+
+
+ def forward(self, x, t, h, w):
+ x = self.project_in(x)
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = F.gelu(x)
+ out = x
+ for module in self.dwconv:
+ out = out + module(x)
+ out = rearrange(out, '(b t) d h w -> b (t h w) d', t=t, h=h, w=w)
+ x = self.project_out(out)
+ return x
+
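+# Same idea as FeedForward_Conv3d, but the temporal axis is folded into the batch, so
+# each frame is convolved independently. A hedged sketch with illustrative shapes:
+#
+#   ff = FeedForward_Conv2d(downsampler=None, dim=1152, hidden_features=2 * 1152)
+#   x = torch.randn(2, 8 * 16 * 16, 1152)   # b, (t h w), d
+#   y = ff(x, t=8, h=16, w=16)               # same shape as x
+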
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+ r"""
+ A basic Transformer block.
+
+ Parameters:
+ dim (`int`): The number of channels in the input and output.
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm (`int`, *optional*):
+ The number of diffusion steps used during training. See `Transformer2DModel`.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Configure if the attentions should contain a bias parameter.
+ only_cross_attention (`bool`, *optional*):
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
+ double_self_attention (`bool`, *optional*):
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
+ upcast_attention (`bool`, *optional*):
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+ Whether to use learnable elementwise affine parameters for normalization.
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+ final_dropout (`bool` *optional*, defaults to False):
+ Whether to apply a final dropout after the last feed-forward layer.
+ attention_type (`str`, *optional*, defaults to `"default"`):
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+ positional_embeddings (`str`, *optional*, defaults to `None`):
+ The type of positional embeddings to apply.
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
+ The maximum number of positional embeddings to apply.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ attention_bias: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_elementwise_affine: bool = True,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_eps: float = 1e-5,
+ final_dropout: bool = False,
+ attention_type: str = "default",
+ positional_embeddings: Optional[str] = None,
+ num_positional_embeddings: Optional[int] = None,
+ ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+ ada_norm_bias: Optional[int] = None,
+ ff_inner_dim: Optional[int] = None,
+ ff_bias: bool = True,
+ attention_out_bias: bool = True,
+ attention_mode: str = "xformers",
+ downsampler: str = None,
+ use_rope: bool = False,
+ interpolation_scale_thw: Tuple[int] = (1, 1, 1),
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ sparse_group: bool = False,
+ ):
+ super().__init__()
+ self.only_cross_attention = only_cross_attention
+ self.downsampler = downsampler
+
+ # We keep these boolean flags for backward-compatibility.
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+ self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+ self.use_layer_norm = norm_type == "layer_norm"
+ self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+ )
+
+ self.norm_type = norm_type
+ self.num_embeds_ada_norm = num_embeds_ada_norm
+
+ if positional_embeddings and (num_positional_embeddings is None):
+ raise ValueError(
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+ )
+
+ if positional_embeddings == "sinusoidal":
+ self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+ else:
+ self.pos_embed = None
+
+ # Define 3 blocks. Each block has its own normalization layer.
+ # 1. Self-Attn
+ if norm_type == "ada_norm":
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_zero":
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm1 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+ self.attn1 = Attention(
+ query_dim=dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ attention_mode=attention_mode,
+ downsampler=downsampler,
+ use_rope=use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ sparse_group=sparse_group,
+ is_cross_attn=False,
+ )
+
+ # 2. Cross-Attn
+ if cross_attention_dim is not None or double_self_attention:
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+ # the second cross attention block.
+ if norm_type == "ada_norm":
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm2 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+ self.attn2 = Attention(
+ query_dim=dim,
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ attention_mode=attention_mode,
+ downsampler=False,
+ use_rope=False,
+ interpolation_scale_thw=interpolation_scale_thw,
+ sparse1d=sparse1d,
+ sparse2d=sparse2d,
+ sparse_n=sparse_n,
+ sparse_group=sparse_group,
+ is_cross_attn=True,
+ ) # is self-attn if encoder_hidden_states is none
+ else:
+ self.norm2 = None
+ self.attn2 = None
+
+ # 3. Feed-forward
+ if norm_type == "ada_norm_continuous":
+ self.norm3 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "layer_norm",
+ )
+
+ elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]:
+ self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+ elif norm_type == "layer_norm_i2vgen":
+ self.norm3 = None
+
+ if downsampler:
+ downsampler_ker_size = list(re.search(r'k(\d{2,3})', downsampler).group(1)) # 122
+ # if len(downsampler_ker_size) == 3:
+ # self.ff = FeedForward_Conv3d(
+ # downsampler,
+ # dim,
+ # 2 * dim,
+ # bias=ff_bias,
+ # )
+ # elif len(downsampler_ker_size) == 2:
+ self.ff = FeedForward_Conv2d(
+ downsampler,
+ dim,
+ 2 * dim,
+ bias=ff_bias,
+ )
+ else:
+ self.ff = FeedForward(
+ dim,
+ dropout=dropout,
+ activation_fn=activation_fn,
+ final_dropout=final_dropout,
+ inner_dim=ff_inner_dim,
+ bias=ff_bias,
+ )
+
+ # 4. Fuser
+ if attention_type == "gated" or attention_type == "gated-text-image":
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+ # 5. Scale-shift for PixArt-Alpha.
+ if norm_type == "ada_norm_single":
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+ # let chunk size default to None
+ self._chunk_size = None
+ self._chunk_dim = 0
+
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+ # Sets chunk feed-forward
+ self._chunk_size = chunk_size
+ self._chunk_dim = dim
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ frame: int = None,
+ height: int = None,
+ width: int = None,
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+ ) -> torch.FloatTensor:
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+ # Notice that normalization is always applied before the real computation in the following blocks.
+ # 0. Self-Attention
+ batch_size = hidden_states.shape[0]
+
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm1(hidden_states, timestep)
+ elif self.norm_type == "ada_norm_zero":
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm1(hidden_states)
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif self.norm_type == "ada_norm_single":
+ if get_sequence_parallel_state():
+ batch_size = hidden_states.shape[1]
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[:, None] + timestep.reshape(6, batch_size, -1)
+ ).chunk(6, dim=0)
+ else:
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+ ).chunk(6, dim=1)
+ norm_hidden_states = self.norm1(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+ # norm_hidden_states = norm_hidden_states.squeeze(1)
+ else:
+ raise ValueError("Incorrect norm used")
+
+ if self.pos_embed is not None:
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ # 1. Prepare GLIGEN inputs
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ if self.norm_type == "ada_norm_zero":
+ attn_output = gate_msa.unsqueeze(1) * attn_output
+ elif self.norm_type == "ada_norm_single":
+ attn_output = gate_msa * attn_output
+
+ hidden_states = attn_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ # 1.2 GLIGEN Control
+ if gligen_kwargs is not None:
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+ # 3. Cross-Attention
+ if self.attn2 is not None:
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm2(hidden_states, timestep)
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm2(hidden_states)
+ elif self.norm_type == "ada_norm_single":
+ # For PixArt norm2 isn't applied here:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+ norm_hidden_states = hidden_states
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ else:
+ raise ValueError("Incorrect norm")
+
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ attn_output = self.attn2(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ hidden_states = attn_output + hidden_states
+
+ # 4. Feed-forward
+ # i2vgen doesn't have this norm 🤷‍♂️
+ if self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif self.norm_type != "ada_norm_single":
+ norm_hidden_states = self.norm3(hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+ if self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm2(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+ # if self._chunk_size is not None:
+ # # "feed_forward_chunk_size" can be used to save memory
+ # ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+ # else:
+
+ if self.downsampler:
+ ff_output = self.ff(norm_hidden_states, t=frame, h=height, w=width)
+ else:
+ ff_output = self.ff(norm_hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
+ elif self.norm_type == "ada_norm_single":
+ ff_output = gate_mlp * ff_output
+
+ hidden_states = ff_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ return hidden_states
diff --git a/opensora/models/diffusion/opensora2/rope.py b/opensora/models/diffusion/opensora2/rope.py
new file mode 100644
index 000000000..1d907f733
--- /dev/null
+++ b/opensora/models/diffusion/opensora2/rope.py
@@ -0,0 +1,98 @@
+import torch
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config, set_run_dtype
+ from opensora.acceleration.parallel_states import get_sequence_parallel_state
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ from opensora.utils.parallel_states import get_sequence_parallel_state
+
+class PositionGetter3D(object):
+ """ return positions of patches """
+
+ def __init__(self, ):
+ self.cache_positions = {}
+
+ def __call__(self, b, t, h, w, device):
+ if (b, t, h, w) not in self.cache_positions:
+ x = torch.arange(w, device=device)
+ y = torch.arange(h, device=device)
+ z = torch.arange(t, device=device)
+ pos = torch.cartesian_prod(z, y, x)
+ if get_sequence_parallel_state():
+ pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, -1, 1).contiguous().expand(3, -1, b).clone()
+ else:
+ pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, 1, -1).contiguous().expand(3, b, -1).clone()
+ poses = (pos[0].contiguous(), pos[1].contiguous(), pos[2].contiguous())
+ max_poses = (int(poses[0].max()), int(poses[1].max()), int(poses[2].max()))
+
+ self.cache_positions[b, t, h, w] = (poses, max_poses)
+ pos = self.cache_positions[b, t, h, w]
+
+ return pos
+
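+# Usage sketch: the getter returns ((t_pos, y_pos, x_pos), (t_max, y_max, x_max)) and
+# memoizes the result per (b, t, h, w). For b=2, t=2, h=2, w=2, each position tensor
+# has shape (2, 8) (batch-major layout), or (8, 2) under sequence parallelism.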
+
+class RoPE3D(torch.nn.Module):
+
+ def __init__(self, freq=10000.0, F0=1.0, interpolation_scale_thw=(1, 1, 1)):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.interpolation_scale_t = interpolation_scale_thw[0]
+ self.interpolation_scale_h = interpolation_scale_thw[1]
+ self.interpolation_scale_w = interpolation_scale_thw[2]
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype, interpolation_scale=1):
+ # include interpolation_scale in the cache key so two axes with the same
+ # (D, seq_len) but different scales do not collide
+ if (D, seq_len, device, dtype, interpolation_scale) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) / interpolation_scale
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D, seq_len, device, dtype, interpolation_scale] = (cos, sin)
+ return self.cache[D, seq_len, device, dtype, interpolation_scale]
+
+ @staticmethod
+ def rotate_half(x):
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim == 2
+ if torch_npu is None and not get_sequence_parallel_state():
+ # for (batch_size x nheads x ntokens x dim)
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ else:
+ # for (batch_size x ntokens x nheads x dim) or (ntokens x batch_size x nheads x dim)
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, :, None, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, :, None, :]
+
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
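+ # rotate_half pairs each feature with its counterpart in the other half, so for
+ # x = [x1, x2] it returns [-x2, x1]; apply_rope1d then computes the usual rotary
+ # update x*cos + rotate_half(x)*sin along one axis (t, y, or x). A tiny check:
+ #
+ #   RoPE3D.rotate_half(torch.tensor([[1., 2., 3., 4.]]))
+ #   # -> tensor([[-3., -4., 1., 2.]])
+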
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 3 (t, y and x position of each token)
+ output:
+ * tokens after applying RoPE3D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3) % 3 == 0, "number of dimensions should be a multiple of three"
+ D = tokens.size(3) // 3
+ poses, max_poses = positions
+ assert len(poses) == 3 and poses[0].ndim == 2 # three (batch, ntokens) position tensors
+ cos_t, sin_t = self.get_cos_sin(D, max_poses[0] + 1, tokens.device, tokens.dtype, self.interpolation_scale_t)
+ cos_y, sin_y = self.get_cos_sin(D, max_poses[1] + 1, tokens.device, tokens.dtype, self.interpolation_scale_h)
+ cos_x, sin_x = self.get_cos_sin(D, max_poses[2] + 1, tokens.device, tokens.dtype, self.interpolation_scale_w)
+ # split features into three chunks along the feature dimension, and apply rope1d to each
+ t, y, x = tokens.chunk(3, dim=-1)
+ t = self.apply_rope1d(t, poses[0], cos_t, sin_t)
+ y = self.apply_rope1d(y, poses[1], cos_y, sin_y)
+ x = self.apply_rope1d(x, poses[2], cos_x, sin_x)
+ tokens = torch.cat((t, y, x), dim=-1)
+ return tokens
\ No newline at end of file
diff --git a/opensora/models/diffusion/udit/modeling_udit.py b/opensora/models/diffusion/udit/modeling_udit.py
new file mode 100644
index 000000000..cb80078d0
--- /dev/null
+++ b/opensora/models/diffusion/udit/modeling_udit.py
@@ -0,0 +1,1041 @@
+import os
+import numpy as np
+from torch import nn
+import torch
+from einops import rearrange, repeat
+from typing import Any, Dict, Optional, Tuple
+from torch.nn import functional as F
+from diffusers.loaders import PeftAdapterMixin
+from diffusers.models.transformer_2d import Transformer2DModelOutput
+from diffusers.utils import is_torch_version, deprecate
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+from diffusers.models.embeddings import PixArtAlphaTextProjection
+from opensora.models.diffusion.udit.modules import Upsample2d, Downsample2d, PatchEmbed2D, BasicTransformerBlock, \
+ FP32_GELU, FP32_SiLU, FP32_Layernorm
+from opensora.utils.utils import to_2tuple
+import math
+import re
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+ torch_npu = None
+ npu_config = None
+
+class UDiTT2V(ModelMixin, ConfigMixin, PeftAdapterMixin):
+ """
+ A 2D Transformer model for image-like data.
+
+ Parameters:
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+ in_channels (`int`, *optional*):
+ The number of channels in the input and output (specify if the input is **continuous**).
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+ This is fixed during training since it is used to learn a number of position embeddings.
+ num_vector_embeds (`int`, *optional*):
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+ Includes the class for the masked latent pixel.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+ num_embeds_ada_norm ( `int`, *optional*):
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+ added to the hidden states.
+
+ During inference, you can denoise for up to, but not more than, `num_embeds_ada_norm` steps.
+ attention_bias (`bool`, *optional*):
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
+ """
+
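+ # A hedged construction sketch (values are illustrative, not an official config):
+ #
+ #   model = UDiTT2V(
+ #       num_attention_heads=16, attention_head_dim=72,
+ #       in_channels=4, out_channels=4,
+ #       sample_size=32, sample_size_t=8,
+ #       patch_size=2, patch_size_t=1,
+ #       depth=[2, 5, 8, 5, 2], caption_channels=4096,
+ #       norm_type="ada_norm_single", attention_mode="xformers",
+ #   )
+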
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ out_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ sample_size_t: Optional[int] = None,
+ patch_size: Optional[int] = None,
+ patch_size_t: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ mlp_ratio: int = 4,
+ depth: Optional[list] = [2, 5, 8, 5, 2],
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_elementwise_affine: bool = True,
+ norm_eps: float = 1e-5,
+ attention_type: str = "default",
+ caption_channels: int = None,
+ interpolation_scale_h: float = None,
+ interpolation_scale_w: float = None,
+ interpolation_scale_t: float = None,
+ use_additional_conditions: Optional[bool] = None,
+ attention_mode: str = 'xformers',
+ downsampler: str = 'k333_s222',
+ use_rope: bool = False,
+ use_stable_fp32: bool = False,
+ sparse1d: bool = False,
+ sparse2d: bool = False,
+ sparse_n: int = 2,
+ ):
+ super().__init__()
+
+ # Set some common variables used across the board.
+ self.use_stable_fp32 = use_stable_fp32
+ self.use_rope = use_rope
+ self.downsampler = downsampler
+ self.use_linear_projection = use_linear_projection
+ self.interpolation_scale_t = interpolation_scale_t
+ self.interpolation_scale_h = interpolation_scale_h
+ self.interpolation_scale_w = interpolation_scale_w
+ self.caption_channels = caption_channels
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.gradient_checkpointing = False
+ use_additional_conditions = False
+ # if use_additional_conditions is None:
+ # if norm_type == "ada_norm_single" and sample_size == 128:
+ # use_additional_conditions = True
+ # else:
+ # use_additional_conditions = False
+ self.use_additional_conditions = use_additional_conditions
+
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+ # Define whether input is continuous or discrete depending on configuration
+ assert in_channels is not None
+
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+ deprecation_message = (
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+ )
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+ norm_type = "ada_norm"
+
+ # 2. Initialize the right blocks.
+ # Initialize the output blocks and other projection blocks when necessary.
+ self._init_patched_inputs(norm_type=norm_type)
+ if self.use_stable_fp32:
+ self._replace_fp32_layers()
+
+ def _init_patched_inputs(self, norm_type):
+ assert self.config.sample_size_t is not None, "UDiTT2V over patched input must provide sample_size_t"
+ assert self.config.sample_size is not None, "UDiTT2V over patched input must provide sample_size"
+
+ self.config.sample_size = to_2tuple(self.config.sample_size)
+ # self.num_frames = self.config.sample_size_t
+ # self.height = self.config.sample_size[0]
+ # self.width = self.config.sample_size[1]
+ interpolation_scale_t = ((self.config.sample_size_t - 1) // 16 + 1) if self.config.sample_size_t % 2 == 1 else self.config.sample_size_t / 16
+ interpolation_scale_t = (
+ self.config.interpolation_scale_t if self.config.interpolation_scale_t is not None else interpolation_scale_t
+ )
+ interpolation_scale = (
+ self.config.interpolation_scale_h if self.config.interpolation_scale_h is not None else self.config.sample_size[0] / 30,
+ self.config.interpolation_scale_w if self.config.interpolation_scale_w is not None else self.config.sample_size[1] / 40,
+ )
+
+ # down_factor = list(re.search(r's(\d{2,3})', self.downsampler).group(1))
+ # down_factor = [int(i) for i in down_factor]
+ # down_factor = down_factor if isinstance(self.config.down_factor, list) else [self.config.down_factor] * 5
+ # down_factor = [2] * len(self.config.depth)
+ is_video_model = False
+ # if self.config.downsampler is not None and len(self.config.downsampler) == 9:
+ # is_video_model = True # to init weight from image
+ # self.pos_embed = OverlapPatchEmbed3D(
+ # num_frames=self.config.sample_size_t,
+ # height=self.config.sample_size[0],
+ # width=self.config.sample_size[1],
+ # patch_size_t=self.config.patch_size_t,
+ # patch_size=self.config.patch_size,
+ # in_channels=self.in_channels,
+ # embed_dim=self.inner_dim,
+ # interpolation_scale=interpolation_scale,
+ # interpolation_scale_t=interpolation_scale_t,
+ # use_abs_pos=not self.config.use_rope,
+ # )
+ # elif self.config.downsampler is not None and len(self.config.downsampler) == 7:
+ # is_video_model = False
+ self.pos_embed = PatchEmbed2D(
+ num_frames=self.config.sample_size_t,
+ height=self.config.sample_size[0],
+ width=self.config.sample_size[1],
+ patch_size_t=self.config.patch_size_t,
+ patch_size=self.config.patch_size,
+ in_channels=self.in_channels,
+ embed_dim=self.inner_dim,
+ interpolation_scale=interpolation_scale,
+ interpolation_scale_t=interpolation_scale_t,
+ use_abs_pos=not self.config.use_rope,
+ )
+ # layer_thw = [[self.config.sample_size_t//self.config.patch_size_t,
+ # (self.config.sample_size[0] + self.config.sample_size[0] % (self.config.patch_size*2))//self.config.patch_size,
+ # (self.config.sample_size[1] + self.config.sample_size[1] % (self.config.patch_size*2))//self.config.patch_size]]
+ interpolation_scale_thw = (interpolation_scale_t, *interpolation_scale)
+ # for i in range((len(self.config.depth)-1)//2):
+ # t = layer_thw[i][0] // 2 if layer_thw[i][0] != 1 else 1
+ # h = (layer_thw[i][1] + layer_thw[i][1] % 4) // 2 # why mod 4, because downsample and downsampler in attention
+ # w = (layer_thw[i][2] + layer_thw[i][2] % 4) // 2
+ # layer_thw.append([t, h, w])
+ # self.layer_thw = layer_thw
+ self.encoder_level_1 = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim,
+ downsampler=self.config.downsampler,
+ mlp_ratio=self.config.mlp_ratio,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.inner_dim,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ )
+ for _ in range(self.config.depth[0])
+ ]
+ )
+ # self.down1_2 = Downsample3d(self.inner_dim) if is_video_model else Downsample2d(self.inner_dim)
+ self.down1_2 = Downsample2d(self.inner_dim, self.inner_dim * 2)
+
+ self.encoder_level_2 = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim * 2,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim * 2,
+ downsampler=self.config.downsampler,
+ mlp_ratio=self.config.mlp_ratio,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.inner_dim * 2,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ )
+ for _ in range(self.config.depth[1])
+ ]
+ )
+ # self.down2_3 = Downsample3d(self.inner_dim * 2) if is_video_model else Downsample2d(self.inner_dim * 2)
+ self.down2_3 = Downsample2d(self.inner_dim * 2, self.inner_dim * 4)
+
+ self.latent = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim * 4,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim * 4,
+ downsampler=self.config.downsampler,
+ mlp_ratio=self.config.mlp_ratio,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.inner_dim * 4,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ )
+ for _ in range(self.config.depth[2])
+ ]
+ )
+
+ # self.up3_2 = Upsample3d(int(self.inner_dim * 4)) if is_video_model else Upsample2d(self.inner_dim * 4) ## From Level 4 to Level 3
+ self.up3_2 = Upsample2d(self.inner_dim * 4, self.inner_dim * 2) ## From Level 4 to Level 3
+
+ # self.reduce_chan_level2_norm = nn.LayerNorm(int(self.inner_dim * 2), elementwise_affine=True, eps=1e-6)
+ self.reduce_chan_level2 = nn.Linear(int(self.inner_dim * 4), int(self.inner_dim * 2), bias=True)
+ self.decoder_level_2 = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim * 2,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim * 2,
+ downsampler=self.config.downsampler,
+ mlp_ratio=self.config.mlp_ratio,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.inner_dim * 2,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ )
+ for _ in range(self.config.depth[3])
+ ]
+ )
+
+ # self.up2_1 = Upsample3d(int(self.inner_dim * 2)) if is_video_model else Upsample2d(self.inner_dim * 2) ## From Level 4 to Level 3
+ self.up2_1 = Upsample2d(self.inner_dim * 2, self.inner_dim) ## From Level 4 to Level 3
+
+ # self.reduce_chan_level1_norm = nn.LayerNorm(int(self.inner_dim * 2), elementwise_affine=True, eps=1e-6)
+ self.reduce_chan_level1 = nn.Linear(int(self.inner_dim * 2), int(self.inner_dim * 1), bias=True)
+ self.decoder_level_1 = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ self.inner_dim,
+ self.config.num_attention_heads,
+ self.config.attention_head_dim,
+ downsampler=self.config.downsampler,
+ mlp_ratio=self.config.mlp_ratio,
+ dropout=self.config.dropout,
+ cross_attention_dim=self.inner_dim,
+ activation_fn=self.config.activation_fn,
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+ attention_bias=self.config.attention_bias,
+ only_cross_attention=self.config.only_cross_attention,
+ double_self_attention=self.config.double_self_attention,
+ upcast_attention=self.config.upcast_attention,
+ norm_type=norm_type,
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
+ norm_eps=self.config.norm_eps,
+ attention_type=self.config.attention_type,
+ attention_mode=self.config.attention_mode,
+ use_rope=self.config.use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ )
+ for _ in range(self.config.depth[4])
+ ]
+ )
+
+ if self.config.norm_type != "ada_norm_single":
+ self.norm_out = nn.LayerNorm(2 * self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.proj_out_1 = nn.Linear(2 * self.inner_dim, 2 * self.inner_dim)
+ self.proj_out_2 = nn.Linear(
+ 2 * self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+ elif self.config.norm_type == "ada_norm_single":
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+ self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / (self.inner_dim)**0.5)
+ self.proj_out = nn.Linear(
+ self.inner_dim, self.config.patch_size_t * self.config.patch_size * self.config.patch_size * self.out_channels
+ )
+
+ # PixArt-Alpha blocks.
+ # self.adaln_single = None
+ # if self.config.norm_type == "ada_norm_single":
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+ # additional conditions until we find better name
+ self.adaln_single_1 = AdaLayerNormSingle(
+ self.inner_dim, use_additional_conditions=self.use_additional_conditions
+ )
+ self.adaln_single_2 = AdaLayerNormSingle(
+ self.inner_dim * 2, use_additional_conditions=self.use_additional_conditions
+ )
+ self.adaln_single_3 = AdaLayerNormSingle(
+ self.inner_dim * 4, use_additional_conditions=self.use_additional_conditions
+ )
+
+ # self.caption_projection = None
+ # if self.caption_channels is not None:
+ self.caption_projection_1 = PixArtAlphaTextProjection(
+ in_features=self.caption_channels, hidden_size=self.inner_dim
+ )
+ self.caption_projection_2 = PixArtAlphaTextProjection(
+ in_features=self.caption_channels, hidden_size=self.inner_dim * 2
+ )
+ self.caption_projection_3 = PixArtAlphaTextProjection(
+ in_features=self.caption_channels, hidden_size=self.inner_dim * 4
+ )
+
+ def _replace_fp32_layers(self, module=None):
+ if module is None:
+ module = self
+ for name, submodule in module.named_children():
+ if isinstance(submodule, nn.LayerNorm):
+ # print(f"Replacing LayerNorm in {name}")
+ new_layer = FP32_Layernorm(submodule.normalized_shape, submodule.eps, submodule.elementwise_affine)
+ if submodule.elementwise_affine:
+ new_layer.weight.data.copy_(submodule.weight.data.float())
+ if submodule.bias is not None:
+ new_layer.bias.data.copy_(submodule.bias.data.float())
+ setattr(module, name, new_layer)
+ elif isinstance(submodule, nn.SiLU):
+ # print(f"Replacing SiLU in {name}")
+ setattr(module, name, FP32_SiLU(submodule.inplace))
+ elif isinstance(submodule, nn.GELU):
+ # print(f"Replacing GELU in {name}")
+ setattr(module, name, FP32_GELU(submodule.approximate))
+ else:
+ self._replace_fp32_layers(submodule)
+
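+ # _replace_fp32_layers walks the module tree and swaps every nn.LayerNorm, nn.SiLU
+ # and nn.GELU for the FP32_* variants imported above, copying affine weights over;
+ # it runs automatically when the model is built with use_stable_fp32=True.
+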
+ def _set_gradient_checkpointing(self, module, value=False):
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ timestep: Optional[torch.LongTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ use_image_num: Optional[int] = 0,
+ return_dict: bool = True,
+ ):
+ """
+ The [`Transformer2DModel`] forward method.
+
+ Args:
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+ Input `hidden_states`.
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+ self-attention.
+ timestep ( `torch.LongTensor`, *optional*):
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+ `AdaLayerZeroNorm`.
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ attention_mask ( `torch.Tensor`, *optional*):
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+ negative values to the attention scores corresponding to "discard" tokens.
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+ above. This bias will be added to the cross-attention scores.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+ tuple.
+
+ Returns:
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+ `tuple` where the first element is the sample tensor.
+ """
+ batch_size, c, frame, height, width = hidden_states.shape
+ frame = frame - use_image_num # 21-4=17
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ print("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+
+        # Pad H and W up front so they are divisible by 2 * patch_size; the pads are
+        # undone on the output at the end of forward. Computed unconditionally so the
+        # size bookkeeping below never sees an undefined pad_h_0 / pad_w_0.
+        pad_h_0, pad_w_0 = height % (self.config.patch_size * 2), width % (self.config.patch_size * 2)
+        hidden_states = F.pad(hidden_states, (0, pad_w_0, 0, pad_h_0, 0, 0), mode='reflect')
+
+        attention_bias = None
+        if attention_mask is not None and attention_mask.ndim == 4:
+            # assume that mask is expressed as:
+            #   (1 = keep, 0 = discard)
+            # and convert it into a bias that can be added to attention scores:
+            #   (keep = +0, discard = -10000.0)
+            # b, frame+use_image_num, h, w -> a video with images
+            # b, 1, h, w -> only images
+            attention_mask = attention_mask.to(self.dtype)
+            attention_mask = attention_mask.unsqueeze(1)  # b 1 t h w
+            attention_mask = F.pad(attention_mask, (0, pad_w_0, 0, pad_h_0, 0, 0))
+            attention_mask = F.max_pool3d(attention_mask,
+                                          kernel_size=(self.config.patch_size_t, self.config.patch_size, self.config.patch_size),
+                                          stride=(self.config.patch_size_t, self.config.patch_size, self.config.patch_size))
+            attention_mask = rearrange(attention_mask, 'b 1 t h w -> b 1 (t h w)')
+
+            attention_bias = (1 - attention_mask.bool().to(self.dtype)) * -10000.0
+
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 3:
+ # b, 1, l -> only video
+ encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
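+
+        # Toy illustration of the keep/discard -> bias convention used above (not
+        # executed here): a keep/discard mask [1, 0] becomes a bias [0, -10000],
+        # which erases the discarded position after the attention softmax:
+        #   m = torch.tensor([[1., 0.]])
+        #   (1 - m.bool().to(m.dtype)) * -10000.0  # -> tensor([[-0., -10000.]])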
+
+
+ # 1. Input
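+        # note: this model does not use PixArt's resolution / aspect-ratio
+        # conditioning, so any caller-provided added_cond_kwargs is overridden
+        # with empty values on the next line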
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+ hidden_states, encoder_hidden_states_1, encoder_hidden_states_2, encoder_hidden_states_3, \
+ timestep_1, timestep_2, timestep_3, \
+ embedded_timestep_1, embedded_timestep_2, embedded_timestep_3 = self._operate_on_patched_inputs(
+ hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, batch_size, frame, use_image_num
+ )
+ frame, height, width = frame // self.config.patch_size_t, \
+ (height + pad_h_0) // (self.config.patch_size), (width + pad_w_0) // (self.config.patch_size)
+
+
+ assert not torch.any(torch.isnan(hidden_states)), 'after _operate_on_patched_inputs'
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ # encoder_1
+ out_enc_level1 = hidden_states
+ if self.training and self.gradient_checkpointing:
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+ for block in self.encoder_level_1:
+ out_enc_level1 = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ out_enc_level1,
+ attention_bias,
+ encoder_hidden_states_1,
+ encoder_attention_mask,
+ timestep_1,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ for block in self.encoder_level_1:
+ out_enc_level1 = block(
+ out_enc_level1,
+ attention_mask=attention_bias,
+ encoder_hidden_states=encoder_hidden_states_1,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep_1,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
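+        # pad so that (height + pad) and (width + pad) are even before the stride-2
+        # downsample in down1_2; up2_1 undoes exactly these pads on the way back up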
+ pad_h_1, pad_w_1 = height % 4, width % 4
+
+ inp_enc_level2, attention_bias, attention_mask = self.down1_2(out_enc_level1, attention_mask, frame, height, width, pad_h=pad_h_1, pad_w=pad_w_1)
+ # frame, height, width = frame // 2 if frame != 1 else frame, (height + pad_h_1) // 2, (width + pad_w_1) // 2
+ height, width = (height + pad_h_1) // 2, (width + pad_w_1) // 2
+
+ # encoder_2
+ out_enc_level2 = inp_enc_level2
+
+ if self.training and self.gradient_checkpointing:
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+ for block in self.encoder_level_2:
+ out_enc_level2 = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ out_enc_level2,
+ attention_bias,
+ encoder_hidden_states_2,
+ encoder_attention_mask,
+ timestep_2,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ for block in self.encoder_level_2:
+ out_enc_level2 = block(
+ out_enc_level2,
+ attention_mask=attention_bias,
+ encoder_hidden_states=encoder_hidden_states_2,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep_2,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+ pad_h_2, pad_w_2 = height % 4, width % 4
+
+ inp_enc_level3, attention_bias, attention_mask = self.down2_3(out_enc_level2, attention_mask, frame, height, width, pad_h=pad_h_2, pad_w=pad_w_2)
+ # frame, height, width = frame // 2 if frame != 1 else frame, (height + pad_h_2) // 2, (width + pad_w_2) // 2
+ height, width = (height + pad_h_2) // 2, (width + pad_w_2) // 2
+
+ # latent
+ latent = inp_enc_level3
+ if self.training and self.gradient_checkpointing:
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+ for block in self.latent:
+ latent = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ latent,
+ attention_bias,
+ encoder_hidden_states_3,
+ encoder_attention_mask,
+ timestep_3,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ for block in self.latent:
+ latent = block(
+ latent,
+ attention_mask=attention_bias,
+ encoder_hidden_states=encoder_hidden_states_3,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep_3,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+
+ # decoder_2
+
+ inp_dec_level2, attention_bias, attention_mask = self.up3_2(latent, attention_mask, frame, height, width, pad_h=pad_h_2, pad_w=pad_w_2)
+ # frame, height, width = frame * 2 if frame != 1 else frame, height * 2 - pad_h_2, width * 2 - pad_w_2
+ height, width = height * 2 - pad_h_2, width * 2 - pad_w_2
+ inp_dec_level2 = torch.cat([inp_dec_level2, out_enc_level2], 2)
+ # inp_dec_level2 = self.reduce_chan_level2_norm(inp_dec_level2)
+ inp_dec_level2 = self.reduce_chan_level2(inp_dec_level2)
+ out_dec_level2 = inp_dec_level2
+
+ if self.training and self.gradient_checkpointing:
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+ for block in self.decoder_level_2:
+ out_dec_level2 = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ out_dec_level2,
+ attention_bias,
+ encoder_hidden_states_2,
+ encoder_attention_mask,
+ timestep_2,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ for block in self.decoder_level_2:
+ out_dec_level2 = block(
+ out_dec_level2,
+ attention_mask=attention_bias,
+ encoder_hidden_states=encoder_hidden_states_2,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep_2,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+
+ # decoder_1
+
+ inp_dec_level1, attention_bias, attention_mask = self.up2_1(out_dec_level2, attention_mask, frame, height, width, pad_h=pad_h_1, pad_w=pad_w_1)
+ # frame, height, width = frame * 2 if frame != 1 else frame, height * 2 - pad_h_1, width * 2 - pad_w_1
+ height, width = height * 2 - pad_h_1, width * 2 - pad_w_1
+ inp_dec_level1 = torch.cat([inp_dec_level1, out_enc_level1], 2)
+ # inp_dec_level1 = self.reduce_chan_level1_norm(inp_dec_level1)
+ inp_dec_level1 = self.reduce_chan_level1(inp_dec_level1)
+ out_dec_level1 = inp_dec_level1
+
+ if self.training and self.gradient_checkpointing:
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+ for block in self.decoder_level_1:
+ out_dec_level1 = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ out_dec_level1,
+ attention_bias,
+ encoder_hidden_states_1,
+ encoder_attention_mask,
+ timestep_1,
+ cross_attention_kwargs,
+ class_labels,
+ frame,
+ height,
+ width,
+ **ckpt_kwargs,
+ )
+ else:
+ for block in self.decoder_level_1:
+ out_dec_level1 = block(
+ out_dec_level1,
+ attention_mask=attention_bias,
+ encoder_hidden_states=encoder_hidden_states_1,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep_1,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ frame=frame,
+ height=height,
+ width=width,
+ )
+
+ assert not torch.any(torch.isnan(out_dec_level1)), 'after out_dec_level1'
+ # 3. Output
+ output = self._get_output_for_patched_inputs(
+ hidden_states=out_dec_level1,
+ timestep=timestep_1,
+ class_labels=class_labels,
+ embedded_timestep=embedded_timestep_1,
+ num_frames=frame,
+ height=height,
+ width=width,
+ ) # b c t h w
+
+ assert not torch.any(torch.isnan(output)), 'after output'
+ frame, height, width = frame * self.config.patch_size_t, height * self.config.patch_size - pad_h_0, width * self.config.patch_size - pad_w_0
+ output = output[:, :, :frame, :height, :width]
+ if not return_dict:
+ return (output,)
+
+ return Transformer2DModelOutput(sample=output)
+
+
+ def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, batch_size, frame, use_image_num):
+ # batch_size = hidden_states.shape[0]
+ hidden_states = self.pos_embed(hidden_states.to(self.dtype), frame)
+
+ if self.use_additional_conditions and added_cond_kwargs is None:
+ raise ValueError(
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+ )
+ timestep_1, embedded_timestep_1 = self.adaln_single_1(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+ timestep_2, embedded_timestep_2 = self.adaln_single_2(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+ timestep_3, embedded_timestep_3 = self.adaln_single_3(
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=self.dtype
+ ) # b 6d, b d
+
+ encoder_hidden_states_1 = self.caption_projection_1(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ encoder_hidden_states_1 = rearrange(encoder_hidden_states_1[:, :1], 'b 1 l d -> (b 1) l d')
+ encoder_hidden_states_2 = self.caption_projection_2(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ encoder_hidden_states_2 = rearrange(encoder_hidden_states_2[:, :1], 'b 1 l d -> (b 1) l d')
+ encoder_hidden_states_3 = self.caption_projection_3(encoder_hidden_states) # b, 1+use_image_num, l, d or b, 1, l, d
+ encoder_hidden_states_3 = rearrange(encoder_hidden_states_3[:, :1], 'b 1 l d -> (b 1) l d')
+
+
+ return hidden_states, encoder_hidden_states_1, encoder_hidden_states_2, encoder_hidden_states_3, \
+ timestep_1, timestep_2, timestep_3, embedded_timestep_1, embedded_timestep_2, embedded_timestep_3
+
+
+
+ def _get_output_for_patched_inputs(
+ self, hidden_states, timestep, class_labels, embedded_timestep, num_frames, height=None, width=None
+ ):
+        if self.config.norm_type != "ada_norm_single":
+            conditioning = self.transformer_blocks[0].norm1.emb(
+                timestep, class_labels, hidden_dtype=self.dtype
+            )
+            shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+            hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+            hidden_states = self.proj_out_2(hidden_states)
+        else:  # "ada_norm_single", the only norm_type the configs in this file use
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+ hidden_states = self.norm_out(hidden_states)
+ # Modulation
+ hidden_states = hidden_states * (1 + scale) + shift
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states.squeeze(1)
+
+ # # unpatchify
+ # hidden_states = hidden_states.reshape(
+ # shape=(-1, num_frames, height, width, self.out_channels)
+ # )
+ # output = torch.einsum("nthwc->ncthw", hidden_states)
+ # return output
+ # unpatchify
+ hidden_states = hidden_states.reshape(
+ shape=(-1, num_frames, height, width, self.config.patch_size_t, self.config.patch_size, self.config.patch_size, self.config.out_channels)
+ )
+ hidden_states = torch.einsum("nthwopqc->nctohpwq", hidden_states)
+ output = hidden_states.reshape(
+ shape=(-1, self.config.out_channels, num_frames * self.config.patch_size_t, height * self.config.patch_size, width * self.config.patch_size)
+ )
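+        # unpatchify shape sketch (illustrative): with patch_size_t=1, patch_size=2
+        # and out_channels=4, a (b, t*h*w, 1*2*2*4) input becomes (b, 4, t, 2*h, 2*w)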
+ return output
+
+def UDiTT2V_S_122(**kwargs):
+ return UDiTT2V(depth=[6, 6, 6, 6, 6], attention_head_dim=48, num_attention_heads=8, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, **kwargs)
+
+def UDiTT2V_B_122(**kwargs):
+ return UDiTT2V(depth=[6, 6, 6, 6, 6], attention_head_dim=48, num_attention_heads=16, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, **kwargs)
+
+def UDiTT2V_L_122(**kwargs):
+ return UDiTT2V(depth=[6, 6, 6, 6, 6], attention_head_dim=48, num_attention_heads=24, patch_size_t=1, patch_size=2,
+ norm_type="ada_norm_single", caption_channels=4096, **kwargs)
+
+UDiT_models = {
+ "UDiTT2V-S/122": UDiTT2V_S_122, # 0.4B
+ "UDiTT2V-B/122": UDiTT2V_B_122, # 1.7B
+ "UDiTT2V-L/122": UDiTT2V_L_122, # 3.7B
+}
+
+UDiT_models_class = {
+ "UDiTT2V-S/122": UDiTT2V,
+ "UDiTT2V-B/122": UDiTT2V,
+ "UDiTT2V-L/122": UDiTT2V,
+}
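+
+# Registry usage sketch (kwargs assumed; they mirror the __main__ smoke test below):
+#   model = UDiT_models["UDiTT2V-S/122"](in_channels=4, out_channels=4,
+#                                        sample_size=(30, 40), sample_size_t=1)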
+
+
+if __name__ == '__main__':
+ import sys
+ from copy import deepcopy
+ from opensora.models.ae import ae_channel_config, ae_stride_config
+ from opensora.models.ae import getae, getae_wrapper
+ from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
+
+ args = type('args', (),
+ {
+ 'ae': 'CausalVAEModel_4x8x8',
+ 'attention_mode': 'xformers',
+ 'use_rope': True,
+ 'model_max_length': 300,
+ 'max_height': 240,
+ 'max_width': 320,
+ 'num_frames': 1,
+ 'use_image_num': 0,
+ 'compress_kv_factor': 1,
+ 'interpolation_scale_t': 1,
+ 'interpolation_scale_h': 1,
+ 'interpolation_scale_w': 1,
+ }
+ )
+ b = 16
+ c = 4
+ cond_c = 4096
+ num_timesteps = 1000
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+    if getae_wrapper(args.ae) in (CausalVQVAEModelWrapper, CausalVAEModelWrapper):
+ num_frames = (args.num_frames - 1) // ae_stride_t + 1
+ else:
+ num_frames = args.num_frames // ae_stride_t
+
+ device = torch.device('cuda:0')
+
+
+
+ model = UDiTT2V_L_122(in_channels=c,
+ out_channels=c,
+ sample_size=latent_size,
+ sample_size_t=num_frames,
+ activation_fn="gelu-approximate",
+ attention_bias=True,
+ attention_type="default",
+ double_self_attention=False,
+ norm_elementwise_affine=False,
+ norm_eps=1e-06,
+ norm_num_groups=32,
+ num_vector_embeds=None,
+ only_cross_attention=False,
+ upcast_attention=False,
+ use_linear_projection=False,
+ use_additional_conditions=False,
+ downsampler=None,
+ interpolation_scale_t=args.interpolation_scale_t,
+ interpolation_scale_h=args.interpolation_scale_h,
+ interpolation_scale_w=args.interpolation_scale_w,
+ use_rope=args.use_rope).to(device)
+
+ print(model)
+ print(f'{sum(p.numel() for p in model.parameters() if p.requires_grad)/1e9} B')
+
+
+ model_state_dict = model.state_dict()
+ pretrained = "/storage/ongoing/new/Open-Sora-Plan/bs2_20node_73000k_480p_61x480p_lr5e-5_snr5_noioff0.02_ema_rope_uditultra122_qknorm_ds222_mt5xxl_sucai288w/checkpoint-11500/model_ema/diffusion_pytorch_model.safetensors"
+ try:
+ if 'safetensors' in pretrained: # pixart series
+ from safetensors.torch import load_file as safe_load
+ pretrained_checkpoint = safe_load(pretrained, device="cpu")
+ pretrained_keys = set(list(pretrained_checkpoint.keys()))
+ model_keys = set(list(model_state_dict.keys()))
+ common_keys = list(pretrained_keys & model_keys)
+ checkpoint = {k: pretrained_checkpoint[k] for k in common_keys if model_state_dict[k].numel() == pretrained_checkpoint[k].numel()}
+ # if checkpoint['pos_embed.proj.weight'].shape != model.pos_embed.proj.weight.shape and checkpoint['pos_embed.proj.weight'].ndim == 4:
+ # logger.info(f"Resize pos_embed, {checkpoint['pos_embed.proj.weight'].shape} -> {model.pos_embed.proj.weight.shape}")
+ # repeat = model.pos_embed.proj.weight.shape[2]
+ # checkpoint['pos_embed.proj.weight'] = checkpoint['pos_embed.proj.weight'].unsqueeze(2).repeat(1, 1, repeat, 1, 1) / float(repeat)
+ # del checkpoint['proj_out.weight'], checkpoint['proj_out.bias']
+ else: # latest stage training weight
+ checkpoint = torch.load(pretrained, map_location='cpu')
+ if 'model' in checkpoint:
+ checkpoint = checkpoint['model']
+ missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
+ print(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
+        print(f'Successfully loaded {len(model_state_dict) - len(missing_keys)}/{len(model_state_dict)} keys from {pretrained}!')
+ except Exception as e:
+ print(e)
+
+ # try:
+ # path = "bs32_1node_480p_lr1e-4_snr5_noioff0.02_ema_uditultra22_ds22_mt5xxl/checkpoint-500/model/diffusion_pytorch_model.safetensors"
+ # from safetensors.torch import load_file as safe_load
+ # ckpt = safe_load(path, device="cpu")
+ # new_ckpt = {}
+ # k_size = 3
+ # t_stride = 1
+ # for k, v in ckpt.items():
+ # if 'pos_embed.proj.weight' in k:
+ # new_v = v.unsqueeze(-3).repeat(1, 1, k_size, 1, 1) # 768, 4, 3, 3 -> 768, 4, 3, 3, 3
+ # elif 'attn1.downsampler.layer.weight' in k:
+ # new_v = v.unsqueeze(-3).repeat(1, 1, k_size, 1, 1) # 768, 4, 3, 3 -> 768, 4, 3, 3, 3
+ # elif 'body.0.weight' in k and 'down' in k:
+ # in_c = v.shape[0]
+ # new_v = v[:in_c//2].unsqueeze(-3).repeat(1, 1, k_size, 1, 1) # 384, 768, 3, 3 -> 192, 768, 3, 3, 3
+ # elif 'body.0.weight' in k and 'up' in k:
+ # new_v = v.unsqueeze(-3).repeat(2, 1, k_size, 1, 1) # 6144, 3072, 3, 3 -> 12288, 3072, 3, 3, 3
+ # elif 'proj_out' in k:
+ # if 'weight' in k:
+ # new_v = v.repeat(t_stride, 1) # 16, 768 -> 32, 768
+ # elif 'bias' in k:
+ # new_v = v.repeat(t_stride) # 16 -> 32
+ # else:
+ # new_v = v
+ # new_ckpt[k] = new_v
+ # msg = model.load_state_dict(new_ckpt, strict=False)
+ # # print(msg)
+ # except Exception as e:
+ # print(e)
+ x = torch.randn(b, c, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w).to(device)
+ cond = torch.randn(b, 1+args.use_image_num, args.model_max_length, cond_c).to(device)
+ attn_mask = torch.randint(0, 2, (b, 1+(args.num_frames-1)//ae_stride_t+args.use_image_num, args.max_height//ae_stride_h, args.max_width//ae_stride_w)).to(device) # B L or B 1+num_images L
+ cond_mask = torch.randint(0, 2, (b, 1+args.use_image_num, args.model_max_length)).to(device) # B L or B 1+num_images L
+ timestep = torch.randint(0, 1000, (b,), device=device)
+ model_kwargs = dict(hidden_states=x, encoder_hidden_states=cond, attention_mask=attn_mask,
+ encoder_attention_mask=cond_mask, use_image_num=args.use_image_num, timestep=timestep)
+ with torch.no_grad():
+ output = model(**model_kwargs)[0]
+ print(output.shape)
+
+
+
+
+ # from peft import LoraConfig, PeftModel, get_peft_model
+ # from opensora.utils.lora_utils import EMAModel_LoRA, maybe_zero_3, get_peft_state_maybe_zero_3
+ # lora_save_path = '/storage/ongoing/new/Open-Sora-Plan/debug_lora/model_lora'
+ # ema_lora_save_path = '/storage/ongoing/new/Open-Sora-Plan/debug_lora/ema_model_lora'
+ # origin_model_path = '/storage/ongoing/new/Open-Sora-Plan/bs16_4node_240p_lr1e-4_snr5_noioff0.02_ema_rope_uditultra22_ds22_mt5xxl/checkpoint-500/model_ema'
+ # model = UDiTUltraT2V.from_pretrained(origin_model_path)
+ # lora_config = LoraConfig(
+ # r=64,
+ # lora_alpha=64,
+ # init_lora_weights="gaussian",
+ # target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ # )
+ # model_lora = get_peft_model(model, lora_config)
+ # # --------------------ema lora_model----------------------------------
+ # # create ema lora_model
+ # ema_model = deepcopy(model_lora)
+ # ema_model_lora = EMAModel_LoRA(lora_config, parameters=ema_model.parameters(), update_after_step=0,
+ # model_cls=UDiTUltraT2V, model_config=ema_model.config)
+ # ema_model_lora.save_pretrained(ema_lora_save_path)
+ # ema_model_load_lora = EMAModel_LoRA.from_pretrained(ema_lora_save_path, UDiTUltraT2V, lora_config, origin_model_path)
+ # ema_model_lora.load_state_dict(ema_model_load_lora.state_dict())
+ # ema_model_lora.to(device)
+
+ # # -----------------lora model---------------------------------
+ # # get lora weight
+ # model_lora.save_pretrained(lora_save_path)
+ # # ----------------load lora model------------------------------
+ # # load lora weight
+ # model = UDiTUltraT2V.from_pretrained(origin_model_path)
+ # import ipdb;ipdb.set_trace()
+ # model_load_lora = PeftModel.from_pretrained(model, lora_save_path)
+ # for k, v in model_load_lora.state_dict().items():
+ # assert torch.allclose(v, model_lora.state_dict()[k])
+ # # for k, v in zip(ema_model_lora.shadow_params, model_lora.parameters()):
+ # # assert torch.allclose(v, k)
+ # print('Merging LoRA weights...')
+ # import ipdb;ipdb.set_trace()
+ # model_load_lora_merge = model_load_lora.merge_and_unload()
+ # with torch.no_grad():
+ # output = model_load_lora_merge(**model_kwargs)
+ # print(output[0].shape)
+
+
+
+
diff --git a/opensora/models/diffusion/udit/modules.py b/opensora/models/diffusion/udit/modules.py
new file mode 100644
index 000000000..875c10bfb
--- /dev/null
+++ b/opensora/models/diffusion/udit/modules.py
@@ -0,0 +1,1161 @@
+import re
+from typing import Any, Dict, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from einops import rearrange, repeat
+from transformers.activations import ACT2FN
+
+import diffusers
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.attention import FeedForward, GatedSelfAttentionDense
+from diffusers.models.attention_processor import Attention as Attention_
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding
+from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+
+from opensora.models.diffusion.udit.rope import PositionGetter3D, RoPE3D
+# sequence-parallel helpers used in PatchEmbed2D.forward; this import path is an
+# assumption, following where the rest of the repo keeps these utilities
+from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info, hccl_info
+try:
+    import torch_npu
+    from opensora.npu_config import npu_config, set_run_dtype
+except ImportError:
+    torch_npu = None
+    npu_config = None
+    set_run_dtype = None
+logger = logging.get_logger(__name__)
+
+def get_3d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+ grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
+ [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+ grid_t = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_h = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid_w = np.arange(grid_size[2], dtype=np.float32) / (grid_size[2] / base_size[2]) / interpolation_scale[2]
+ grid = np.meshgrid(grid_w, grid_h, grid_t) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([3, 1, grid_size[2], grid_size[1], grid_size[0]])
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
+
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 3 != 0:
+ raise ValueError("embed_dim must be divisible by 3")
+
+ # use 1/3 of dimensions to encode grid_t/h/w
+ emb_t = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0]) # (T*H*W, D/3)
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1]) # (T*H*W, D/3)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2]) # (T*H*W, D/3)
+
+ emb = np.concatenate([emb_t, emb_h, emb_w], axis=1) # (T*H*W, D)
+ return emb
+
+
+def get_2d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+ grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
+ [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size[0]) / interpolation_scale[0]
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size[1]) / interpolation_scale[1]
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+    # use half of the dimensions to encode grid_h, the other half for grid_w
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+def get_1d_sincos_pos_embed(
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16,
+):
+ """
+ grid_size: int of the grid return: pos_embed: [grid_size, embed_dim] or
+ [1+grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ # if isinstance(grid_size, int):
+ # grid_size = (grid_size, grid_size)
+
+ grid = np.arange(grid_size, dtype=np.float32) / (grid_size / base_size) / interpolation_scale
+ pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, grid) # (H*W, D/2)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
+ """
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be divisible by 2")
+
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
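+
+# Shape sketch for the sin/cos helpers above (illustrative only):
+#   get_1d_sincos_pos_embed_from_grid(64, np.arange(8))          # -> (8, 64)
+#   get_3d_sincos_pos_embed(96, (4, 8, 8), base_size=(4, 8, 8),
+#                           interpolation_scale=(1., 1., 1.))    # -> (4*8*8, 96)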
+
+class FP32_Layernorm(nn.LayerNorm):
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ origin_dtype = inputs.dtype
+ return F.layer_norm(inputs.float(), self.normalized_shape, self.weight.float() if self.weight is not None else None,
+ self.bias.float() if self.bias is not None else None, self.eps).to(origin_dtype)
+
+
+class FP32_SiLU(nn.SiLU):
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ return torch.nn.functional.silu(inputs.float(), inplace=self.inplace).to(inputs.dtype)
+
+
+class FP32_GELU(nn.GELU):
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ return torch.nn.functional.gelu(inputs.float(), approximate=self.approximate).to(inputs.dtype)
+
+
+
+class OverlapPatchEmbed3D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=False,
+ ):
+ super().__init__()
+ # assert patch_size_t == 1 and patch_size == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+ self.proj = nn.Conv3d(
+ in_channels, embed_dim, kernel_size=3, padding=1, stride=(patch_size_t, patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # b c 1 h w
+ # assert latent.shape[-3] == 1 and num_frames == 1
+
+ num_frames = latent.shape[-3] // self.patch_size_t
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ # latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ # latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ latent = rearrange(latent, 'b c t h w -> (b t) (h w) c ')
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+
+ if self.num_frames != num_frames:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent = latent
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype)
+
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c')
+ return video_latent
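+
+# Shape sketch for OverlapPatchEmbed3D (illustrative): with patch_size_t=1 and
+# patch_size=2, a (b, c, t, h, w) latent maps through the stride-(1, 2, 2) Conv3d
+# to (b, embed_dim, t, h/2, w/2) and is returned flattened as
+# (b, t * (h/2) * (w/2), embed_dim).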
+
+
+
+class PatchEmbed2D(nn.Module):
+ """2D Image to Patch Embedding but with 3D position embedding"""
+
+ def __init__(
+ self,
+ num_frames=1,
+ height=224,
+ width=224,
+ patch_size_t=1,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ layer_norm=False,
+ flatten=True,
+ bias=True,
+ interpolation_scale=(1, 1),
+ interpolation_scale_t=1,
+ use_abs_pos=True,
+ ):
+ super().__init__()
+ # assert num_frames == 1
+ self.use_abs_pos = use_abs_pos
+ self.flatten = flatten
+ self.layer_norm = layer_norm
+
+ self.proj = nn.Conv2d(
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
+ )
+ if layer_norm:
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+ else:
+ self.norm = None
+
+ self.patch_size_t = patch_size_t
+ self.patch_size = patch_size
+ # See:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+
+ self.height, self.width = height // patch_size, width // patch_size
+ self.base_size = (height // patch_size, width // patch_size)
+ self.interpolation_scale = (interpolation_scale[0], interpolation_scale[1])
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim, (self.height, self.width), base_size=self.base_size, interpolation_scale=self.interpolation_scale
+ )
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+
+ self.num_frames = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.base_size_t = (num_frames - 1) // patch_size_t + 1 if num_frames % 2 == 1 else num_frames // patch_size_t
+ self.interpolation_scale_t = interpolation_scale_t
+ temp_pos_embed = get_1d_sincos_pos_embed(embed_dim, self.num_frames, base_size=self.base_size_t, interpolation_scale=self.interpolation_scale_t)
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
+ # self.temp_embed_gate = nn.Parameter(torch.tensor([0.0]))
+
+ def forward(self, latent, num_frames):
+ b, _, _, _, _ = latent.shape
+ video_latent, image_latent = None, None
+ # b c 1 h w
+ # assert latent.shape[-3] == 1 and num_frames == 1
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
+ latent = self.proj(latent)
+
+ if self.flatten:
+ latent = latent.flatten(2).transpose(1, 2) # BT C H W -> BT N C
+ if self.layer_norm:
+ latent = self.norm(latent)
+
+ if self.use_abs_pos:
+ # Interpolate positional embeddings if needed.
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+ if self.height != height or self.width != width:
+ pos_embed = get_2d_sincos_pos_embed(
+ embed_dim=self.pos_embed.shape[-1],
+ grid_size=(height, width),
+ base_size=self.base_size,
+ interpolation_scale=self.interpolation_scale,
+ )
+ pos_embed = torch.from_numpy(pos_embed)
+ pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ pos_embed = self.pos_embed
+
+
+ if self.num_frames != num_frames:
+ if get_sequence_parallel_state():
+ if npu_config is not None:
+ sp_size = hccl_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = hccl_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+ else:
+ sp_size = nccl_info.world_size
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames * sp_size,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ rank = nccl_info.rank % sp_size
+ st_frame = rank * num_frames
+ ed_frame = st_frame + num_frames
+ temp_pos_embed = temp_pos_embed[st_frame: ed_frame]
+
+ else:
+ temp_pos_embed = get_1d_sincos_pos_embed(
+ embed_dim=self.temp_pos_embed.shape[-1],
+ grid_size=num_frames,
+ base_size=self.base_size_t,
+ interpolation_scale=self.interpolation_scale_t,
+ )
+ temp_pos_embed = torch.from_numpy(temp_pos_embed)
+ temp_pos_embed = temp_pos_embed.float().unsqueeze(0).to(latent.device)
+ else:
+ temp_pos_embed = self.temp_pos_embed
+
+ latent = (latent + pos_embed).to(latent.dtype)
+
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=b)
+ video_latent = latent
+
+ if self.use_abs_pos:
+ # temp_pos_embed = temp_pos_embed.unsqueeze(2) * self.temp_embed_gate.tanh()
+ temp_pos_embed = temp_pos_embed.unsqueeze(2)
+ video_latent = (video_latent + temp_pos_embed).to(video_latent.dtype)
+ video_latent = rearrange(video_latent, 'b t n c -> b (t n) c')
+
+ return video_latent
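+
+# Under sequence parallelism each rank holds num_frames / sp_size frames, so the
+# branch above rebuilds the temporal table for the full clip and slices out this
+# rank's rows; e.g. with sp_size=2 and num_frames=8 per rank, rank 0 takes rows
+# 0..7 and rank 1 rows 8..15 of a 16-row table.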
+
+
+
+class Attention(Attention_):
+ def __init__(self, downsampler, attention_mode, use_rope, interpolation_scale_thw, **kwags):
+ processor = AttnProcessor2_0(attention_mode=attention_mode, use_rope=use_rope, interpolation_scale_thw=interpolation_scale_thw)
+ super().__init__(processor=processor, **kwags)
+ self.downsampler = None
+ # if downsampler: # downsampler k155_s122
+ # downsampler_ker_size = list(re.search(r'k(\d{2,3})', downsampler).group(1)) # 122
+ # down_factor = list(re.search(r's(\d{2,3})', downsampler).group(1))
+ # downsampler_ker_size = [int(i) for i in downsampler_ker_size]
+ # downsampler_padding = [(i - 1) // 2 for i in downsampler_ker_size]
+ # down_factor = [int(i) for i in down_factor]
+
+ # if len(downsampler_ker_size) == 2:
+ # self.downsampler = DownSampler2d(kwags['query_dim'], kwags['query_dim'], kernel_size=downsampler_ker_size, stride=1,
+ # padding=downsampler_padding, groups=kwags['query_dim'], down_factor=down_factor,
+ # down_shortcut=True)
+ # elif len(downsampler_ker_size) == 3:
+ # self.downsampler = DownSampler3d(kwags['query_dim'], kwags['query_dim'], kernel_size=downsampler_ker_size, stride=1,
+ # padding=downsampler_padding, groups=kwags['query_dim'], down_factor=down_factor,
+ # down_shortcut=True)
+
+ # self.q_norm = nn.LayerNorm(kwags['dim_head'], elementwise_affine=True, eps=1e-6)
+ # self.k_norm = nn.LayerNorm(kwags['dim_head'], elementwise_affine=True, eps=1e-6)
+
+
+class DownSampler3d(nn.Module):
+ def __init__(self, *args, **kwargs):
+        ''' Required kwargs: down_factor, down_shortcut '''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv3d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=t, h=h, w=w)
+ if npu_config is None:
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.layer, x, x_dtype) + (x if self.down_shortcut else 0)
+
+ self.t = t//self.down_factor[0]
+ self.h = h//self.down_factor[1]
+ self.w = w//self.down_factor[2]
+ x = rearrange(x, 'b d (t dt) (h dh) (w dw) -> (b dt dh dw) (t h w) d',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> b 1 t h w', t=t, h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (t dt) (h dh) (w dw) -> (b dt dh dw) 1 (t h w)',
+ t=t//self.down_factor[0], h=h//self.down_factor[1], w=w//self.down_factor[2],
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b dt dh dw) (t h w) d -> b (t dt h dh w dw) d',
+ t=t, h=h, w=w,
+ dt=self.down_factor[0], dh=self.down_factor[1], dw=self.down_factor[2])
+ return x
+
+
+
+class DownSampler2d(nn.Module):
+ def __init__(self, *args, **kwargs):
+        ''' Required kwargs: down_factor, down_shortcut '''
+ super().__init__()
+ self.down_factor = kwargs.pop('down_factor')
+ self.down_shortcut = kwargs.pop('down_shortcut')
+ self.layer = nn.Conv2d(*args, **kwargs)
+
+ def forward(self, x, attention_mask, t, h, w):
+ b = x.shape[0]
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=t, h=h, w=w)
+ x = self.layer(x) + (x if self.down_shortcut else 0)
+
+ self.t = 1
+ self.h = h//self.down_factor[0]
+ self.w = w//self.down_factor[1]
+
+ x = rearrange(x, 'b d (h dh) (w dw) -> (b dh dw) (h w) d',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> (b t) 1 h w', h=h, w=w)
+ attention_mask = rearrange(attention_mask, 'b 1 (h dh) (w dw) -> (b dh dw) 1 (h w)',
+ h=h//self.down_factor[0], w=w//self.down_factor[1],
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x, attention_mask
+
+ def reverse(self, x, t, h, w):
+ x = rearrange(x, '(b t dh dw) (h w) d -> b (t h dh w dw) d',
+ t=t, h=h, w=w,
+ dh=self.down_factor[0], dw=self.down_factor[1])
+ return x
+
+
+class AttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+ """
+
+ def __init__(self, attention_mode='xformers', use_rope=False, interpolation_scale_thw=(1, 1, 1)):
+ self.use_rope = use_rope
+ self.interpolation_scale_thw = interpolation_scale_thw
+ if self.use_rope:
+ self._init_rope(interpolation_scale_thw)
+ self.attention_mode = attention_mode
+ if not hasattr(F, "scaled_dot_product_attention"):
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+ def _init_rope(self, interpolation_scale_thw):
+ self.rope = RoPE3D(interpolation_scale_thw=interpolation_scale_thw)
+ self.position_getter = PositionGetter3D()
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ frame: int = 8,
+ height: int = 16,
+ width: int = 16,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
+ if attn.downsampler is not None:
+ hidden_states, attention_mask = attn.downsampler(hidden_states, attention_mask, t=frame, h=height, w=width)
+ frame, height, width = attn.downsampler.t, attn.downsampler.h, attn.downsampler.w
+
+ residual = hidden_states
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        # hidden_states arrives here flattened as (batch, seq_len, dim); the 4-D
+        # image path of the stock diffusers processor is not used by this model
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ if attention_mask is not None:
+ if npu_config is None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+ else:
+ if npu_config.enable_FA:
+ attention_mask = attention_mask.to(torch.bool)
+ attention_mask = attention_mask.view(batch_size, 1, -1, attention_mask.shape[-1])
+ attention_mask = attention_mask.repeat(1, 1, hidden_states.shape[1], 1)
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ if npu_config is not None and npu_config.on_npu:
+ if npu_config.enable_FA and query.dtype == torch.float32:
+ dtype = torch.bfloat16
+ else:
+ dtype = None
+
+ if self.use_rope:
+ query = query.view(batch_size, -1, attn.heads, head_dim)
+ key = key.view(batch_size, -1, attn.heads, head_dim)
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+ query = query.view(batch_size, -1, attn.heads * head_dim)
+ key = key.view(batch_size, -1, attn.heads * head_dim)
+
+ with set_run_dtype(query, dtype):
+ query, key, value = npu_config.set_current_run_dtype([query, key, value])
+ hidden_states = npu_config.run_attention(query, key, value, attention_mask, "BSH",
+ head_dim, attn.heads)
+
+ hidden_states = npu_config.restore_dtype(hidden_states)
+ else:
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ # qk norm
+ # query = attn.q_norm(query)
+ # key = attn.k_norm(key)
+
+ if self.use_rope:
+ # require the shape of (batch_size x nheads x ntokens x dim)
+ pos_thw = self.position_getter(batch_size, t=frame, h=height, w=width, device=query.device)
+ query = self.rope(query, pos_thw)
+ key = self.rope(key, pos_thw)
+
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            # attention_mask here is a bias (0 = visible, -10000 = masked); if nothing
+            # is masked (the bias is all zeros), drop it so SDPA can take its fast path:
+            #   [0, -10000] -> bool [False, True]  -> any() True  -> keep the bias
+            #   [0, 0]      -> bool [False, False] -> any() False -> no-op, use None
+            if attention_mask is None or not torch.any(attention_mask.bool()):
+ attention_mask = None
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ # TODO: add support for attn.scale when we move to Torch 2.1
+
+ if self.attention_mode == 'flash':
+                assert attention_mask is None, 'the flash attention backend does not support attention_mask'
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'xformers':
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ elif self.attention_mode == 'math':
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+ else:
+ raise NotImplementedError(f'Found attention_mode: {self.attention_mode}')
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+ hidden_states = hidden_states.to(query.dtype)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ if attn.downsampler is not None:
+ hidden_states = attn.downsampler.reverse(hidden_states, t=frame, h=height, w=width)
+
+ return hidden_states
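+
+# Wiring sketch (illustrative, not executed): the Attention subclass above installs
+# this processor, so a call along the lines of
+#   attn = Attention(downsampler=None, attention_mode='math', use_rope=False,
+#                    interpolation_scale_thw=(1, 1, 1), query_dim=64, heads=4, dim_head=16)
+#   out = attn(torch.randn(2, 8, 64), frame=2, height=2, width=2)  # -> (2, 8, 64)
+# forwards the (frame, height, width) geometry into __call__ as cross-attention kwargs.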
+
+
+class PixelUnshuffle(nn.Module):
+ def __init__(self, ratio, ratio_t=None):
+ super().__init__()
+ self.r = ratio
+ self.r_t = ratio_t if ratio_t else 1
+
+ def forward(self, x):
+ if x.ndim == 5:
+ b, c, t, h, w = x.shape
+ assert t % self.r_t == 0 and h % self.r == 0 and w % self.r == 0
+ x = rearrange(x, 'b c (t r1) (h r2) (w r3) -> b (c r1 r2 r3) t h w', r1=self.r_t, r2=self.r, r3=self.r)
+ else:
+ b, c, h, w = x.shape
+ assert h % self.r == 0 and w % self.r == 0
+ x = rearrange(x, 'b c (h r2) (w r3) -> b (c r2 r3) h w', r2=self.r, r3=self.r)
+ return x
+
+class PixelShuffle(nn.Module):
+ def __init__(self, ratio, ratio_t=None):
+ super().__init__()
+ self.r = ratio
+ self.r_t = ratio_t if ratio_t else 1
+
+ def forward(self, x):
+ if x.ndim == 5:
+ b, c, t, h, w = x.shape
+ assert c % (self.r_t*self.r*self.r) == 0
+ x = rearrange(x, 'b (c r1 r2 r3) t h w -> b c (t r1) (h r2) (w r3)', r1=self.r_t, r2=self.r, r3=self.r)
+ else:
+ b, c, h, w = x.shape
+ assert c % (self.r*self.r) == 0
+ x = rearrange(x, 'b (c r2 r3) h w -> b c (h r2) (w r3)', r2=self.r, r3=self.r)
+ return x
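+
+# Round-trip sketch (illustrative): PixelShuffle(2, 2) exactly inverts
+# PixelUnshuffle(2, 2) on 5-D tensors:
+#   x = torch.randn(1, 8, 4, 6, 6)
+#   y = PixelUnshuffle(2, 2)(x)                   # -> (1, 64, 2, 3, 3)
+#   assert torch.equal(PixelShuffle(2, 2)(y), x)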
+
+class Downsample3d(nn.Module):
+ def __init__(self, n_feat):
+        super().__init__()
+
+ self.body = nn.Sequential(nn.Conv3d(n_feat, n_feat // 4, kernel_size=3, stride=1, padding=1, bias=False),
+ PixelUnshuffle(2, 2))
+
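+    # net effect: Conv3d quarters the channels, then PixelUnshuffle(2, 2) halves
+    # t/h/w and multiplies channels by 8, so (c, t, h, w) -> (2c, t/2, h/2, w/2)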
+ def forward(self, x, attention_mask, frames, height, width, pad_h=0, pad_w=0):
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=frames, h=height, w=width)
+ # x = F.pad(x, (0, pad_w, 0, pad_h, 0, 0), mode='reflect')
+ x = F.pad(x, (0, pad_w, 0, pad_h, 0, 0))
+ if npu_config is None:
+ x = self.body(x)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.body, x, x_dtype)
+ x = rearrange(x, 'b d t h w -> b (t h w) d')
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> b 1 t h w', t=frames, h=height, w=width)
+ attention_mask = F.pad(attention_mask, (0, pad_w, 0, pad_h, 0, 0))
+ attention_mask = F.max_pool3d(attention_mask, kernel_size=2, stride=2)
+ attention_mask = rearrange(attention_mask, 'b 1 t h w -> b 1 (t h w)')
+ attention_bias = (1 - attention_mask.bool().to(x.dtype)) * -10000.0
+
+ return x, attention_bias, attention_mask
+
+class Upsample3d(nn.Module):
+ def __init__(self, n_feat):
+        super().__init__()
+
+ self.body = nn.Sequential(nn.Conv3d(n_feat, n_feat * 4, kernel_size=3, stride=1, padding=1, bias=False),
+ PixelShuffle(2, 2))
+
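+    # net effect: Conv3d quadruples the channels, then PixelShuffle(2, 2) doubles
+    # t/h/w and divides channels by 8, so (c, t, h, w) -> (c/2, 2t, 2h, 2w)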
+ def forward(self, x, attention_mask, frames, height, width, pad_h=0, pad_w=0):
+ x = rearrange(x, 'b (t h w) d -> b d t h w', t=frames, h=height, w=width)
+ if npu_config is None:
+ x = self.body(x)
+ else:
+ x_dtype = x.dtype
+ x = npu_config.run_conv3d(self.body, x, x_dtype)
+ x = x[:, :, :, :height*2-pad_h, :width*2-pad_w]
+ x = rearrange(x, 'b d t h w -> b (t h w) d')
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> b 1 t h w', t=frames, h=height, w=width)
+ attention_mask = attention_mask.repeat_interleave(2, -1).repeat_interleave(2, -2).repeat_interleave(2, -3)
+ attention_mask = attention_mask[:, :, :, :height*2-pad_h, :width*2-pad_w]
+ attention_mask = rearrange(attention_mask, 'b 1 t h w -> b 1 (t h w)')
+ attention_bias = (1 - attention_mask.bool().to(x.dtype)) * -10000.0
+
+ return x, attention_bias, attention_mask
+
+
+class Downsample2d(nn.Module):
+ def __init__(self, n_feat, n_feat_out, is_video_model=False):
+        super().__init__()
+ self.body = nn.Conv2d(n_feat, n_feat_out, kernel_size=3, stride=2, padding=1, bias=False)
+
+ def forward(self, x, attention_mask, frames, height, width, pad_h=0, pad_w=0):
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=frames, h=height, w=width)
+ if npu_config is None:
+ x = F.pad(x, (0, pad_w, 0, pad_h))
+ else:
+ x = npu_config.run_pad_2d(F.pad, x, pad=(0, pad_w, 0, pad_h))
+ x = self.body(x)
+ x = rearrange(x, '(b t) d h w -> b (t h w) d', t=frames)
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> (b t) 1 h w', t=frames, h=height, w=width)
+ attention_mask = F.pad(attention_mask, (0, pad_w, 0, pad_h))
+ attention_mask = F.max_pool2d(attention_mask.float(), kernel_size=2, stride=2)
+ attention_mask = rearrange(attention_mask, '(b t) 1 h w -> b 1 (t h w)', t=frames)
+ attention_bias = (1 - attention_mask.bool().to(x.dtype)) * -10000.0
+
+ return x, attention_bias, attention_mask
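+
+# Downsample2d halves H and W per frame with a stride-2 conv and max-pools the
+# mask to match, e.g. tokens (b, t*16*16, c) -> (b, t*8*8, c_out)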
+
+class Upsample2d(nn.Module):
+ def __init__(self, n_feat, n_feat_out, is_video_model=False):
+        super().__init__()
+ self.body = nn.Conv2d(n_feat, n_feat_out, kernel_size=3, stride=1, padding=1, bias=False)
+
+ def forward(self, x, attention_mask, frames, height, width, pad_h=0, pad_w=0):
+ x = rearrange(x, 'b (t h w) d -> (b t) d h w', t=frames, h=height, w=width)
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+ x = self.body(x)
+ x = x[:, :, :height*2-pad_h, :width*2-pad_w]
+ x = rearrange(x, '(b t) d h w -> b (t h w) d', t=frames)
+
+ attention_mask = rearrange(attention_mask, 'b 1 (t h w) -> (b t) 1 h w', t=frames, h=height, w=width)
+ attention_mask = attention_mask.repeat_interleave(2, -1).repeat_interleave(2, -2)
+ attention_mask = attention_mask[:, :, :height*2-pad_h, :width*2-pad_w]
+ attention_mask = rearrange(attention_mask, '(b t) 1 h w -> b 1 (t h w)', t=frames)
+ attention_bias = (1 - attention_mask.bool().to(x.dtype)) * -10000.0
+
+ return x, attention_bias, attention_mask
+
+class LlamaMLP(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ dim_out: Optional[int] = None,
+ mult: int = 2.5,
+ dropout: float = 0.0,
+ activation_fn: str = "geglu",
+ final_dropout: bool = False,
+ inner_dim=None,
+ bias: bool = True,
+ ):
+ super().__init__()
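+        # SwiGLU feed-forward: down(SiLU(gate(x)) * up(x)); the remaining kwargs
+        # (dim_out, dropout, activation_fn, final_dropout, inner_dim) mirror
+        # diffusers' FeedForward signature but are unused here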
+ self.hidden_size = dim
+ self.intermediate_size = int(dim * mult)
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
+ self.act_fn = ACT2FN["silu"]
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+ r"""
+ A basic Transformer block.
+
+ Parameters:
+ dim (`int`): The number of channels in the input and output.
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm (:
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+ attention_bias (:
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+ only_cross_attention (`bool`, *optional*):
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
+ double_self_attention (`bool`, *optional*):
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
+ upcast_attention (`bool`, *optional*):
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+ Whether to use learnable elementwise affine parameters for normalization.
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+ final_dropout (`bool` *optional*, defaults to False):
+ Whether to apply a final dropout after the last feed-forward layer.
+ attention_type (`str`, *optional*, defaults to `"default"`):
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+ positional_embeddings (`str`, *optional*, defaults to `None`):
+ The type of positional embeddings to apply to.
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
+ The maximum number of positional embeddings to apply.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ attention_bias: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_elementwise_affine: bool = True,
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+ norm_eps: float = 1e-5,
+ final_dropout: bool = False,
+ attention_type: str = "default",
+ positional_embeddings: Optional[str] = None,
+ num_positional_embeddings: Optional[int] = None,
+ ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+ ada_norm_bias: Optional[int] = None,
+ ff_inner_dim: Optional[int] = None,
+ ff_bias: bool = True,
+ attention_out_bias: bool = True,
+ attention_mode: str = "xformers",
+ downsampler: str = None,
+ mlp_ratio: int = 4,
+ use_rope: bool = False,
+ interpolation_scale_thw: Tuple[int] = (1, 1, 1),
+ ):
+ super().__init__()
+ self.only_cross_attention = only_cross_attention
+
+ # We keep these boolean flags for backward-compatibility.
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+ self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+ self.use_layer_norm = norm_type == "layer_norm"
+ self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+ raise ValueError(
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+ )
+
+ self.norm_type = norm_type
+ self.num_embeds_ada_norm = num_embeds_ada_norm
+
+ if positional_embeddings and (num_positional_embeddings is None):
+ raise ValueError(
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+ )
+
+ if positional_embeddings == "sinusoidal":
+ self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+ else:
+ self.pos_embed = None
+
+ # Define 3 blocks. Each block has its own normalization layer.
+ # 1. Self-Attn
+ if norm_type == "ada_norm":
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_zero":
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm1 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+ self.attn1 = Attention(
+ query_dim=dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ downsampler=downsampler,
+ attention_mode=attention_mode,
+ use_rope=use_rope,
+ interpolation_scale_thw=interpolation_scale_thw,
+ )
+
+ # 2. Cross-Attn
+ if cross_attention_dim is not None or double_self_attention:
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+ # the second cross attention block.
+ if norm_type == "ada_norm":
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ elif norm_type == "ada_norm_continuous":
+ self.norm2 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "rms_norm",
+ )
+ else:
+ self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+ self.attn2 = Attention(
+ query_dim=dim,
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ downsampler=None,
+ attention_mode=attention_mode,
+ use_rope=False,
+ interpolation_scale_thw=interpolation_scale_thw,
+ ) # is self-attn if encoder_hidden_states is none
+ else:
+ self.norm2 = None
+ self.attn2 = None
+
+ # 3. Feed-forward
+ if norm_type == "ada_norm_continuous":
+ self.norm3 = AdaLayerNormContinuous(
+ dim,
+ ada_norm_continous_conditioning_embedding_dim,
+ norm_elementwise_affine,
+ norm_eps,
+ ada_norm_bias,
+ "layer_norm",
+ )
+
+ elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]:
+ self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+ elif norm_type == "layer_norm_i2vgen":
+ self.norm3 = None
+
+ self.ff = LlamaMLP(
+ dim,
+ bias=ff_bias,
+ )
+
+ # 4. Fuser
+ if attention_type == "gated" or attention_type == "gated-text-image":
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+ # 5. Scale-shift for PixArt-Alpha.
+ if norm_type == "ada_norm_single":
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+ # let chunk size default to None
+ self._chunk_size = None
+ self._chunk_dim = 0
+
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Store the chunk-size configuration; note that this block currently applies
+        # the feed-forward unchunked, so the values are kept only for API compatibility.
+ self._chunk_size = chunk_size
+ self._chunk_dim = dim
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ frame: int = None,
+ height: int = None,
+ width: int = None,
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+ ) -> torch.FloatTensor:
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+ # Notice that normalization is always applied before the real computation in the following blocks.
+ # 0. Self-Attention
+ batch_size = hidden_states.shape[0]
+
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm1(hidden_states, timestep)
+ elif self.norm_type == "ada_norm_zero":
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm1(hidden_states)
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif self.norm_type == "ada_norm_single":
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+ ).chunk(6, dim=1)
+ norm_hidden_states = self.norm1(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+ norm_hidden_states = norm_hidden_states.squeeze(1)
+ else:
+ raise ValueError("Incorrect norm used")
+
+ if self.pos_embed is not None:
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ # 1. Prepare GLIGEN inputs
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+        # The additive attention mask uses 0 for visible tokens and -10000 for masked
+        # ones, so casting it to bool marks exactly the masked positions.
+ attn_output = self.attn1(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+ attention_mask=attention_mask, frame=frame, height=height, width=width,
+ **cross_attention_kwargs,
+ )
+ if self.norm_type == "ada_norm_zero":
+ attn_output = gate_msa.unsqueeze(1) * attn_output
+ elif self.norm_type == "ada_norm_single":
+ attn_output = gate_msa * attn_output
+
+ hidden_states = attn_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ # 1.2 GLIGEN Control
+ if gligen_kwargs is not None:
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+ # 3. Cross-Attention
+ if self.attn2 is not None:
+ if self.norm_type == "ada_norm":
+ norm_hidden_states = self.norm2(hidden_states, timestep)
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+ norm_hidden_states = self.norm2(hidden_states)
+ elif self.norm_type == "ada_norm_single":
+ # For PixArt norm2 isn't applied here:
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+ norm_hidden_states = hidden_states
+ elif self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ else:
+ raise ValueError("Incorrect norm")
+
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+ attn_output = self.attn2(
+ norm_hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask,
+ **cross_attention_kwargs,
+ )
+ hidden_states = attn_output + hidden_states
+ # 4. Feed-forward
+        # i2vgen doesn't have this norm 🤷‍♂️
+ if self.norm_type == "ada_norm_continuous":
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+ elif not self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm3(hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+ if self.norm_type == "ada_norm_single":
+ norm_hidden_states = self.norm2(hidden_states)
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+ ff_output = self.ff(norm_hidden_states)
+
+ if self.norm_type == "ada_norm_zero":
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
+ elif self.norm_type == "ada_norm_single":
+ ff_output = gate_mlp * ff_output
+
+ hidden_states = ff_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ return hidden_states
diff --git a/opensora/models/diffusion/udit/rope.py b/opensora/models/diffusion/udit/rope.py
new file mode 100644
index 000000000..b42096b9d
--- /dev/null
+++ b/opensora/models/diffusion/udit/rope.py
@@ -0,0 +1,154 @@
+import torch
+
+class PositionGetter3D(object):
+ """ return positions of patches """
+
+ def __init__(self, ):
+ self.cache_positions = {}
+
+ def __call__(self, b, t, h, w, device):
+ if not (t,h,w) in self.cache_positions:
+ x = torch.arange(w, device=device)
+ y = torch.arange(h, device=device)
+ z = torch.arange(t, device=device)
+            self.cache_positions[t,h,w] = torch.cartesian_prod(z, y, x)  # (t*h*w, 3)
+ pos = self.cache_positions[t,h,w].view(1, t*h*w, 3).expand(b, -1, 3).clone()
+ return pos
+
+
+class RoPE3D(torch.nn.Module):
+
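+    # 3D rotary position embedding: the head dimension is split into three equal
+    # chunks and standard 1D RoPE is applied per chunk using each token's t, y and x
+    # coordinate; cos/sin tables are cached per (dim, seq_len, device, dtype).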
+ def __init__(self, freq=10000.0, F0=1.0, interpolation_scale_thw=(1, 1, 1)):
+ super().__init__()
+ self.base = freq
+ self.F0 = F0
+ self.interpolation_scale_t = interpolation_scale_thw[0]
+ self.interpolation_scale_h = interpolation_scale_thw[1]
+ self.interpolation_scale_w = interpolation_scale_thw[2]
+ self.cache = {}
+
+ def get_cos_sin(self, D, seq_len, device, dtype, interpolation_scale=1):
+ if (D, seq_len, device, dtype) not in self.cache:
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) / interpolation_scale
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+ freqs = torch.cat((freqs, freqs), dim=-1)
+ cos = freqs.cos() # (Seq, Dim)
+ sin = freqs.sin()
+ self.cache[D, seq_len, device, dtype] = (cos, sin)
+ return self.cache[D, seq_len, device, dtype]
+
+ @staticmethod
+ def rotate_half(x):
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
+ assert pos1d.ndim == 2
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+ def forward(self, tokens, positions):
+ """
+ input:
+ * tokens: batch_size x nheads x ntokens x dim
+ * positions: batch_size x ntokens x 3 (t, y and x position of each token)
+ output:
+        * tokens after applying RoPE3D (batch_size x nheads x ntokens x dim)
+ """
+ assert tokens.size(3) % 3 == 0, "number of dimensions should be a multiple of three"
+ D = tokens.size(3) // 3
+ assert positions.ndim == 3 and positions.shape[-1] == 3 # Batch, Seq, 3
+ cos_t, sin_t = self.get_cos_sin(D, int(positions[:, :, 0].max()) + 1, tokens.device, tokens.dtype, self.interpolation_scale_t)
+ cos_y, sin_y = self.get_cos_sin(D, int(positions[:, :, 1].max()) + 1, tokens.device, tokens.dtype, self.interpolation_scale_h)
+ cos_x, sin_x = self.get_cos_sin(D, int(positions[:, :, 2].max()) + 1, tokens.device, tokens.dtype, self.interpolation_scale_w)
+        # split features into three chunks along the feature dimension, and apply rope1d to each chunk
+ t, y, x = tokens.chunk(3, dim=-1)
+ t = self.apply_rope1d(t, positions[:, :, 0], cos_t, sin_t)
+ y = self.apply_rope1d(y, positions[:, :, 1], cos_y, sin_y)
+ x = self.apply_rope1d(x, positions[:, :, 2], cos_x, sin_x)
+ tokens = torch.cat((t, y, x), dim=-1)
+ return tokens
+
diff --git a/opensora/models/diffusion/transport/__init__.py b/opensora/models/diffusion/utils/transport/__init__.py
similarity index 100%
rename from opensora/models/diffusion/transport/__init__.py
rename to opensora/models/diffusion/utils/transport/__init__.py
diff --git a/opensora/models/diffusion/transport/integrators.py b/opensora/models/diffusion/utils/transport/integrators.py
similarity index 100%
rename from opensora/models/diffusion/transport/integrators.py
rename to opensora/models/diffusion/utils/transport/integrators.py
diff --git a/opensora/models/diffusion/transport/path.py b/opensora/models/diffusion/utils/transport/path.py
similarity index 100%
rename from opensora/models/diffusion/transport/path.py
rename to opensora/models/diffusion/utils/transport/path.py
diff --git a/opensora/models/diffusion/transport/transport.py b/opensora/models/diffusion/utils/transport/transport.py
similarity index 100%
rename from opensora/models/diffusion/transport/transport.py
rename to opensora/models/diffusion/utils/transport/transport.py
diff --git a/opensora/models/diffusion/transport/utils.py b/opensora/models/diffusion/utils/transport/utils.py
similarity index 100%
rename from opensora/models/diffusion/transport/utils.py
rename to opensora/models/diffusion/utils/transport/utils.py
diff --git a/opensora/models/text_encoder/__init__.py b/opensora/models/text_encoder/__init__.py
index 28cf771bb..2079ce004 100644
--- a/opensora/models/text_encoder/__init__.py
+++ b/opensora/models/text_encoder/__init__.py
@@ -1,6 +1,6 @@
import torch
from torch import nn
-from transformers import T5EncoderModel, CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor
from opensora.utils.utils import get_precision
@@ -9,7 +9,15 @@ class T5Wrapper(nn.Module):
def __init__(self, args, **kwargs):
super(T5Wrapper, self).__init__()
self.model_name = args.text_encoder_name
- self.text_enc = T5EncoderModel.from_pretrained(self.model_name, cache_dir=args.cache_dir, **kwargs).eval()
+        if 'mt5' in self.model_name:
+            from transformers import MT5EncoderModel
+            self.text_enc = MT5EncoderModel.from_pretrained(self.model_name, cache_dir=args.cache_dir, **kwargs).eval()
+        else:
+            from transformers import T5EncoderModel
+            self.text_enc = T5EncoderModel.from_pretrained(self.model_name, cache_dir=args.cache_dir, **kwargs).eval()
def forward(self, input_ids, attention_mask):
text_encoder_embs = self.text_enc(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
@@ -30,6 +38,10 @@ def forward(self, input_ids, attention_mask):
text_encoder = {
+ 'google/mt5-xl': T5Wrapper,
+ 'google/mt5-xxl': T5Wrapper,
+ 'google/umt5-xl': T5Wrapper,
+ 'google/umt5-xxl': T5Wrapper,
'DeepFloyd/t5-v1_1-xxl': T5Wrapper,
'openai/clip-vit-large-patch14': CLIPWrapper
}
@@ -37,12 +49,22 @@ def forward(self, input_ids, attention_mask):
def get_text_enc(args):
"""deprecation"""
- text_enc = text_encoder.get(args.text_encoder_name, None)
+ encoder_key = None
+ for key in text_encoder.keys():
+ if key in args.text_encoder_name:
+ encoder_key = key
+ break
+ text_enc = text_encoder.get(encoder_key, None)
assert text_enc is not None
return text_enc(args)
def get_text_warpper(text_encoder_name):
"""deprecation"""
- text_enc = text_encoder.get(text_encoder_name, None)
+ encoder_key = None
+ for key in text_encoder.keys():
+ if key in text_encoder_name:
+ encoder_key = key
+ break
+ text_enc = text_encoder.get(encoder_key, None)
assert text_enc is not None
return text_enc
diff --git a/opensora/npu_config.py b/opensora/npu_config.py
new file mode 100644
index 000000000..7e39ef034
--- /dev/null
+++ b/opensora/npu_config.py
@@ -0,0 +1,410 @@
+import math
+import mmap
+import os
+import pickle
+import random
+import numpy as np
+import torch
+import subprocess
+import sys
+import threading
+import gc
+import torch.distributed as dist
+
+from opensora.adaptor.zp_manager import zp_manager
+
+try:
+ import torch_npu
+
+ npu_is_available = True
+ from torch_npu.contrib import transfer_to_npu
+except ImportError:
+ npu_is_available = False
+
+from contextlib import contextmanager
+import types
+
+
+def compress_video(input_file, output_file, out_size):
+ """使用 ffmpeg 压缩视频文件。"""
+ command = [
+ 'ffmpeg',
+ '-i', input_file,
+ '-vf', f"scale='min({out_size},iw)':'min({out_size},ih)':force_original_aspect_ratio=decrease",
+ '-c:v', 'libx264',
+ '-crf', '18',
+ '-preset', 'slow',
+ '-c:a', 'copy',
+ output_file
+ ]
+ subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+
+@contextmanager
+def set_run_dtype(x, dtype=None):
+    # Record the tensor's original dtype so it can be restored afterwards
+    npu_config.original_run_dtype = x.dtype
+    # Force subsequent ops inside the `with` block to run in the requested dtype
+    npu_config.current_run_dtype = dtype
+    try:
+        # Yield control back to the body of the `with` statement
+        yield
+    finally:
+        # Reset the dtype overrides
+ npu_config.current_run_dtype = None
+ npu_config.original_run_dtype = None
+
+
+class NPUConfig:
+ N_NPU_PER_NODE = 8
+
+ def __init__(self):
+ self.on_npu = npu_is_available
+ self.node_world_size = self.N_NPU_PER_NODE
+ self.profiling = False
+ self.profiling_step = 5
+ self.enable_FA = True
+ self.enable_FP32 = False
+ self.load_pickle = True
+ self.use_small_dataset = False
+ self.current_run_dtype = None
+ self.original_run_dtype = None
+ self.zp_manager = zp_manager
+ self.replaced_type = torch.float32
+ self.conv_dtype = torch.float16
+        self.inf_float = -10000.0  # large negative value used as "-inf" in attention biases
+
+ if self.use_small_dataset:
+ self.load_pickle = False
+
+ self._loss = []
+ self.work_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ self.pickle_save_path = f"{self.work_path}/pickles"
+ self.mm = dict()
+
+ if self.on_npu:
+ import deepspeed
+ import sys
+ torch_npu.npu.set_compile_mode(jit_compile=False)
+
+ import deepspeed.runtime.utils as utils
+ from opensora.adaptor.utils import all_gather_dp_groups, all_gather_into_tensor_dp_groups
+ utils.all_gather_dp_groups = all_gather_dp_groups
+
+ import deepspeed.runtime.bf16_optimizer as bf16_optimizer
+ from opensora.adaptor.bf16_optimizer import BF16_Optimizer
+ self.replace_methods(bf16_optimizer.BF16_Optimizer, BF16_Optimizer)
+
+ from opensora.adaptor.stage_1_and_2 import DeepSpeedZeroOptimizer
+ import deepspeed.runtime.zero.stage_1_and_2 as stage_1_and_2
+ self.replace_methods(stage_1_and_2.DeepSpeedZeroOptimizer, DeepSpeedZeroOptimizer, ['_has_inf_or_nan'])
+
+ import deepspeed.runtime.engine as engine
+ from opensora.adaptor.engine import DeepSpeedEngine
+ self.replace_methods(engine.DeepSpeedEngine, DeepSpeedEngine, skip_fcns=['__init__', '_copy_recovery_script', '_change_recovery_script_permissions'])
+
+ if "RANK" in os.environ:
+ self.rank = int(os.environ["RANK"])
+ self.world_size = int(os.environ["WORLD_SIZE"])
+ torch_npu.npu.set_device(self.get_local_rank())
+ else:
+ self.rank = torch.cuda.current_device()
+ self.world_size = self.N_NPU_PER_NODE
+ self.print_with_rank(f"The npu_config.on_npu is {self.on_npu}")
+ self.bind_thread_to_cpu()
+ gc.set_threshold(700, 10, 10000)
+
+ def get_total_cores(self):
+ try:
+ total_cores = os.sysconf('SC_NPROCESSORS_ONLN')
+ except (AttributeError, ValueError):
+ total_cores = os.cpu_count()
+ return total_cores
+
+ def bind_thread_to_cpu(self):
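+        # Pin this process to its own slice of CPU cores via `taskset`, presumably to
+        # keep the 8 per-node worker processes from contending for the same cores.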
+ total_cores = self.get_total_cores()
+        # Number of CPU cores available to each device
+        cores_per_rank = total_cores // 8
+        # Local rank of this process on the node
+        local_rank = self.rank % 8
+        # CPU core range assigned to this rank
+        start_core = local_rank * cores_per_rank
+        end_core = start_core + cores_per_rank - 1
+        # Build the core-range string for taskset
+        cpu_cores_range = f"{start_core}-{end_core}"
+ pid = os.getpid()
+ command = f"taskset -cp {cpu_cores_range} {pid}"
+
+ subprocess.run(command, shell=True, check=True)
+ return f"Binding Cores:{self.rank}:{pid}:{cpu_cores_range}"
+
+ def replace_methods(self, target_class, source_class, skip_fcns=[], only_include_fcns=None):
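+        # Monkey-patch plain functions from `source_class` onto `target_class`, so the
+        # DeepSpeed internals above can be swapped for NPU-compatible implementations
+        # without forking the library; static/class methods and `skip_fcns` are kept.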
+ for attr_name in dir(source_class):
+ attr_value = getattr(source_class, attr_name)
+ if attr_name in source_class.__dict__:
+ attr_class_value = source_class.__dict__[attr_name]
+ else:
+ attr_class_value = attr_value
+ if (isinstance(attr_class_value, staticmethod) or isinstance(attr_class_value, classmethod)
+ or attr_name in skip_fcns):
+ print(f"skip replace {attr_name}")
+ continue
+
+ if only_include_fcns is not None and attr_name not in only_include_fcns:
+ continue
+
+ elif isinstance(attr_value, types.FunctionType):
+ setattr(target_class, attr_name, attr_value)
+
+ def get_attention_mask(self, attention_mask, repeat_num):
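+        # With fusion attention enabled the mask must be boolean, and it is repeated
+        # along (presumably) the head dimension so every head sees the same mask.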
+ if self.on_npu and attention_mask is not None:
+ if npu_config.enable_FA:
+ attention_mask = attention_mask.to(torch.bool)
+ attention_mask = attention_mask.repeat(1, repeat_num, 1)
+        return attention_mask
+
+    def set_current_run_dtype(self, variables):
+ if variables[0].dtype != self.current_run_dtype and self.current_run_dtype is not None:
+ for index, var in enumerate(variables):
+ variables[index] = var.to(self.current_run_dtype)
+ return tuple(variables)
+
+ def restore_dtype(self, x):
+ if x.dtype != self.original_run_dtype and self.original_run_dtype is not None:
+ x = x.to(self.original_run_dtype)
+ return x
+
+ def get_output_video_path(self, name):
+ os.makedirs(f"{self.work_path}/output_videos", exist_ok=True)
+ return f"{self.work_path}/output_videos/{name}"
+
+ def get_node_id(self):
+ return self.rank // self.node_world_size
+
+ def get_node_size(self):
+ return self.world_size // self.node_world_size
+
+ def get_local_rank(self):
+ return self.rank % self.N_NPU_PER_NODE
+
+ def get_pickle_path(self, file_name):
+ return f"{self.pickle_save_path}/{file_name}_local_n63"
+
+ def free_mm(self):
+ for key, value in self.mm.items():
+ value.close()
+ self.mm.clear()
+
+ def __del__(self):
+ self.free_mm()
+
+ def try_load_pickle(self, file_name, function):
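+        # Cache the result of the (potentially expensive) `function()` call as a
+        # pickle on disk; only the first rank on each node writes, every rank can read.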
+ file_name = self.get_pickle_path(file_name)
+ if os.path.exists(file_name) and self.load_pickle:
+ with open(file_name, 'rb') as file:
+ loaded_data = pickle.load(file)
+ return loaded_data
+ else:
+ data = function()
+ if not self.use_small_dataset:
+ if self.rank % self.N_NPU_PER_NODE == 0:
+                    # Only the first rank on each node needs to write the file
+ os.makedirs(self.pickle_save_path, exist_ok=True)
+ with open(file_name, 'wb') as file:
+ pickle.dump(data, file, pickle.HIGHEST_PROTOCOL)
+ return data
+
+ def try_get_vid_path(self, file, out_size=1024):
+ output_file = file.rsplit(".", 1)[0] + f"_resize{out_size}.mp4"
+ if not os.path.exists(output_file):
+ return file
+ # compress_video(file, output_file, out_size)
+ return output_file
+
+ def npu_format_cast(self, x):
+ return torch_npu.npu_format_cast(x, 2)
+
+ def calc_grad_norm(self, model):
+        # Gradient-norm computation is currently disabled: gathering the full
+        # gradients (e.g. via deepspeed.utils.safe_get_full_grad) is expensive,
+        # so this always returns 0.
+        grad_norm = 0
+
+ return grad_norm
+
+ def _run(self, operator, x, tmp_dtype, out_dtype=None, out_nd_format=False):
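+        # Run `operator` in a fixed working dtype with autocast disabled, then cast
+        # the result back to `out_dtype`; presumably a workaround for ops that are
+        # unsupported or numerically unstable in low precision on NPU.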
+ if self.on_npu:
+ if out_dtype is None:
+ out_dtype = x.dtype
+
+ with torch.cuda.amp.autocast(enabled=False):
+ x = operator.to(device=x.device, dtype=tmp_dtype)(x.to(tmp_dtype))
+ x = x.to(out_dtype)
+ if out_nd_format:
+ return self.npu_format_cast(x)
+ else:
+ return x
+ else:
+ return operator(x)
+
+ def run_group_norm(self, operator, x):
+ return self._run(operator, x, torch.float32)
+
+ def run_layer_norm(self, operator, x):
+ return self._run(operator, x, torch.float32)
+
+ def print_tensor_stats(self, tensor, name="Tensor", rank=None):
+ if rank and rank != self.rank:
+ return
+
+ if tensor is None:
+ self.print_msg(f"Tensor {name} is None.")
+ return
+
+ x_dtype = tensor.dtype
+ tensor = tensor.to(torch.bfloat16)
+ max_val = tensor.max().item()
+ min_val = tensor.min().item()
+        abs_max_val = max(abs(max_val), abs(min_val))
+ mean_val = tensor.mean().item()
+ median_val = tensor.median().item()
+ std_val = tensor.std().item()
+ shape = tensor.shape
+ self.print_msg(
+ f"{name} - Max: {max_val}, Min: {min_val}, Mean: {mean_val}, AbsMax: {abs_max_val},"
+ f"Median: {median_val}, Std: {std_val}, Shape: {shape}, Type: {x_dtype}")
+
+ def run_conv3d(self, operator, x, out_dtype):
+ return self._run(operator, x, self.conv_dtype, out_dtype, out_nd_format=True)
+
+ def run_pool_2d(self, operator, x):
+ return self._run(operator, x, self.replaced_type)
+
+ def run_pad_2d(self, operator, x, pad, mode="constant"):
+ if self.on_npu:
+ x_dtype = x.dtype
+ x = x.to(self.replaced_type)
+ x = operator(x, pad, mode)
+ x = x.to(x_dtype)
+ else:
+ x = operator(x, pad, mode)
+ return x
+
+ def seed_everything(self, seed=100):
+ seed += self.rank
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
+ def print_with_rank(self, msg, rank=0, save=False):
+ if self.rank == rank:
+ print(f"{msg}", flush=True)
+ if save:
+ self._loss.append(msg)
+
+ def print_msg(self, msg, on=True, rank=None):
+ if on:
+ if self.rank == rank or rank is None:
+ print(f"[RANK-{self.rank}]: {msg}", flush=True)
+
+ def save_loss(self, filename, rank=0):
+ if self.rank == rank:
+ import json
+ with open(filename, 'w') as file:
+ json.dump(self._loss, file, indent=4)
+
+ def run_attention(self, query, key, value, atten_mask, input_layout, head_dim, head_num):
+ if self.enable_FA:
+ hidden_states = torch_npu.npu_fusion_attention(query, key, value,
+ atten_mask=atten_mask,
+ input_layout=input_layout,
+ scale=1 / math.sqrt(head_dim),
+ head_num=head_num)[0]
+ else:
+ hidden_states = self.scaled_dot_product_attention(query, key, value,
+ atten_mask=atten_mask,
+ input_layout=input_layout,
+ scale=1 / math.sqrt(head_dim),
+ head_num=head_num)
+ return hidden_states
+
+ def scaled_dot_product_attention(self, query, key, value, input_layout, head_num=None,
+ atten_mask=None, scale=None, dropout_p=0.0, is_causal=False) -> torch.Tensor:
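+        # Reference (non-fused) scaled dot-product attention used when fusion
+        # attention is disabled: reshape q/k/v from BSH or SBH layout to
+        # (batch, heads, seq, head_dim), compute softmax(q @ k^T * scale + bias) @ v,
+        # and restore the input layout.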
+ def trans_tensor_shape(x, layout, head_num):
+ if layout == "BSH":
+ batch = x.shape[0]
+ x = x.view(batch, -1, head_num, x.shape[-1] // head_num).transpose(1, 2).contiguous()
+ elif layout == "SBH":
+ batch = x.shape[1]
+ x = x.view(-1, batch * head_num, x.shape[-1] // head_num).transpose(0, 1).contiguous()
+ x = x.view(batch, head_num, -1, x.shape[-1])
+ return x
+
+ query = trans_tensor_shape(query, input_layout, head_num)
+ key = trans_tensor_shape(key, input_layout, head_num)
+ value = trans_tensor_shape(value, input_layout, head_num)
+
+ attn_weight = query @ key.transpose(-2, -1) * scale
+ attn_bias = torch.zeros_like(attn_weight, dtype=query.dtype, device=query.device)
+ if is_causal:
+ assert atten_mask is None
+ temp_mask = torch.zeros_like(attn_weight, dtype=torch.bool, device=query.device).tril(diagonal=0)
+ attn_bias.masked_fill_(temp_mask.logical_not(), npu_config.inf_float)
+            attn_bias = attn_bias.to(query.dtype)
+
+ if atten_mask is not None:
+ assert (not self.enable_FA) and atten_mask.dtype != torch.bool, \
+ "attention_mask must not be bool type when use this function"
+
+ attn_weight += attn_bias
+ attn_weight = torch.softmax(attn_weight, dim=-1)
+ attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+ output = attn_weight @ value
+ if input_layout == "BSH":
+ output = output.transpose(1, 2).contiguous().view(output.shape[0], -1, head_num * output.shape[-1])
+ else:
+ output = output.view(output.shape[0] * head_num, -1, output.shape[-1]).transpose(0, 1).contiguous()
+ output = output.view(output.shape[0], -1, head_num * output.shape[-1])
+ return output
+
+ def print_tensor_with_rank(self, name, tensor, rank=[0], dim_print_cnt=[]):
+ if type(rank) is not list:
+ rank = [rank]
+ if self.rank in rank:
+ def print_dim(tensor_, indices):
+ if tensor_.dim() == len(indices):
+ return '{0:10.5f} '.format(tensor[tuple(indices)].detach().item())
+ else:
+ cur_dim = len(indices)
+ ret = ''
+ for x in range(0, tensor_.size(cur_dim), tensor_.size(cur_dim) // dim_print_cnt[cur_dim]):
+ ret += print_dim(tensor_, indices + [x])
+ return ret + '\n'
+
+ print(name, tensor.size(), self.rank, '\n', print_dim(tensor, []))
+
+
+npu_config = NPUConfig()
diff --git a/opensora/sample/pipeline_inpaint.py b/opensora/sample/pipeline_inpaint.py
new file mode 100644
index 000000000..d4df874be
--- /dev/null
+++ b/opensora/sample/pipeline_inpaint.py
@@ -0,0 +1,365 @@
+from typing import Callable, List, Optional, Tuple, Union
+import math
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+import numpy as np
+
+
+from diffusers.utils import (
+ BACKENDS_MAPPING,
+ deprecate,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+try:
+ from opensora.npu_config import npu_config
+except ImportError:
+ npu_config = None
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+from .pipeline_opensora import EXAMPLE_DOC_STRING, retrieve_timesteps, OpenSoraPipeline
+
+class OpenSoraInpaintPipeline(OpenSoraPipeline):
+
+ def register_image_transforms(self, transforms):
+ self.image_transforms = transforms
+        print('image transforms registered')
+
+
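+    # Build the masked conditioning video: each frame is multiplied elementwise by
+    # (1 - mask) so masked regions are zeroed; both results are returned as tensors.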
+    def get_masked_video(self, video, masks):
+
+        masks = masks[:, 0:1, :, :]
+
+        images = [video[i] for i in range(video.shape[0])]
+        masks_list = [masks[i] for i in range(masks.shape[0])]
+
+        masked_images = []
+
+        for image, mask in zip(images, masks_list):
+            mask = mask.astype(bool)
+            if len(mask.shape) == 2:
+                mask = np.expand_dims(mask, axis=2)
+            masked_img = image * (1 - mask)
+            masked_images.append(masked_img)
+
+        masked_video = np.stack(masked_images)  # [(H, W, C)] -> (T, H, W, C)
+
+        masked_video_tensor = torch.from_numpy(masked_video)  # numpy -> tensor
+
+        if len(masks.shape) == 3:
+            masks = np.expand_dims(masks, axis=3)
+
+        masks = torch.from_numpy(masks)
+
+        return masked_video_tensor, masks
+
+ def get_masked_video_mask(
+ self,
+ video,
+ mask,
+ batch_size,
+ num_images_per_prompt,
+ num_frames,
+ height,
+ width,
+ do_classifier_free_guidance,
+ weight_dtype,
+ device
+ ):
+        # video: (T, H, W, C); mask: (T, 1, H, W); masked_video: (T, C, H, W)
+        masked_video, mask = self.get_masked_video(video, mask)
+
+        T = masked_video.shape[0]
+
+        masked_video = masked_video.to(device)
+        mask = mask.to(device, dtype=torch.float16)
+
+        mask = mask / 255.0  # masks arrive as 0/255 images; normalise to [0, 1]
+
+        if not hasattr(self, 'image_transforms'):
+            raise ValueError("`image_transforms` is not defined; call `register_image_transforms` first")
+        masked_video = self.image_transforms(masked_video)
+
+ masked_video = masked_video.unsqueeze(0).repeat(batch_size * num_images_per_prompt, 1, 1, 1, 1).transpose(1, 2).contiguous() # b c t h w
+ mask = mask.unsqueeze(0).repeat(batch_size * num_images_per_prompt, 1, 1, 1, 1).transpose(1, 2).contiguous() # b c t h w
+ masked_video = masked_video.to(self.vae.vae.dtype)
+
+ masked_video = self.vae.encode(masked_video)
+
+        # The mask is downsampled directly in pixel space, not through the VAE
+ mask = rearrange(mask, 'b c t h w -> (b c t) 1 h w')
+ latent_size = (height // self.vae.vae_scale_factor[1], width // self.vae.vae_scale_factor[2])
+
+ if num_frames % 2 == 1:
+ latent_size_t = (num_frames - 1) // self.vae.vae_scale_factor[0] + 1
+ else:
+ latent_size_t = num_frames // self.vae.vae_scale_factor[0]
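+        # Align the mask with the latent grid: resize spatially to the latent size,
+        # repeat the first frame so the temporal length equals
+        # latent_size_t * vae_scale_factor[0] (the causal VAE treats the first frame
+        # specially), then fold the temporal stride into the channel dimension.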
+ mask = F.interpolate(mask, size=latent_size, mode='bilinear')
+ mask = rearrange(mask, '(b c t) 1 h w -> b c t h w', t=T, c=1)
+ mask_first_frame = mask[:, :, 0:1].repeat(1, 1, self.vae.vae_scale_factor[0], 1, 1)
+ mask = torch.cat([mask_first_frame, mask[:, :, 1:]], dim=2).contiguous()
+ mask = mask.view(batch_size, latent_size_t, self.vae.vae_scale_factor[0], *latent_size)
+ mask = mask.transpose(1, 2).contiguous()
+
+ masked_video = torch.cat([masked_video] * 2) if do_classifier_free_guidance else masked_video
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+ masked_video = masked_video.to(weight_dtype)
+ mask = mask.to(weight_dtype)
+
+ return masked_video, mask
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ # NOTE inpaint
+ video,
+ masks,
+ prompt: Union[str, List[str]] = None,
+ negative_prompt: str = "",
+ num_inference_steps: int = 20,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.5,
+ motion_score: float = None,
+ num_images_per_prompt: Optional[int] = 1,
+ num_frames: Optional[int] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ use_resolution_binning: bool = True,
+ max_sequence_length: int = 300,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+
+ # 1. Check inputs. Raise error if not correct
+ num_frames = num_frames or self.transformer.config.sample_size_t * self.vae.vae_scale_factor[0]
+ height = height or self.transformer.config.sample_size[0] * self.vae.vae_scale_factor[1]
+ width = width or self.transformer.config.sample_size[1] * self.vae.vae_scale_factor[2]
+ self.check_inputs(
+ prompt,
+ num_frames,
+ height,
+ width,
+ negative_prompt,
+ callback_steps,
+ prompt_embeds,
+ negative_prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_attention_mask,
+ )
+
+        # 2. Determine the batch size
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = getattr(self, '_execution_device', None) or getattr(self, 'device', None) or torch.device('cuda')
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ (
+ prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_embeds,
+ negative_prompt_attention_mask,
+ ) = self.encode_prompt(
+ prompt,
+ do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ prompt_attention_mask=prompt_attention_mask,
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
+ clean_caption=clean_caption,
+ max_sequence_length=max_sequence_length,
+ )
+ if do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+ # 4. Prepare timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+ # 5. Prepare latents.
+ latent_channels = self.transformer.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ latent_channels,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 6.1 Prepare micro-conditions.
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+
+ # 7. Denoising loop
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+ masked_video, mask = self.get_masked_video_mask(
+ video,
+ masks,
+ batch_size,
+ num_images_per_prompt,
+ num_frames,
+ height,
+ width,
+ do_classifier_free_guidance,
+ latents.dtype,
+ device
+ )
+
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # Inpainting conditioning: concatenate the masked-video latents and the
+                # downsampled mask with the noisy latents along the channel dimension.
+ latent_model_input = torch.cat([latent_model_input, masked_video, mask], dim=1)
+
+ current_timestep = t
+ if not torch.is_tensor(current_timestep):
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+ # This would be a good case for the `match` statement (Python 3.10+)
+ is_mps = latent_model_input.device.type == "mps"
+ if isinstance(current_timestep, float):
+ dtype = torch.float32 if is_mps else torch.float64
+ else:
+ dtype = torch.int32 if is_mps else torch.int64
+ current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+ elif len(current_timestep.shape) == 0:
+ current_timestep = current_timestep[None].to(latent_model_input.device)
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+ if prompt_embeds.ndim == 3:
+ prompt_embeds = prompt_embeds.unsqueeze(1) # b l d -> b 1 l d
+ if prompt_attention_mask.ndim == 2:
+ prompt_attention_mask = prompt_attention_mask.unsqueeze(1) # b l -> b 1 l
+ # prepare attention_mask.
+ # b c t h w -> b t h w
+ attention_mask = torch.ones_like(latent_model_input)[:, 0]
+                # predict the noise residual
+ noise_pred = self.transformer(
+ latent_model_input,
+ attention_mask=attention_mask,
+ encoder_hidden_states=prompt_embeds,
+ encoder_attention_mask=prompt_attention_mask,
+ timestep=current_timestep,
+ added_cond_kwargs=added_cond_kwargs,
+ motion_score=motion_score,
+ return_dict=False,
+ )[0]
+                # perform classifier-free guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma: keep only the predicted-noise channels
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+                # compute the previous sample: x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ step_idx = i // getattr(self.scheduler, "order", 1)
+ callback(step_idx, t, latents)
+ if not output_type == "latent":
+ # b t h w c
+ image = self.decode_latents(latents)
+ image = image[:, :num_frames, :height, :width]
+ else:
+ image = latents
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
+
diff --git a/opensora/sample/pipeline_opensora.py b/opensora/sample/pipeline_opensora.py
new file mode 100644
index 000000000..c650f5ab0
--- /dev/null
+++ b/opensora/sample/pipeline_opensora.py
@@ -0,0 +1,1196 @@
+# Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+from typing import Callable, List, Optional, Tuple, Union
+import math
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+
+from diffusers.models import AutoencoderKL, Transformer2DModel
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.utils import (
+ BACKENDS_MAPPING,
+ deprecate,
+ is_bs4_available,
+ is_ftfy_available,
+ logging,
+ replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+try:
+ from opensora.npu_config import npu_config
+except ImportError:
+ npu_config = None
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+if is_bs4_available():
+ from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+ import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import torch
+ >>> from diffusers import PixArtSigmaPipeline
+
+ >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-Sigma-XL-2-512-MS" too.
+ >>> pipe = PixArtSigmaPipeline.from_pretrained(
+ ... "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
+ ... )
+ >>> # Enable memory optimizations.
+ >>> # pipe.enable_model_cpu_offload()
+
+ >>> prompt = "A small cactus with a happy face in the Sahara desert."
+ >>> image = pipe(prompt).images[0]
+ ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
+
+
+class OpenSoraPipeline(DiffusionPipeline):
+ r"""
+    Pipeline for text-to-video generation with Open-Sora Plan, adapted from the PixArt-Sigma pipeline.
+ """
+
+ bad_punct_regex = re.compile(
+ r"["
+ + "#®•©™&@·º½¾¿¡§~"
+ + r"\)"
+ + r"\("
+ + r"\]"
+ + r"\["
+ + r"\}"
+ + r"\{"
+ + r"\|"
+ + "\\"
+ + r"\/"
+ + r"\*"
+ + r"]{1,}"
+ ) # noqa
+
+ _optional_components = ["tokenizer", "text_encoder"]
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+ def __init__(
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ vae: AutoencoderKL,
+ transformer: Transformer2DModel,
+ scheduler: DPMSolverMultistepScheduler,
+ ):
+ super().__init__()
+
+ self.register_modules(
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+ )
+
+ # self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt
+ def encode_prompt(
+ self,
+ prompt: Union[str, List[str]],
+ do_classifier_free_guidance: bool = True,
+ negative_prompt: str = "",
+ num_samples_per_prompt: int = 1,
+ device: Optional[torch.device] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ clean_caption: bool = False,
+ max_sequence_length: int = 120,
+ **kwargs,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+ instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+ PixArt-Alpha, this should be "".
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+ whether to use classifier free guidance or not
+ num_samples_per_prompt (`int`, *optional*, defaults to 1):
+ number of images that should be generated per prompt
+ device: (`torch.device`, *optional*):
+ torch device to place the resulting embeddings on
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Alpha, it should be the embeddings of the ""
+ string.
+ clean_caption (`bool`, defaults to `False`):
+ If `True`, the function will preprocess and clean the provided caption before encoding.
+ max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
+ """
+
+ if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and has no effect on the results. It will be removed in a future version."
+ deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+ if device is None:
+ device = getattr(self, '_execution_device', None) or getattr(self, 'device', None) or torch.device('cuda')
+
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ # See Section 3.1. of the paper.
+ max_length = max_sequence_length
+
+ if prompt_embeds is None:
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ add_special_tokens=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {max_length} tokens: {removed_text}"
+ )
+
+ prompt_attention_mask = text_inputs.attention_mask
+ prompt_attention_mask = prompt_attention_mask.to(device)
+
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+ prompt_embeds = prompt_embeds[0]
+
+ if self.text_encoder is not None:
+ dtype = self.text_encoder.dtype
+ elif self.transformer is not None:
+ dtype = self.transformer.dtype
+ else:
+ dtype = None
+
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_samples_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_samples_per_prompt, seq_len, -1)
+ prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+ prompt_attention_mask = prompt_attention_mask.repeat(num_samples_per_prompt, 1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
+ uncond_tokens = [negative_prompt] * batch_size
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
+ max_length = prompt_embeds.shape[1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_attention_mask=True,
+ add_special_tokens=True,
+ return_tensors="pt",
+ )
+ negative_prompt_attention_mask = uncond_input.attention_mask
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+
+ negative_prompt_embeds = self.text_encoder(
+ uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
+ )
+ negative_prompt_embeds = negative_prompt_embeds[0]
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_samples_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_samples_per_prompt, seq_len, -1)
+
+ negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+ negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_samples_per_prompt, 1)
+ else:
+ negative_prompt_embeds = None
+ negative_prompt_attention_mask = None
+
+ return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
+ def check_inputs(
+ self,
+ prompt,
+ num_frames,
+ height,
+ width,
+ negative_prompt,
+ callback_steps,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ prompt_attention_mask=None,
+ negative_prompt_attention_mask=None,
+ ):
+ if num_frames <= 0:
+ raise ValueError(f"`num_frames` have to be positive but is {num_frames}.")
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and prompt_attention_mask is None:
+ raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+ if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+ raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+ if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+ raise ValueError(
+ "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+ f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+ f" {negative_prompt_attention_mask.shape}."
+ )
+
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+ def _text_preprocessing(self, text, clean_caption=False):
+ if clean_caption and not is_bs4_available():
+ logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+ logger.warning("Setting `clean_caption` to False...")
+ clean_caption = False
+
+ if clean_caption and not is_ftfy_available():
+ logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+ logger.warning("Setting `clean_caption` to False...")
+ clean_caption = False
+
+ if not isinstance(text, (tuple, list)):
+ text = [text]
+
+ def process(text: str):
+ if clean_caption:
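+                # applied twice, matching the upstream IF pipeline; one pass can expose patterns the next removes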
+ text = self._clean_caption(text)
+ text = self._clean_caption(text)
+ else:
+ text = text.lower().strip()
+ return text
+
+ return [process(t) for t in text]
+
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+ def _clean_caption(self, caption):
+ caption = str(caption)
+ caption = ul.unquote_plus(caption)
+ caption = caption.strip().lower()
+ caption = re.sub("", "person", caption)
+ # urls:
+ caption = re.sub(
+ r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
+ "",
+ caption,
+ ) # regex for urls
+ caption = re.sub(
+ r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
+ "",
+ caption,
+ ) # regex for urls
+ # html:
+ caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+ caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+ # 31C0—31EF CJK Strokes
+ # 31F0—31FF Katakana Phonetic Extensions
+ # 3200—32FF Enclosed CJK Letters and Months
+ # 3300—33FF CJK Compatibility
+ # 3400—4DBF CJK Unified Ideographs Extension A
+ # 4DC0—4DFF Yijing Hexagram Symbols
+ # 4E00—9FFF CJK Unified Ideographs
+ caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+ caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+ caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+ caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+ caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+ caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+ # caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+ #######################################################
+
+        # all types of dash --> "-"
+ caption = re.sub(
+ r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
+ "-",
+ caption,
+ )
+
+        # normalize all quotation marks to a single standard
+ caption = re.sub(r"[`´«»“”¨]", '"', caption)
+ caption = re.sub(r"[‘’]", "'", caption)
+
+ # "
+ caption = re.sub(r""?", "", caption)
+ # &
+ caption = re.sub(r"&", "", caption)
+
+        # IP addresses:
+ caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+ # article ids:
+ caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+ # \n
+ caption = re.sub(r"\\n", " ", caption)
+
+ # "#123"
+ caption = re.sub(r"#\d{1,3}\b", "", caption)
+ # "#12345.."
+ caption = re.sub(r"#\d{5,}\b", "", caption)
+ # "123456.."
+ caption = re.sub(r"\b\d{6,}\b", "", caption)
+ # filenames:
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+ #
+ caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
+ caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
+
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
+ caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
+
+ # this-is-my-cute-cat / this_is_my_cute_cat
+ regex2 = re.compile(r"(?:\-|\_)")
+ if len(re.findall(regex2, caption)) > 3:
+ caption = re.sub(regex2, " ", caption)
+
+ caption = ftfy.fix_text(caption)
+ caption = html.unescape(html.unescape(caption))
+
+ caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
+ caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
+ caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
+
+ caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+ caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+ caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+ caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
+
+ caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+ caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+ caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+ caption = re.sub(r"\s+", " ", caption)
+
+        caption = caption.strip()
+
+ caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+ caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+ caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+ caption = re.sub(r"^\.\S+$", "", caption)
+ return caption.strip()
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+ def prepare_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None):
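+        # latent shape: the VAE downsamples (t, h, w) by vae_scale_factor; odd frame counts map to
+        # ceil((num_frames - 1) / s_t) + 1 latent frames, keeping a dedicated latent for the leading frame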
+ shape = (
+ batch_size,
+ num_channels_latents,
+            (math.ceil((int(num_frames) - 1) / self.vae.vae_scale_factor[0]) + 1) if int(num_frames) % 2 == 1 else math.ceil(int(num_frames) / self.vae.vae_scale_factor[0]),
+            math.ceil(int(height) / self.vae.vae_scale_factor[1]),
+            math.ceil(int(width) / self.vae.vae_scale_factor[2]),
+ )
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ return latents
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ negative_prompt: str = "",
+ num_inference_steps: int = 20,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.5,
+        motion_score: Optional[float] = None,
+ num_samples_per_prompt: Optional[int] = 1,
+ num_frames: Optional[int] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ use_resolution_binning: bool = True,
+ max_sequence_length: int = 300,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ """
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 20):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equally spaced `num_inference_steps`
+ timesteps are used. Must be in descending order.
+ guidance_scale (`float`, *optional*, defaults to 4.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ num_samples_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+            num_frames (`int`, *optional*, defaults to `self.transformer.config.sample_size_t * self.vae.vae_scale_factor[0]`):
+                The number of video frames to generate.
+            height (`int`, *optional*, defaults to `self.transformer.config.sample_size[0] * self.vae.vae_scale_factor[1]`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.transformer.config.sample_size[1] * self.vae.vae_scale_factor[2]`):
+ The width in pixels of the generated image.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Sigma, this negative prompt should be "". If not
+ provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+ negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+ Pre-generated attention mask for negative text embeddings.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ clean_caption (`bool`, *optional*, defaults to `True`):
+ Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+ be installed. If the dependencies are not installed, the embeddings will be created from the raw
+ prompt.
+            use_resolution_binning (`bool`, *optional*, defaults to `True`):
+ If set to `True`, the requested height and width are first mapped to the closest resolutions using
+ `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+ the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int`, *optional*, defaults to 300): Maximum sequence length to use with the `prompt`.
+
+ Examples:
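+            A minimal usage sketch (the checkpoint path, device, and frame count below are placeholder assumptions):
+
+            ```py
+            >>> import torch
+            >>> # assumes a local Open-Sora checkpoint with tokenizer/text_encoder/vae/transformer/scheduler subfolders
+            >>> pipe = OpenSoraPipeline.from_pretrained("path/to/open-sora-checkpoint", torch_dtype=torch.float16)
+            >>> pipe.to("cuda")
+            >>> videos = pipe("A small cactus with a happy face in the Sahara desert.", num_frames=65).images
+            ```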
+
+ Returns:
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+ """
+ # 1. Check inputs. Raise error if not correct
+ num_frames = num_frames or self.transformer.config.sample_size_t * self.vae.vae_scale_factor[0]
+ height = height or self.transformer.config.sample_size[0] * self.vae.vae_scale_factor[1]
+ width = width or self.transformer.config.sample_size[1] * self.vae.vae_scale_factor[2]
+ self.check_inputs(
+ prompt,
+ num_frames,
+ height,
+ width,
+ negative_prompt,
+ callback_steps,
+ prompt_embeds,
+ negative_prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_attention_mask,
+ )
+
+        # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+ device = getattr(self, '_execution_device', None) or getattr(self, 'device', None) or torch.device('cuda')
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ (
+ prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_embeds,
+ negative_prompt_attention_mask,
+ ) = self.encode_prompt(
+ prompt,
+ do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ num_samples_per_prompt=num_samples_per_prompt,
+ device=device,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ prompt_attention_mask=prompt_attention_mask,
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
+ clean_caption=clean_caption,
+ max_sequence_length=max_sequence_length,
+ )
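+        # with classifier-free guidance, stack [negative, positive] embeddings so a single transformer
+        # forward pass scores both branches; they are split back apart via `noise_pred.chunk(2)` below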
+ if do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+ # 4. Prepare timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+ # 5. Prepare latents.
+ latent_channels = self.transformer.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_samples_per_prompt,
+ latent_channels,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 6.1 Prepare micro-conditions.
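+        # PixArt-style micro-conditioning slots; `None` leaves the extra resolution/aspect-ratio conditioning unused here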
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+
+ # 7. Denoising loop
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ current_timestep = t
+ if not torch.is_tensor(current_timestep):
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+ # This would be a good case for the `match` statement (Python 3.10+)
+ is_mps = latent_model_input.device.type == "mps"
+ if isinstance(current_timestep, float):
+ dtype = torch.float32 if is_mps else torch.float64
+ else:
+ dtype = torch.int32 if is_mps else torch.int64
+ current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+ elif len(current_timestep.shape) == 0:
+ current_timestep = current_timestep[None].to(latent_model_input.device)
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+ if prompt_embeds.ndim == 3:
+ prompt_embeds = prompt_embeds.unsqueeze(1) # b l d -> b 1 l d
+ if prompt_attention_mask.ndim == 2:
+ prompt_attention_mask = prompt_attention_mask.unsqueeze(1) # b l -> b 1 l
+ # prepare attention_mask.
+ # b c t h w -> b t h w
+ attention_mask = torch.ones_like(latent_model_input)[:, 0]
+ # predict noise model_output
+ noise_pred = self.transformer(
+ latent_model_input,
+ attention_mask=attention_mask,
+ encoder_hidden_states=prompt_embeds,
+ encoder_attention_mask=prompt_attention_mask,
+ timestep=current_timestep,
+ added_cond_kwargs=added_cond_kwargs,
+ motion_score=motion_score,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma: the transformer predicts (epsilon, sigma) stacked along channels; keep epsilon only
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+ # compute previous image: x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ step_idx = i // getattr(self.scheduler, "order", 1)
+ callback(step_idx, t, latents)
+        if output_type != "latent":
+ # b t h w c
+ image = self.decode_latents(latents)
+ image = image[:, :num_frames, :height, :width]
+ else:
+ image = latents
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
+
+    def decode_latents(self, latents):
+        video = self.vae.decode(latents.to(self.vae.vae.dtype))
+        # map from [-1, 1] to [0, 255], cast to uint8 on CPU, and reorder to (b, t, h, w, c)
+        video = ((video / 2.0 + 0.5).clamp(0, 1) * 255).to(dtype=torch.uint8).cpu().permute(0, 1, 3, 4, 2).contiguous()
+        return video
+
+
+from opensora.utils.freeinit_utils import get_freq_filter, freq_mix_3d
+class OpenSoraFreeInitPipeline(OpenSoraPipeline):
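+    """
+    `OpenSoraPipeline` variant implementing FreeInit-style noise reinitialization: after each sampling pass the
+    latents are re-diffused to the final noise level and their high-frequency content is replaced with fresh
+    Gaussian noise before sampling again. Iteration count and filter defaults are hard-coded in `__call__`.
+    """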
+ def __init__(
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ vae: AutoencoderKL,
+ transformer: Transformer2DModel,
+ scheduler: DPMSolverMultistepScheduler,
+ ):
+ super().__init__(tokenizer, text_encoder, vae, transformer, scheduler)
+ self.freq_filter = None
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ negative_prompt: str = "",
+ num_inference_steps: int = 20,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.5,
+        motion_score: Optional[float] = None,
+ num_samples_per_prompt: Optional[int] = 1,
+ num_frames: Optional[int] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ use_resolution_binning: bool = True,
+ max_sequence_length: int = 300,
+ filter_params=None,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ """
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+ instead.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_inference_steps (`int`, *optional*, defaults to 100):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+ timesteps are used. Must be in descending order.
+ guidance_scale (`float`, *optional*, defaults to 4.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ num_samples_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
+ The width in pixels of the generated image.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will ge generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
+ provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+ negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+ Pre-generated attention mask for negative text embeddings.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generate image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+ clean_caption (`bool`, *optional*, defaults to `True`):
+ Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+ be installed. If the dependencies are not installed, the embeddings will be created from the raw
+ prompt.
+ use_resolution_binning (`bool` defaults to `True`):
+ If set to `True`, the requested height and width are first mapped to the closest resolutions using
+ `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+ the requested resolution. Useful for generating non-square images.
+ max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+ returned where the first element is a list with the generated images
+ """
+ # 1. Check inputs. Raise error if not correct
+ num_frames = num_frames or self.transformer.config.sample_size_t * self.vae.vae_scale_factor[0]
+ height = height or self.transformer.config.sample_size[0] * self.vae.vae_scale_factor[1]
+ width = width or self.transformer.config.sample_size[1] * self.vae.vae_scale_factor[2]
+ self.check_inputs(
+ prompt,
+ num_frames,
+ height,
+ width,
+ negative_prompt,
+ callback_steps,
+ prompt_embeds,
+ negative_prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_attention_mask,
+ )
+
+        # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+ device = getattr(self, '_execution_device', None) or getattr(self, 'device', None) or torch.device('cuda')
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ # 3. Encode input prompt
+ (
+ prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_embeds,
+ negative_prompt_attention_mask,
+ ) = self.encode_prompt(
+ prompt,
+ do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ num_samples_per_prompt=num_samples_per_prompt,
+ device=device,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ prompt_attention_mask=prompt_attention_mask,
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
+ clean_caption=clean_caption,
+ max_sequence_length=max_sequence_length,
+ )
+ if do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+ # 4. Prepare timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+ # 5. Prepare latents.
+ latent_channels = self.transformer.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_samples_per_prompt,
+ latent_channels,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+        # NOTE: the FreeInit filtering below assumes a single sample; batch_size is fixed to 1 here
+        batch_size = 1
+        num_channels_latents = latent_channels
+ filter_shape = [
+ batch_size,
+ num_channels_latents,
+ (math.ceil((int(num_frames) - 1) / self.vae.vae_scale_factor[0]) + 1) if int(num_frames) % 2 == 1 else math.ceil(int(num_frames) / self.vae.vae_scale_factor[0]),
+ math.ceil(int(height) / self.vae.vae_scale_factor[1]),
+ math.ceil(int(width) / self.vae.vae_scale_factor[2]),
+ ]
+        # default to a Butterworth low-pass filter unless the caller supplied filter_params
+        if filter_params is None:
+            filter_params = dict(method='butterworth', n=4, d_s=0.25, d_t=0.25)
+        self.freq_filter = get_freq_filter(
+            filter_shape,
+            device=device,
+            filter_type=filter_params['method'],
+            n=filter_params['n'] if filter_params['method'] == "butterworth" else None,
+            d_s=filter_params['d_s'],
+            d_t=filter_params['d_t'],
+        )
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 6.1 Prepare micro-conditions.
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+
+ # 7. Denoising loop
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+
+ # FreeInit ------------------------------------------------------------------
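+        # Each iteration after the first re-diffuses the previous sample to t = T, then freq_mix_3d keeps its
+        # low-frequency band and injects fresh Gaussian noise into the high frequencies before re-sampling.
+        # NOTE: the settings below are hard-coded for this experimental path.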
+ num_iters = 2
+ num_videos_per_prompt = 1
+ save_intermediate = True
+ return_orig = True
+ save_dir = 'freeinit'
+ save_name = 'test'
+ latents_dtype = latents.dtype
+        for it in range(num_iters):
+            if it == 0:
+                initial_noise = latents.detach().clone()
+ else:
+ # 1. DDPM Forward with initial noise, get noisy latents z_T
+ # if use_fast_sampling:
+ # current_diffuse_timestep = self.scheduler.config.num_train_timesteps / num_iters * (iter + 1) - 1
+ # else:
+ # current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
+ current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1 # diffuse to t=999 noise level
+                diffuse_timesteps = torch.full((batch_size,), int(current_diffuse_timestep))
+ diffuse_timesteps = diffuse_timesteps.long()
+ z_T = self.scheduler.add_noise(
+ original_samples=latents.to(device),
+ noise=initial_noise.to(device),
+ timesteps=diffuse_timesteps.to(device)
+ )
+ # 2. create random noise z_rand for high-frequency
+ z_rand = torch.randn((batch_size * num_videos_per_prompt, num_channels_latents,
+ (math.ceil((int(num_frames) - 1) / self.vae.vae_scale_factor[0]) + 1) if int(num_frames) % 2 == 1 else math.ceil(int(num_frames) / self.vae.vae_scale_factor[0]),
+ math.ceil(int(height) / self.vae.vae_scale_factor[1]),
+ math.ceil(int(width) / self.vae.vae_scale_factor[2])
+ ), device=device)
+                # 3. Noise reinitialization: keep the low-frequency band of z_T, replace high frequencies with z_rand
+ latents = freq_mix_3d(z_T.to(dtype=torch.float32), z_rand, LPF=self.freq_filter)
+ latents = latents.to(latents_dtype)
+
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+
+ for i, t in enumerate(timesteps):
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ current_timestep = t
+ if not torch.is_tensor(current_timestep):
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+ # This would be a good case for the `match` statement (Python 3.10+)
+ is_mps = latent_model_input.device.type == "mps"
+ if isinstance(current_timestep, float):
+ dtype = torch.float32 if is_mps else torch.float64
+ else:
+ dtype = torch.int32 if is_mps else torch.int64
+ current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+ elif len(current_timestep.shape) == 0:
+ current_timestep = current_timestep[None].to(latent_model_input.device)
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+ if prompt_embeds.ndim == 3:
+ prompt_embeds = prompt_embeds.unsqueeze(1) # b l d -> b 1 l d
+ if prompt_attention_mask.ndim == 2:
+ prompt_attention_mask = prompt_attention_mask.unsqueeze(1) # b l -> b 1 l
+ # prepare attention_mask.
+ # b c t h w -> b t h w
+ attention_mask = torch.ones_like(latent_model_input)[:, 0]
+ # predict noise model_output
+ noise_pred = self.transformer(
+ latent_model_input,
+ attention_mask=attention_mask,
+ encoder_hidden_states=prompt_embeds,
+ encoder_attention_mask=prompt_attention_mask,
+ timestep=current_timestep,
+ added_cond_kwargs=added_cond_kwargs,
+ motion_score=motion_score,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # learned sigma: keep the epsilon half of the channel-stacked prediction
+                    if self.transformer.config.out_channels // 2 == latent_channels:
+                        noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+ # compute previous image: x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ step_idx = i // getattr(self.scheduler, "order", 1)
+ callback(step_idx, t, latents)
+            if save_intermediate:
+                # Post-processing: save this iteration's decoded video for inspection
+                import os
+                video = self.decode_latents(latents)
+                os.makedirs(save_dir, exist_ok=True)
+                self.save_videos_grid(video, f"{save_dir}/{save_name}_iter{it}.gif")
+
+            if return_orig and it == 0:
+                # keep the iteration-0 sample for comparison (currently not returned)
+                orig_video = self.decode_latents(latents)
+        if output_type != "latent":
+ # b t h w c
+ image = self.decode_latents(latents)
+ image = image[:, :num_frames, :height, :width]
+ else:
+ image = latents
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
+
+ def save_videos_grid(self, videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
+ from einops import rearrange
+ import torchvision
+ import os
+ import numpy as np
+ import imageio
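+        # videos is expected as (b, t, h, w, c); each frame's batch is tiled into one grid image and the
+        # frame sequence is written to `path` as an animated file via imageio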
+        videos = rearrange(videos, "b t h w c -> t b c h w")
+        outputs = []
+        for x in videos:
+            # tile the batch of this frame into a single image grid
+            x = torchvision.utils.make_grid(x, nrow=n_rows)
+            x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)  # c h w -> h w c
+            if rescale:
+                x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+            x = x.numpy()
+            if x.max() <= 1:  # assume [0, 1] floats if no value exceeds 1
+                x = x * 255
+            x = x.astype(np.uint8)
+            outputs.append(x)
+
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+ imageio.mimsave(path, outputs, fps=fps)
diff --git a/opensora/sample/pipeline_videogen.py b/opensora/sample/pipeline_opensora_sp.py
similarity index 65%
rename from opensora/sample/pipeline_videogen.py
rename to opensora/sample/pipeline_opensora_sp.py
index 303414b8c..03a84599e 100644
--- a/opensora/sample/pipeline_videogen.py
+++ b/opensora/sample/pipeline_opensora_sp.py
@@ -1,4 +1,4 @@
-# All rights reserved.
+# Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,26 +17,29 @@
import re
import urllib.parse as ul
from typing import Callable, List, Optional, Tuple, Union
-
+import math
import torch
-import einops
-from einops import rearrange
from transformers import T5EncoderModel, T5Tokenizer
-
-from diffusers.image_processor import VaeImageProcessor
+from einops import rearrange
from diffusers.models import AutoencoderKL, Transformer2DModel
from diffusers.schedulers import DPMSolverMultistepScheduler
from diffusers.utils import (
BACKENDS_MAPPING,
+ deprecate,
is_bs4_available,
is_ftfy_available,
logging,
replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.utils import BaseOutput
-from dataclasses import dataclass
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+# Prefer the Ascend NPU (HCCL) parallel states when torch_npu is available; fall back to the NCCL variants.
+try:
+    import torch_npu
+    from opensora.acceleration.parallel_states import get_sequence_parallel_state, hccl_info
+except ImportError:
+    torch_npu = None
+    from opensora.utils.parallel_states import get_sequence_parallel_state, nccl_info
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -46,16 +49,19 @@
if is_ftfy_available():
import ftfy
+
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch
- >>> from diffusers import PixArtAlphaPipeline
+ >>> from diffusers import PixArtSigmaPipeline
- >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
- >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
+ >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-Sigma-XL-2-512-MS" too.
+ >>> pipe = PixArtSigmaPipeline.from_pretrained(
+ ... "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
+ ... )
>>> # Enable memory optimizations.
- >>> pipe.enable_model_cpu_offload()
+ >>> # pipe.enable_model_cpu_offload()
>>> prompt = "A small cactus with a happy face in the Sahara desert."
>>> image = pipe(prompt).images[0]
@@ -63,47 +69,82 @@
"""
-@dataclass
-class VideoPipelineOutput(BaseOutput):
- video: torch.Tensor
-
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-class VideoGenPipeline(DiffusionPipeline):
- r"""
- Pipeline for text-to-image generation using PixArt-Alpha.
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
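+
+# Example (sketch): `retrieve_timesteps(scheduler, num_inference_steps=20, device="cuda")` yields the scheduler's
+# default 20-step schedule, while `retrieve_timesteps(scheduler, timesteps=[999, 749, 499, 249])` forces a custom
+# spacing when the scheduler's `set_timesteps` accepts a `timesteps` argument.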
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
- Args:
- vae ([`AutoencoderKL`]):
- Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
- text_encoder ([`T5EncoderModel`]):
- Frozen text-encoder. PixArt-Alpha uses
- [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
- [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
- tokenizer (`T5Tokenizer`):
- Tokenizer of class
- [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
- transformer ([`Transformer2DModel`]):
- A text conditioned `Transformer2DModel` to denoise the encoded image latents.
- scheduler ([`SchedulerMixin`]):
- A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+class OpenSoraPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-to-image generation using PixArt-Sigma.
"""
+
bad_punct_regex = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
+ r"["
+ + "#®•©™&@·º½¾¿¡§~"
+ + r"\)"
+ + r"\("
+ + r"\]"
+ + r"\["
+ + r"\}"
+ + r"\{"
+ + r"\|"
+ + "\\"
+ + r"\/"
+ + r"\*"
+ + r"]{1,}"
) # noqa
_optional_components = ["tokenizer", "text_encoder"]
model_cpu_offload_seq = "text_encoder->transformer->vae"
def __init__(
- self,
- tokenizer: T5Tokenizer,
- text_encoder: T5EncoderModel,
- vae: AutoencoderKL,
- transformer: Transformer2DModel,
- scheduler: DPMSolverMultistepScheduler,
+ self,
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ vae: AutoencoderKL,
+ transformer: Transformer2DModel,
+ scheduler: DPMSolverMultistepScheduler,
):
super().__init__()
@@ -113,27 +154,21 @@ def __init__(
# self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
- def mask_text_embeddings(self, emb, mask):
- if emb.shape[0] == 1:
- keep_index = mask.sum().item()
- return emb[:, :, :keep_index, :], keep_index # 1, 120, 4096 -> 1 7 4096
- else:
- masked_feature = emb * mask[:, None, :, None] # 1 120 4096
- return masked_feature, emb.shape[2]
-
- # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt
def encode_prompt(
- self,
- prompt: Union[str, List[str]],
- do_classifier_free_guidance: bool = True,
- negative_prompt: str = "",
- num_images_per_prompt: int = 1,
- device: Optional[torch.device] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- clean_caption: bool = False,
- mask_feature: bool = True,
+ self,
+ prompt: Union[str, List[str]],
+ do_classifier_free_guidance: bool = True,
+ negative_prompt: str = "",
+ num_images_per_prompt: int = 1,
+ device: Optional[torch.device] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ clean_caption: bool = False,
+ max_sequence_length: int = 120,
+ **kwargs,
):
r"""
Encodes the prompt into text encoder hidden states.
@@ -157,15 +192,17 @@ def encode_prompt(
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
string.
- clean_caption (bool, defaults to `False`):
+ clean_caption (`bool`, defaults to `False`):
If `True`, the function will preprocess and clean the provided caption before encoding.
- mask_feature: (bool, defaults to `True`):
- If `True`, the function will mask the text embeddings.
+ max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
"""
- embeds_initially_provided = prompt_embeds is not None and negative_prompt_embeds is not None
+
+ if "mask_feature" in kwargs:
+        deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation, so removing it does not affect the end results. It will be removed in a future version."
+ deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
if device is None:
- device = self.text_encoder.device or self._execution_device
+ device = getattr(self, '_execution_device', None) or getattr(self, 'device', None) or torch.device('cuda')
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -175,7 +212,7 @@ def encode_prompt(
batch_size = prompt_embeds.shape[0]
# See Section 3.1. of the paper.
- max_length = 300
+ max_length = max_sequence_length
if prompt_embeds is None:
prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
@@ -184,7 +221,6 @@ def encode_prompt(
padding="max_length",
max_length=max_length,
truncation=True,
- return_attention_mask=True,
add_special_tokens=True,
return_tensors="pt",
)
@@ -192,21 +228,19 @@ def encode_prompt(
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
- text_input_ids, untruncated_ids
+ text_input_ids, untruncated_ids
):
- removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1: -1])
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
logger.warning(
- "The following part of your input was truncated because the model can only handle sequences up to"
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
f" {max_length} tokens: {removed_text}"
)
- attention_mask = text_inputs.attention_mask.to(device)
- prompt_embeds_attention_mask = attention_mask
+ prompt_attention_mask = text_inputs.attention_mask
+ prompt_attention_mask = prompt_attention_mask.to(device)
- prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
prompt_embeds = prompt_embeds[0]
- else:
- prompt_embeds_attention_mask = torch.ones_like(prompt_embeds)
if self.text_encoder is not None:
dtype = self.text_encoder.dtype
@@ -221,8 +255,8 @@ def encode_prompt(
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
- prompt_embeds_attention_mask = prompt_embeds_attention_mask.view(bs_embed, -1)
- prompt_embeds_attention_mask = prompt_embeds_attention_mask.repeat(num_images_per_prompt, 1)
+ prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+ prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -238,11 +272,11 @@ def encode_prompt(
add_special_tokens=True,
return_tensors="pt",
)
- attention_mask = uncond_input.attention_mask.to(device)
+ negative_prompt_attention_mask = uncond_input.attention_mask
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
negative_prompt_embeds = self.text_encoder(
- uncond_input.input_ids.to(device),
- attention_mask=attention_mask,
+ uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
)
negative_prompt_embeds = negative_prompt_embeds[0]
@@ -255,36 +289,13 @@ def encode_prompt(
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
- # For classifier free guidance, we need to do two forward passes.
- # Here we concatenate the unconditional and text embeddings into a single batch
- # to avoid doing two forward passes
+ negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+ negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
else:
negative_prompt_embeds = None
+ negative_prompt_attention_mask = None
- # print(prompt_embeds.shape) # 1 120 4096
- # print(negative_prompt_embeds.shape) # 1 120 4096
-
- # Perform additional masking.
- if mask_feature and not embeds_initially_provided:
- prompt_embeds = prompt_embeds.unsqueeze(1)
- masked_prompt_embeds, keep_indices = self.mask_text_embeddings(prompt_embeds, prompt_embeds_attention_mask)
- masked_prompt_embeds = masked_prompt_embeds.squeeze(1)
- masked_negative_prompt_embeds = (
- negative_prompt_embeds[:, :keep_indices, :] if negative_prompt_embeds is not None else None
- )
-
- # import torch.nn.functional as F
-
-        # padding = (0, 0, 0, 113) # (left, right, bottom, top)
- # masked_prompt_embeds_ = F.pad(masked_prompt_embeds, padding, "constant", 0)
- # masked_negative_prompt_embeds_ = F.pad(masked_negative_prompt_embeds, padding, "constant", 0)
-
- # print(masked_prompt_embeds == masked_prompt_embeds_[:, :masked_negative_prompt_embeds.shape[1], ...])
-
- return masked_prompt_embeds, masked_negative_prompt_embeds
- # return masked_prompt_embeds_, masked_negative_prompt_embeds_
-
- return prompt_embeds, negative_prompt_embeds
+ return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
@@ -304,21 +315,27 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
def check_inputs(
- self,
- prompt,
- height,
- width,
- negative_prompt,
- callback_steps,
- prompt_embeds=None,
- negative_prompt_embeds=None,
+ self,
+ prompt,
+ num_frames,
+ height,
+ width,
+ negative_prompt,
+ callback_steps,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ prompt_attention_mask=None,
+ negative_prompt_attention_mask=None,
):
+ if num_frames <= 0:
+ raise ValueError(f"`num_frames` have to be positive but is {num_frames}.")
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if (callback_steps is None) or (
- callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
@@ -349,6 +366,12 @@ def check_inputs(
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
+ if prompt_embeds is not None and prompt_attention_mask is None:
+ raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+ if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+ raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
@@ -356,17 +379,23 @@ def check_inputs(
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
f" {negative_prompt_embeds.shape}."
)
+ if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+ raise ValueError(
+ "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+ f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+ f" {negative_prompt_attention_mask.shape}."
+ )
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
def _text_preprocessing(self, text, clean_caption=False):
if clean_caption and not is_bs4_available():
- logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
- logger.warn("Setting `clean_caption` to False...")
+ logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+ logger.warning("Setting `clean_caption` to False...")
clean_caption = False
if clean_caption and not is_ftfy_available():
- logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
- logger.warn("Setting `clean_caption` to False...")
+ logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+ logger.warning("Setting `clean_caption` to False...")
clean_caption = False
if not isinstance(text, (tuple, list)):
@@ -390,14 +419,12 @@ def _clean_caption(self, caption):
        caption = re.sub("<person>", "person", caption)
# urls:
caption = re.sub(
- r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
- # noqa
+ r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
caption,
) # regex for urls
caption = re.sub(
- r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
- # noqa
+ r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
"",
caption,
) # regex for urls
@@ -420,13 +447,12 @@ def _clean_caption(self, caption):
caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
- caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+ # caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
#######################################################
# все виды тире / all types of dash --> "-"
caption = re.sub(
- r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",
- # noqa
+ r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
"-",
caption,
)
@@ -497,14 +523,17 @@ def _clean_caption(self, caption):
caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
caption = re.sub(r"^\.\S+$", "", caption)
-
return caption.strip()
-
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
- def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator,
- latents=None):
+ def prepare_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None):
shape = (
- batch_size, num_channels_latents, video_length, self.vae.latent_size[0], self.vae.latent_size[1])
+ batch_size,
+ num_channels_latents,
+            (math.ceil((int(num_frames) - 1) / self.vae.vae_scale_factor[0]) + 1)
+            if int(num_frames) % 2 == 1
+            else math.ceil(int(num_frames) / self.vae.vae_scale_factor[0]),
+ math.ceil(int(height) / self.vae.vae_scale_factor[1]),
+ math.ceil(int(width) / self.vae.vae_scale_factor[2]),
+ )
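+        # Worked example (a sketch assuming vae_scale_factor == (4, 8, 8)):
+        # num_frames=29 (odd)  -> ceil(28 / 4) + 1 = 8 latent frames;
+        # num_frames=32 (even) -> ceil(32 / 4)     = 8 latent frames;
+        # height=480, width=640 -> a 60 x 80 latent grid.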
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -518,34 +547,39 @@ def prepare_latents(self, batch_size, num_channels_latents, video_length, height
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
+
return latents
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
- self,
- prompt: Union[str, List[str]] = None,
- negative_prompt: str = "",
- num_inference_steps: int = 20,
- timesteps: List[int] = None,
- guidance_scale: float = 4.5,
- num_images_per_prompt: Optional[int] = 1,
- video_length: Optional[int] = None,
- height: Optional[int] = None,
- width: Optional[int] = None,
- eta: float = 0.0,
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- output_type: Optional[str] = "pil",
- return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
- callback_steps: int = 1,
- clean_caption: bool = True,
- mask_feature: bool = True,
- enable_temporal_attentions: bool = True,
- ) -> Union[VideoPipelineOutput, Tuple]:
+ self,
+ prompt: Union[str, List[str]] = None,
+ negative_prompt: str = "",
+ num_inference_steps: int = 20,
+ timesteps: List[int] = None,
+ guidance_scale: float = 4.5,
+ num_images_per_prompt: Optional[int] = 1,
+ num_frames: Optional[int] = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: int = 1,
+ clean_caption: bool = True,
+ use_resolution_binning: bool = True,
+ max_sequence_length: int = 300,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
"""
Function invoked when calling the pipeline for generation.
@@ -563,7 +597,7 @@ def __call__(
timesteps (`List[int]`, *optional*):
Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
timesteps are used. Must be in descending order.
- guidance_scale (`float`, *optional*, defaults to 7.0):
+ guidance_scale (`float`, *optional*, defaults to 4.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -588,9 +622,12 @@ def __call__(
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
+ prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
+ Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+ negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+ Pre-generated attention mask for negative text embeddings.
output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -606,7 +643,11 @@ def __call__(
Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
be installed. If the dependencies are not installed, the embeddings will be created from the raw
prompt.
- mask_feature (`bool` defaults to `True`): If set to `True`, the text embeddings will be masked.
+ use_resolution_binning (`bool` defaults to `True`):
+ If set to `True`, the requested height and width are first mapped to the closest resolutions using
+ `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+ the requested resolution. Useful for generating non-square images.
+            max_sequence_length (`int` defaults to 300): Maximum sequence length to use with the `prompt`.
Examples:
@@ -616,10 +657,20 @@ def __call__(
returned where the first element is a list with the generated images
"""
# 1. Check inputs. Raise error if not correct
- # height = height or self.transformer.config.sample_size * self.vae_scale_factor
- # width = width or self.transformer.config.sample_size * self.vae_scale_factor
+ num_frames = num_frames or self.transformer.config.sample_size_t * self.vae.vae_scale_factor[0]
+ height = height or self.transformer.config.sample_size[0] * self.vae.vae_scale_factor[1]
+ width = width or self.transformer.config.sample_size[1] * self.vae.vae_scale_factor[2]
self.check_inputs(
- prompt, height, width, negative_prompt, callback_steps, prompt_embeds, negative_prompt_embeds
+ prompt,
+ num_frames,
+ height,
+ width,
+ negative_prompt,
+ callback_steps,
+ prompt_embeds,
+ negative_prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_attention_mask,
)
# 2. Default height and width to transformer
@@ -629,8 +680,8 @@ def __call__(
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
-
- device = self.text_encoder.device or self._execution_device
+ device = getattr(self, '_execution_device', None) or getattr(self, 'device', None) or torch.device('cuda')
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -638,7 +689,12 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompt
- prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+ (
+ prompt_embeds,
+ prompt_attention_mask,
+ negative_prompt_embeds,
+ negative_prompt_attention_mask,
+ ) = self.encode_prompt(
prompt,
do_classifier_free_guidance,
negative_prompt=negative_prompt,
@@ -646,22 +702,25 @@ def __call__(
device=device,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
+ prompt_attention_mask=prompt_attention_mask,
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
clean_caption=clean_caption,
- mask_feature=mask_feature,
+ max_sequence_length=max_sequence_length,
)
if do_classifier_free_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
# 4. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, device=device)
- timesteps = self.scheduler.timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
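+        # retrieve_timesteps is the diffusers helper that forwards custom `timesteps`
+        # to schedulers whose set_timesteps supports them (and raises otherwise),
+        # returning both the timestep schedule and the effective step count.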
# 5. Prepare latents.
latent_channels = self.transformer.config.in_channels
+ world_size = hccl_info.world_size if torch_npu is not None else nccl_info.world_size
latents = self.prepare_latents(
batch_size * num_images_per_prompt,
latent_channels,
- video_length,
+ (num_frames + world_size - 1) // world_size if get_sequence_parallel_state() else num_frames,
height,
width,
prompt_embeds.dtype,
@@ -669,18 +728,17 @@ def __call__(
generator,
latents,
)
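+        # Under sequence parallelism each rank denoises a temporal shard of the
+        # latent video; the ceil division above keeps shards equal-sized, e.g.
+        # num_frames=29, world_size=8 -> (29 + 7) // 8 = 4 frames per rank
+        # (illustrative numbers only).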
+ if get_sequence_parallel_state():
+ prompt_embeds = rearrange(prompt_embeds, 'b (n x) h -> b n x h', n=world_size,
+ x=prompt_embeds.shape[1] // world_size).contiguous()
+ rank = hccl_info.rank if torch_npu is not None else nccl_info.rank
+ prompt_embeds = prompt_embeds[:, rank, :, :]
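+            # The text sequence is sharded the same way: 'b (n x) h -> b n x h'
+            # splits the token axis into world_size contiguous chunks and each rank
+            # keeps chunk [rank], so the sequence length must divide evenly by
+            # world_size here.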
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 6.1 Prepare micro-conditions.
added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
- # if self.transformer.config.sample_size == 128:
- # resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
- # aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
- # resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
- # aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
- # added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
# 7. Denoising loop
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
@@ -705,13 +763,24 @@ def __call__(
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
current_timestep = current_timestep.expand(latent_model_input.shape[0])
+ if prompt_embeds.ndim == 3:
+ prompt_embeds = prompt_embeds.unsqueeze(1) # b l d -> b 1 l d
+ if prompt_attention_mask.ndim == 2:
+ prompt_attention_mask = prompt_attention_mask.unsqueeze(1) # b l -> b 1 l
+ # prepare attention_mask.
+ # b c t h w -> b t h w
+ attention_mask = torch.ones_like(latent_model_input)[:, 0]
+ if get_sequence_parallel_state():
+ attention_mask = attention_mask.repeat(1, world_size, 1, 1)
# predict noise model_output
noise_pred = self.transformer(
latent_model_input,
+ attention_mask=attention_mask,
encoder_hidden_states=prompt_embeds,
+ encoder_attention_mask=prompt_attention_mask,
timestep=current_timestep,
added_cond_kwargs=added_cond_kwargs,
- enable_temporal_attentions=enable_temporal_attentions,
return_dict=False,
)[0]
@@ -728,32 +797,45 @@ def __call__(
# compute previous image: x_t -> x_t-1
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
+
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
step_idx = i // getattr(self.scheduler, "order", 1)
callback(step_idx, t, latents)
-
- if not output_type == 'latents':
- video = self.decode_latents(latents)
+ world_size = hccl_info.world_size if torch_npu is not None else nccl_info.world_size
+ if get_sequence_parallel_state():
+ latents_shape = list(latents.shape)
+ full_shape = [latents_shape[0] * world_size] + latents_shape[1:]
+ all_latents = torch.zeros(full_shape, dtype=latents.dtype, device=latents.device)
+ torch.distributed.all_gather_into_tensor(all_latents, latents)
+ latents_list = list(all_latents.chunk(world_size, dim=0))
+ latents = torch.cat(latents_list, dim=2)
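+            # all_gather_into_tensor stacks the per-rank shards along dim 0 in rank
+            # order; chunk(world_size, dim=0) recovers them and cat(..., dim=2)
+            # stitches the temporal shards back into the full-length latent video.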
+
+ if not output_type == "latent":
+ # b t h w c
+ image = self.decode_latents(latents)
+ image = image[:, :num_frames, :height, :width]
else:
- video = latents
- return VideoPipelineOutput(video=video)
+ image = latents
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
- return (video,)
-
- return VideoPipelineOutput(video=video)
+ return (image,)
+ return ImagePipelineOutput(images=image)
+
def decode_latents(self, latents):
- video = self.vae.decode(latents)
- # video = self.vae.decode(latents / 0.18215)
- # video = rearrange(video, 'b c t h w -> b t c h w').contiguous()
- video = ((video / 2.0 + 0.5).clamp(0, 1) * 255).to(dtype=torch.uint8).cpu().permute(0, 1, 3, 4, 2).contiguous()
+        video = self.vae.decode(latents.to(self.vae.vae.dtype))
+        video = ((video / 2.0 + 0.5).clamp(0, 1) * 255).to(dtype=torch.uint8).cpu().permute(0, 1, 3, 4, 2).contiguous() # b t h w c
+        # map decoded samples from [-1, 1] to uint8 [0, 255] (-1 -> 0, +1 -> 255) and move channels last
return video
diff --git a/opensora/sample/sample_inpaint_ddp.py b/opensora/sample/sample_inpaint_ddp.py
new file mode 100644
index 000000000..19c28d8c2
--- /dev/null
+++ b/opensora/sample/sample_inpaint_ddp.py
@@ -0,0 +1,457 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, MT5EncoderModel
+
+import os, sys
+import cv2
+
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V as SparseOpenSoraT2V
+# from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+# from opensora.models.captioner.refiner import model_gen
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.models.diffusion.opensora2.modeling_inpaint import OpenSoraInpaint
+from opensora.sample.pipeline_inpaint import OpenSoraInpaintPipeline
+from opensora.dataset.transform import ToTensorVideo, TemporalRandomCrop, RandomHorizontalFlipVideo, CenterCropResizeVideo, LongSideResizeVideo, SpatialStrideCropVideo, NormalizeVideo, ToTensorAfterResize
+from opensora.utils.dataset_utils import DecordInit
+
+from PIL import Image
+import numpy as np
+from torchvision import transforms
+from torchvision.transforms import Lambda
+import imageio
+import re
+from einops import rearrange
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+    torch_npu = None
+    npu_config = None
+import time
+
+
+
+def load_t2v_checkpoint(model_path):
+ transformer_model = OpenSoraInpaint.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+
+ # set eval mode
+ transformer_model.eval()
+ pipeline = OpenSoraInpaintPipeline(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model
+ ).to(device)
+
+ return pipeline
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
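+
+# Behaviour sketch (hypothetical listing): for ["checkpoint-100", "checkpoint-900",
+# "checkpoint-1000"] the numeric sort on the integer suffix returns
+# "checkpoint-1000", whereas a plain lexicographic sort would pick "checkpoint-900".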
+
+def get_video(video_path):
+ cap = cv2.VideoCapture(video_path)
+ frames = []
+ while True:
+ ret, frame = cap.read()
+ if not ret:
+            break # end of the video stream
+
+        # frame is an (H, W, C) numpy array holding a single frame
+ frames.append(frame)
+
+    # release the video capture handle
+ cap.release()
+
+    # convert the list of frames into a numpy array of shape (T, H, W, C)
+ video_numpy = np.array(frames)
+
+ return video_numpy
+
+def is_image_file(filepath):
+ print(filepath)
+ return re.search(r'\.(jpg|jpeg|png|gif|bmp|tiff|webp|svg)$', filepath, re.IGNORECASE) is not None
+
+def is_video_file(filepath):
+ print(filepath)
+ return re.search(r'\.(mp4|avi|mkv|mov|wmv|flv|webm|mpeg|mpg|3gp)$', filepath, re.IGNORECASE) is not None
+
+def run_model_and_save_images(pipeline, model_path):
+ v_decoder = DecordInit()
+
+ norm_fun = Lambda(lambda x: 2. * x - 1.)
+ resize = [CenterCropResizeVideo((args.height, args.width)), ]
+
+ transform = transforms.Compose([
+ ToTensorAfterResize(),
+ norm_fun
+ ])
+
+ resize_transform = transforms.Compose([*resize])
+
+ pipeline.register_image_transforms(transform)
+
+ def preprocess_pixel_values(pixel_values_path, frame_interval=1, min_clear_ratio=0.6):
+ if isinstance(pixel_values_path, list) and is_image_file(pixel_values_path[0]):
+ if len(pixel_values_path) == 1:
+ conditional_images_indices = [0]
+ elif len(pixel_values_path) == 2:
+ conditional_images_indices = [0, 1]
+ else:
+ raise ValueError("The number of images should be 1 or 2.")
+ conditional_images = [Image.open(image).convert("RGB") for image in pixel_values_path]
+ conditional_images = [torch.from_numpy(np.copy(np.array(image))) for image in conditional_images]
+ conditional_images = [rearrange(image, 'h w c -> c h w').unsqueeze(0) for image in conditional_images]
+ conditional_images = [resize_transform(image) for image in conditional_images]
+ pixel_values = conditional_images
+ elif is_video_file(pixel_values_path[0]):
+ decord_vr = v_decoder(pixel_values_path[0])
+ end_idx = min(len(decord_vr), args.num_frames)
+ frame_indices = np.arange(0, end_idx, frame_interval).astype(int)
+ pixel_values = decord_vr.get_batch(frame_indices).asnumpy()
+ pixel_values = torch.from_numpy(pixel_values)
+ pixel_values = pixel_values.permute(0, 3, 1, 2) # (T, H, W, C) -> (T C H W)
+ pixel_values = resize_transform(pixel_values)
+ conditional_images_indices = list(range(int(min_clear_ratio * pixel_values.shape[0]))) # v2v
+ pixel_values = pixel_values[conditional_images_indices]
+ else:
+ raise ValueError("The input file should be an image or a video.")
+
+ return dict(conditional_images=pixel_values, conditional_images_indices=conditional_images_indices)
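+
+    # Reading of the branches above (not a documented contract): one image
+    # conditions frame 0, two images condition frames 0 and 1, and a video clip
+    # conditions the leading min_clear_ratio fraction of its frames (the v2v case).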
+
+
+ if not isinstance(args.text_prompt, list):
+ args.text_prompt = [args.text_prompt]
+ if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+ text_prompt = open(args.text_prompt[0], 'r').readlines()
+ args.text_prompt = [i.strip() for i in text_prompt]
+
+ if not isinstance(args.conditional_images_path, list):
+ args.conditional_images_path = [args.conditional_images_path]
+ if len(args.conditional_images_path) == 1 and args.conditional_images_path[0].endswith('txt'):
+ temp = open(args.conditional_images_path[0], 'r').readlines()
+ conditional_images = [i.strip().split(',') for i in temp]
+
+    assert len(args.text_prompt) % world_size == 0, "The sample num must be a multiple of the world size; otherwise, it may cause an all_gather error."
+
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = """
+ masterpiece, high quality, ultra-detailed,
+ {}.
+ emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
+ sharp focus, high budget, cinemascope, moody, epic, gorgeous
+ """
+
+ negative_prompt = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
+
+    text_prompt = ["A man is riding a horse"]
+ video_paths = ["/home/image_data/hxy/data/video/000184_cut.mp4"]
+ mask_paths = ["/home/image_data/hxy/data/video/000001_bbox_cut.mp4"]
+
+ video_grids = []
+ for index, (prompt, video_path, mask_path) in enumerate(zip(text_prompt, video_paths, mask_paths)):
+ if index % world_size != local_rank:
+ continue
+
+ video = get_video(video_path)
+ mask = get_video(mask_path)
+
+ video_tensor = resize_transform(torch.from_numpy(video.transpose(0,3,1,2)))
+ mask_tensor = resize_transform(torch.from_numpy(mask.transpose(0,3,1,2)))
+
+ video_resize = video_tensor.numpy()
+ mask_resize = mask_tensor.numpy()
+
+
+ if args.refine_caption:
+ q = f'Translate this brief generation prompt into a detailed caption: {prompt}'
+ query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
+ with torch.cuda.amp.autocast():
+ refine_prompt = model_gen(refiner, query, None)
+            refine_prompt = refine_prompt.replace('<|im_end|>', '').replace('</s>', '')
+ input_prompt = positive_prompt.format(refine_prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'refine_prompt ({refine_prompt})\n input_prompt ({input_prompt})\n device ({device})')
+ else:
+ input_prompt = positive_prompt.format(prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'input_prompt ({input_prompt})\n device ({device})')
+ videos = pipeline(
+ video = video_resize,
+ masks = mask_resize,
+ prompt=input_prompt,
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ motion_score=args.motion_score,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=args.max_sequence_length,
+ ).images
+ print('videos.shape', videos.shape)
+ try:
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f'{model_path}', f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}.{ext}'),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+ print('save done...')
+
+ else:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{model_path}', f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}.{ext}'
+ ), videos[0],
+ fps=args.fps, quality=6) # highest quality is 10, lowest is 0
+ print('save done...')
+        except Exception as e:
+            print('Error when saving {}: {}'.format(prompt, e))
+ video_grids.append(videos)
+ dist.barrier()
+ video_grids = torch.cat(video_grids, dim=0).cuda()
+ shape = list(video_grids.shape)
+ shape[0] *= world_size
+ gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ video_grids = gathered_tensor.cpu()
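+    # Note: all_gather_into_tensor orders the gathered samples by rank, which does
+    # not match the original prompt order when ranks took interleaved indices
+    # (index % world_size == local_rank); that is acceptable for a sample grid.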
+
+ # video_grids = video_grids.repeat(world_size, 1, 1, 1)
+ # output = torch.zeros(video_grids.shape, dtype=video_grids.dtype, device=device)
+ # dist.all_to_all_single(output, video_grids)
+ # video_grids = output.cpu()
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}_{checkpoint_name}.{ext}')
+
+ if local_rank == 0:
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=6)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--max_sequence_length", type=int, default=512)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--refine_caption', action='store_true')
+ parser.add_argument('--compile', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--motion_score', type=float, default=None)
+ parser.add_argument("--prediction_type", type=str, default='epsilon', help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.")
+ parser.add_argument('--rescale_betas_zero_snr', action='store_true')
+
+ parser.add_argument('--conditional_images_path', nargs='+')
+ parser.add_argument('--force_resolution', action='store_true')
+ parser.add_argument('--video_path', type=str)
+ parser.add_argument('--mask_path', type=str)
+
+
+ args = parser.parse_args()
+
+ if torch_npu is not None:
+ npu_config.print_msg(args)
+
+    # initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ if torch_npu is not None and npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+ else:
+ torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend='hccl' if torch_npu is not None and npu_config.on_npu else 'nccl', init_method='env://', world_size=world_size, rank=local_rank)
+
+ torch.manual_seed(args.seed)
+ weight_dtype = torch.float16
+ device = torch.cuda.current_device()
+ vae = ae_wrapper[args.ae](args.ae_path)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
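+    # e.g. 'CausalVAEModel_4x8x8' presumably maps to a (4, 8, 8) stride, i.e. 4x
+    # temporal and 8x8 spatial compression between pixel and latent space.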
+
+ text_encoder = MT5EncoderModel.from_pretrained("/home/image_data/mt5-xxl",
+ cache_dir=args.cache_dir, low_cpu_mem_usage=True,
+ torch_dtype=weight_dtype).to(device)
+ tokenizer = AutoTokenizer.from_pretrained("/home/image_data/mt5-xxl",
+ cache_dir=args.cache_dir)
+ # text_encoder = T5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+ # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir)
+
+ # text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+ # low_cpu_mem_usage=True, torch_dtype=weight_dtype).to(device)
+ # tokenizer = T5Tokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+ if args.refine_caption:
+ from transformers import AutoModel, AutoTokenizer
+ new_path = '/storage/zhubin/ShareGPT4Video/sharegpt4video/sharecaptioner_v1'
+ refiner_tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
+ refiner = AutoModel.from_pretrained(new_path, torch_dtype=weight_dtype, trust_remote_code=True).eval()
+ refiner.to(device)
+ refiner.tokenizer = refiner_tokenizer
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+ if args.sample_method == 'DDIM': #########
+ scheduler = DDIMScheduler(clip_sample=False)
+ elif args.sample_method == 'EulerDiscrete':
+ scheduler = EulerDiscreteScheduler()
+ elif args.sample_method == 'DDPM': #############
+ scheduler = DDPMScheduler(clip_sample=False)
+ elif args.sample_method == 'DPMSolverMultistep':
+ scheduler = DPMSolverMultistepScheduler()
+ elif args.sample_method == 'DPMSolverSinglestep':
+ scheduler = DPMSolverSinglestepScheduler()
+ elif args.sample_method == 'PNDM':
+ scheduler = PNDMScheduler()
+ elif args.sample_method == 'HeunDiscrete': ########
+ scheduler = HeunDiscreteScheduler()
+ elif args.sample_method == 'EulerAncestralDiscrete':
+ scheduler = EulerAncestralDiscreteScheduler(prediction_type=args.prediction_type, rescale_betas_zero_snr=args.rescale_betas_zero_snr, timestep_spacing="trailing")
+ elif args.sample_method == 'DEISMultistep':
+ scheduler = DEISMultistepScheduler()
+    elif args.sample_method == 'KDPM2AncestralDiscrete': #########
+        scheduler = KDPM2AncestralDiscreteScheduler()
+    else:
+        raise ValueError(f"Unknown sample_method: {args.sample_method}")
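+    # A table-driven sketch of the same dispatch (hypothetical refactor, not part
+    # of this patch) for the default-constructed cases:
+    #   SCHEDULERS = {'PNDM': PNDMScheduler, 'EulerDiscrete': EulerDiscreteScheduler}
+    #   scheduler = SCHEDULERS[args.sample_method]()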
+
+ if not os.path.exists(args.save_img_path):
+ os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+ latest_path = None
+ save_img_path = args.save_img_path
+ # while True:
+ # cur_path = get_latest_path()
+ # # print(cur_path, latest_path)
+ # if cur_path == latest_path:
+ # time.sleep(5)
+ # continue
+
+ # time.sleep(1)
+ # latest_path = cur_path
+ # os.makedirs(os.path.join(args.save_img_path, latest_path), exist_ok=True)
+ # if npu_config is not None:
+ # npu_config.print_msg(f"The latest_path is {latest_path}")
+ # else:
+ # print(f"The latest_path is {latest_path}")
+ if latest_path is None:
+ latest_path = ''
+
+ full_path = f"{args.model_path}"
+ # full_path = f"{args.model_path}/{latest_path}/model_ema"
+ # full_path = f"{args.model_path}/{latest_path}/model"
+    try:
+        pipeline = load_t2v_checkpoint(full_path)
+    except Exception:
+        # the checkpoint may still be mid-write; wait and retry once
+        time.sleep(100)
+        pipeline = load_t2v_checkpoint(full_path)
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = "/home/image_data/npu_profiling_t2v"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=10000, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ run_model_and_save_images(pipeline, latest_path)
+ prof.step()
+ else:
+ run_model_and_save_images(pipeline, latest_path)
diff --git a/opensora/sample/sample_inpaint_ddp_on_npu.py b/opensora/sample/sample_inpaint_ddp_on_npu.py
new file mode 100644
index 000000000..c3bee9e9d
--- /dev/null
+++ b/opensora/sample/sample_inpaint_ddp_on_npu.py
@@ -0,0 +1,367 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, MT5EncoderModel, T5Tokenizer, AutoTokenizer
+
+import os, sys
+
+from opensora.adaptor.modules import replace_with_fp32_forwards
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+# from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+# from opensora.models.diffusion.udit_ultra.modeling_udit_ultra import UDiTUltraT2V
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.models.diffusion.opensora2.modeling_inpaint import OpenSoraInpaint
+from opensora.sample.pipeline_inpaint import OpenSoraInpaintPipeline
+from opensora.dataset.transform import ToTensorVideo, TemporalRandomCrop, RandomHorizontalFlipVideo, CenterCropResizeVideo, LongSideResizeVideo, SpatialStrideCropVideo, NormalizeVideo, ToTensorAfterResize
+from opensora.utils.dataset_utils import DecordInit
+
+import imageio
+
+try:
+ import torch_npu
+except ImportError:
+ pass
+import time
+from opensora.npu_config import npu_config
+
+from PIL import Image
+import numpy as np
+from torchvision import transforms
+from torchvision.transforms import Lambda
+import imageio
+import re
+from einops import rearrange
+
+import time
+
+
+def load_t2v_checkpoint(model_path):
+ transformer_model = OpenSoraInpaint.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+
+ # set eval mode
+ transformer_model.eval()
+ pipeline = OpenSoraInpaintPipeline(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model
+ ).to(device)
+
+ return pipeline
+
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
+
+def is_image_file(filepath):
+ print(filepath)
+ return re.search(r'\.(jpg|jpeg|png|gif|bmp|tiff|webp|svg)$', filepath, re.IGNORECASE) is not None
+
+def is_video_file(filepath):
+ print(filepath)
+ return re.search(r'\.(mp4|avi|mkv|mov|wmv|flv|webm|mpeg|mpg|3gp)$', filepath, re.IGNORECASE) is not None
+
+
+def run_model_and_save_images(pipeline, model_path):
+ v_decoder = DecordInit()
+
+ norm_fun = Lambda(lambda x: 2. * x - 1.)
+ resize = [CenterCropResizeVideo((args.height, args.width)), ]
+
+ transform = transforms.Compose([
+ ToTensorAfterResize(),
+ norm_fun
+ ])
+
+ resize_transform = transforms.Compose([*resize])
+
+ pipeline.register_image_transforms(transform)
+
+ def preprocess_pixel_values(pixel_values_path, frame_interval=1, min_clear_ratio=0.8):
+ if isinstance(pixel_values_path, list) and is_image_file(pixel_values_path[0]):
+ if len(pixel_values_path) == 1:
+ conditional_images_indices = [0]
+ elif len(pixel_values_path) == 2:
+ conditional_images_indices = [0, 1]
+ else:
+ raise ValueError("The number of images should be 1 or 2.")
+ conditional_images = [Image.open(image).convert("RGB") for image in pixel_values_path]
+ conditional_images = [torch.from_numpy(np.copy(np.array(image))) for image in conditional_images]
+ conditional_images = [rearrange(image, 'h w c -> c h w').unsqueeze(0) for image in conditional_images]
+ conditional_images = [resize_transform(image) for image in conditional_images]
+ pixel_values = conditional_images
+ elif is_video_file(pixel_values_path[0]):
+ decord_vr = v_decoder(pixel_values_path[0])
+ end_idx = min(len(decord_vr), args.num_frames)
+ frame_indices = np.arange(0, end_idx, frame_interval).astype(int)
+ pixel_values = decord_vr.get_batch(frame_indices).asnumpy()
+ pixel_values = torch.from_numpy(pixel_values)
+ pixel_values = pixel_values.permute(0, 3, 1, 2) # (T, H, W, C) -> (T C H W)
+ pixel_values = resize_transform(pixel_values)
+ conditional_images_indices = list(range(int(min_clear_ratio * pixel_values.shape[0]))) # v2v
+ pixel_values = pixel_values[conditional_images_indices]
+ else:
+ raise ValueError("The input file should be an image or a video.")
+
+ return dict(conditional_images=pixel_values, conditional_images_indices=conditional_images_indices)
+
+ if not isinstance(args.text_prompt, list):
+ args.text_prompt = [args.text_prompt]
+    if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+        text_prompt = open(args.text_prompt[0], 'r').readlines()
+        args.text_prompt = [i.strip() for i in text_prompt]
+    text_prompt = args.text_prompt  # ensure text_prompt is defined even without a .txt prompt list
+
+ if not isinstance(args.conditional_images_path, list):
+ args.conditional_images_path = [args.conditional_images_path]
+ if len(args.conditional_images_path) == 1 and args.conditional_images_path[0].endswith('txt'):
+ temp = open(args.conditional_images_path[0], 'r').readlines()
+        conditional_images = [i.strip().split(',') for i in temp]
+    else:
+        conditional_images = [i.split(',') for i in args.conditional_images_path]  # mirror the txt format: comma-separated image paths per prompt
+
+ assert len(text_prompt) % world_size == 0, "The sample num must be a multiple of the world size; otherwise, it may cause an all_gather error."
+
+
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = "(masterpiece), (best quality), (ultra-detailed), {}. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous"
+ negative_prompt = "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
+ video_grids = []
+ for index, (prompt, images) in enumerate(zip(text_prompt, conditional_images)):
+ if index % npu_config.N_NPU_PER_NODE != local_rank:
+ continue
+ print('Processing the ({}) prompt'.format(prompt))
+
+ pre_results = preprocess_pixel_values(images)
+ cond_imgs = pre_results['conditional_images']
+ cond_imgs_indices = pre_results['conditional_images_indices']
+
+ videos = pipeline(
+ conditional_images=cond_imgs,
+ conditional_images_indices=cond_imgs_indices,
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ motion_score=args.motion_score,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=512,
+ ).images
+ print(videos.shape)
+ try:
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+
+ else:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+                        f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'
+ ), videos[0],
+ fps=args.fps, quality=9, codec='libx264',
+ output_params=['-threads', '20']) # highest quality is 10, lowest is 0
+        except Exception as e:
+            print('Error when saving {}: {}'.format(prompt, e))
+ video_grids.append(videos)
+
+ video_grids = torch.cat(video_grids, dim=0).cuda()
+ shape = list(video_grids.shape)
+ shape[0] *= world_size
+ gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ video_grids = gathered_tensor.cpu()
+
+ # video_grids = video_grids.repeat(world_size, 1, 1, 1)
+ # output = torch.zeros(video_grids.shape, dtype=video_grids.dtype, device=device)
+ # dist.all_to_all_single(output, video_grids)
+ # video_grids = output.cpu()
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_{checkpoint_name}.{ext}')
+
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=9)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--run_time", type=int, default=0)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+    parser.add_argument('--enable_tiling', action='store_true')
+    parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--motion_score', type=float, default=None)
+
+ parser.add_argument('--conditional_images_path', nargs='+')
+ parser.add_argument('--force_resolution', action='store_true')
+ args = parser.parse_args()
+
+ npu_config.print_msg(args)
+ npu_config.conv_dtype = torch.bfloat16
+ replace_with_fp32_forwards()
+
+    # initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ if npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+ dist.init_process_group(backend='hccl', init_method='env://', world_size=world_size, rank=local_rank)
+
+ # torch.manual_seed(args.seed)
+ weight_dtype = torch.float16
+ device = torch.cuda.current_device()
+
+ # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir=args.cache_dir)
+ vae = ae_wrapper[args.ae](args.ae_path)
+ print(args.ae)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ text_encoder = MT5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=True, torch_dtype=torch.float16).to(device)
+ tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+ if args.sample_method == 'DDIM': #########
+ scheduler = DDIMScheduler()
+ elif args.sample_method == 'EulerDiscrete':
+ scheduler = EulerDiscreteScheduler()
+ elif args.sample_method == 'DDPM': #############
+ scheduler = DDPMScheduler()
+ elif args.sample_method == 'DPMSolverMultistep':
+ scheduler = DPMSolverMultistepScheduler()
+ elif args.sample_method == 'DPMSolverSinglestep':
+ scheduler = DPMSolverSinglestepScheduler()
+ elif args.sample_method == 'PNDM':
+ scheduler = PNDMScheduler()
+ elif args.sample_method == 'HeunDiscrete': ########
+ scheduler = HeunDiscreteScheduler()
+ elif args.sample_method == 'EulerAncestralDiscrete':
+ scheduler = EulerAncestralDiscreteScheduler()
+ elif args.sample_method == 'DEISMultistep':
+ scheduler = DEISMultistepScheduler()
+    elif args.sample_method == 'KDPM2AncestralDiscrete': #########
+        scheduler = KDPM2AncestralDiscreteScheduler()
+    else:
+        raise ValueError(f"Unknown sample_method: {args.sample_method}")
+
+ if not os.path.exists(args.save_img_path):
+ os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+ latest_path = None
+ save_img_path = args.save_img_path
+ first_in = False
+ # while True:
+ # cur_path = get_latest_path()
+ # if cur_path == latest_path:
+ # time.sleep(60)
+ # continue
+
+ # if not first_in:
+ # first_in = True
+ # else:
+ # time.sleep(1200)
+
+ # latest_path = cur_path
+ if latest_path is None:
+ latest_path = ''
+
+ npu_config.print_msg(f"The latest_path is {latest_path}")
+ full_path = f"{args.model_path}"
+ # full_path = f"{args.model_path}/{latest_path}/model_ema"
+ # full_path = "/home/opensora/captions/240p_model_ema"
+ pipeline = load_t2v_checkpoint(full_path)
+
+ if npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = "/home/image_data/npu_profiling_t2v"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=10000, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ run_model_and_save_images(pipeline, latest_path)
+ prof.step()
+ else:
+ run_model_and_save_images(pipeline, latest_path)
diff --git a/opensora/sample/sample_t2v.py b/opensora/sample/sample_t2v.py
index bfb743b65..8da3d4894 100644
--- a/opensora/sample/sample_t2v.py
+++ b/opensora/sample/sample_t2v.py
@@ -9,65 +9,102 @@
HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
-from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
from omegaconf import OmegaConf
from torchvision.utils import save_image
-from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer
+from transformers import T5EncoderModel, MT5EncoderModel, UMT5EncoderModel, AutoTokenizer
import os, sys
-from opensora.models.ae import ae_stride_config, getae, getae_wrapper
-from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
-from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+from opensora.adaptor.modules import replace_with_fp32_forwards
+from opensora.models.causalvideovae import ae_stride_config, ae_channel_config, ae_norm, ae_denorm, CausalVAEModelWrapper
+
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V as SparseOpenSoraT2V
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+
from opensora.models.text_encoder import get_text_enc
from opensora.utils.utils import save_video_grid
-sys.path.append(os.path.split(sys.path[0])[0])
-from pipeline_videogen import VideoGenPipeline
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
import imageio
def main(args):
- # torch.manual_seed(args.seed)
- torch.set_grad_enabled(False)
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir='cache_dir').to(device, dtype=torch.float16)
- vae = getae_wrapper(args.ae)(args.ae_path).to(device, dtype=torch.float16)
+ torch.manual_seed(args.seed)
+ # torch.backends.cuda.matmul.allow_tf32 = False
+ weight_dtype = torch.bfloat16
+ device = torch.device(args.device)
+
+ # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir=args.cache_dir)
+ vae = CausalVAEModelWrapper(args.ae_path)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
if args.enable_tiling:
vae.vae.enable_tiling()
vae.vae.tile_overlap_factor = args.tile_overlap_factor
-
- # Load model:
- # transformer_model = LatteT2V.from_pretrained(args.model_path, subfolder=args.version, cache_dir=args.cache_dir, torch_dtype=torch.float16).to(device)
- transformer_model = LatteT2V.from_pretrained(args.model_path, low_cpu_mem_usage=False, device_map=None, torch_dtype=torch.float16).to(device)
- print(transformer_model.config)
- transformer_model.force_images = args.force_images
- tokenizer = T5Tokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
- text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir, torch_dtype=torch.float16).to(device)
-
- video_length, image_size = transformer_model.config.video_length, args.image_size
- latent_size = (image_size // ae_stride_config[args.ae][1], image_size // ae_stride_config[args.ae][2])
- vae.latent_size = latent_size
- if args.force_images:
- video_length = 1
- ext = 'jpg'
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ # if args.model_3d:
+ # transformer_model = OpenSoraT2V.from_pretrained(args.model_path, subfolder=args.version, cache_dir=args.cache_dir, low_cpu_mem_usage=False, device_map=None, torch_dtype=weight_dtype)
+ # else:
+ # transformer_model = LatteT2V.from_pretrained(args.model_path, subfolder=args.version, cache_dir=args.cache_dir, low_cpu_mem_usage=False, device_map=None, torch_dtype=weight_dtype)
+
+ if args.model_type == 'dit':
+ transformer_model = OpenSoraT2V.from_pretrained(args.model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None, torch_dtype=weight_dtype)
+ elif args.model_type == 'udit':
+ transformer_model = UDiTT2V.from_pretrained(args.model_path, cache_dir=args.cache_dir, ignore_mismatched_sizes=True,
+ low_cpu_mem_usage=False, device_map=None, torch_dtype=weight_dtype)
+ elif args.model_type == 'sparsedit':
+ transformer_model = SparseOpenSoraT2V.from_pretrained(args.model_path, cache_dir=args.cache_dir, ignore_mismatched_sizes=True,
+ low_cpu_mem_usage=False, device_map=None, torch_dtype=weight_dtype)
else:
- ext = 'mp4'
-
+        # LatteT2V is no longer imported in this script, so fail loudly instead of raising a NameError
+        raise NotImplementedError(f"model_type '{args.model_type}' is not supported by this script.")
+ # ckpt = torch.load('/storage/ongoing/new/image2video_weight/480p_73000_ema_ds_k3_p1_repeat_lowsize2.pt')
+ # transformer_model.load_state_dict(ckpt)
+ # text_encoder = T5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+ # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir)
+ text_encoder = MT5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl", cache_dir=args.cache_dir)
+
+
# set eval mode
transformer_model.eval()
vae.eval()
text_encoder.eval()
if args.sample_method == 'DDIM': #########
- scheduler = DDIMScheduler()
+ scheduler = DDIMScheduler(clip_sample=False)
elif args.sample_method == 'EulerDiscrete':
scheduler = EulerDiscreteScheduler()
elif args.sample_method == 'DDPM': #############
- scheduler = DDPMScheduler()
+ scheduler = DDPMScheduler(clip_sample=False)
elif args.sample_method == 'DPMSolverMultistep':
+ '''
+ DPM++ 2M DPMSolverMultistepScheduler
+ DPM++ 2M Karras DPMSolverMultistepScheduler init with use_karras_sigmas=True
+ DPM++ 2M SDE DPMSolverMultistepScheduler init with algorithm_type="sde-dpmsolver++"
+ DPM++ 2M SDE Karras DPMSolverMultistepScheduler init with use_karras_sigmas=True and algorithm_type="sde-dpmsolver++"
+
+ DPM++ SDE DPMSolverSinglestepScheduler
+ DPM++ SDE Karras DPMSolverSinglestepScheduler init with use_karras_sigmas=True
+ DPM2 KDPM2DiscreteScheduler
+ DPM2 Karras KDPM2DiscreteScheduler init with use_karras_sigmas=True
+ DPM2 a KDPM2AncestralDiscreteScheduler
+ DPM2 a Karras KDPM2AncestralDiscreteScheduler init with use_karras_sigmas=True
+ '''
+ # scheduler = DPMSolverMultistepScheduler(use_karras_sigmas=True)
scheduler = DPMSolverMultistepScheduler()
elif args.sample_method == 'DPMSolverSinglestep':
scheduler = DPMSolverSinglestepScheduler()
@@ -81,49 +118,88 @@ def main(args):
scheduler = DEISMultistepScheduler()
elif args.sample_method == 'KDPM2AncestralDiscrete': #########
scheduler = KDPM2AncestralDiscreteScheduler()
- print('videogen_pipeline', device)
- videogen_pipeline = VideoGenPipeline(vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- scheduler=scheduler,
- transformer=transformer_model).to(device=device)
- # videogen_pipeline.enable_xformers_memory_efficient_attention()
-
+ elif args.sample_method == 'EulerDiscreteSVD':
+ scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-video-diffusion-img2vid",
+ subfolder="scheduler", cache_dir=args.cache_dir)
+ pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model)
+ pipeline.to(device)
+ if args.compile:
+        # reportedly ~5% end-to-end speedup via onediff's nexfort backend: https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+ options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \
+ "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \
+ "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # options = '{"mode": "max-autotune", "memory_format": "channels_last", \
+ # "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}'
+ from onediffx import compile_pipe
+ pipeline = compile_pipe(
+ pipeline, backend="nexfort", options=options, fuse_qkv_projections=True
+ )
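+        # compile_pipe rewrites the pipeline's modules ahead of time; expect a slow
+        # first call while kernels compile, then cached kernels on later prompts.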
+
+        # simpler alternative (reportedly ~4%): plain torch.compile on the transformer
+ # pipeline.transformer = torch.compile(pipeline.transformer)
+
if not os.path.exists(args.save_img_path):
os.makedirs(args.save_img_path)
-
- video_grids = []
+
if not isinstance(args.text_prompt, list):
args.text_prompt = [args.text_prompt]
if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
text_prompt = open(args.text_prompt[0], 'r').readlines()
- args.text_prompt = [i.strip() for i in text_prompt]
- for prompt in args.text_prompt:
- print('Processing the ({}) prompt'.format(prompt))
- videos = videogen_pipeline(prompt,
- video_length=video_length,
- height=image_size,
- width=image_size,
- num_inference_steps=args.num_sampling_steps,
- guidance_scale=args.guidance_scale,
- enable_temporal_attentions=not args.force_images,
- num_images_per_prompt=1,
- mask_feature=True,
- ).video
+        text_prompt = [i.strip() for i in text_prompt]
+    else:
+        # prompts passed directly on the command line; without this branch the
+        # loop below would hit an undefined text_prompt
+        text_prompt = args.text_prompt
+
+ positive_prompt = """
+ (masterpiece), (best quality), (ultra-detailed),
+ {}.
+ """
+
+ negative_prompt = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
+
+ # positive_prompt = "{}"
+
+ # negative_prompt = None
+ # positive_prompt = """
+ # (masterpiece), (best quality), (ultra-detailed),
+ # {}.
+ # emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
+ # sharp focus, high budget, cinemascope, moody, epic, gorgeous
+ # """
+
+ # negative_prompt = """
+ # disfigured, poorly drawn face, longbody, lowres, bad anatomy, bad hands, missing fingers, cropped, worst quality, low quality
+ # """
+
+ video_grids = []
+ for idx, prompt in enumerate(text_prompt):
+ videos = pipeline(positive_prompt.format(prompt),
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ motion_score=args.motion_score,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=args.max_sequence_length,
+ ).images
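+        # the pipeline returns uint8 frames shaped (b, t, h, w, c); frame 0 is saved
+        # as a jpg for single-frame runs, otherwise videos[0] is written as mp4.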
try:
- if args.force_images:
+ if args.num_frames == 1:
+ ext = 'jpg'
videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
- save_image(videos / 255.0, os.path.join(args.save_img_path,
- prompt.replace(' ', '_')[:100] + f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'),
- nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path, f'{idx}.{ext}'), nrow=1, normalize=True, value_range=(0, 1)) # t c h w
else:
+ ext = 'mp4'
imageio.mimwrite(
- os.path.join(
- args.save_img_path,
- prompt.replace(' ', '_')[:100] + f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'
- ), videos[0],
- fps=args.fps, quality=9) # highest quality is 10, lowest is 0
+ os.path.join(args.save_img_path, f'{idx}.{ext}'), videos[0], fps=args.fps, quality=6) # highest quality is 10, lowest is 0
except:
print('Error when saving {}'.format(prompt))
video_grids.append(videos)
@@ -131,37 +207,43 @@ def main(args):
# torchvision.io.write_video(args.save_img_path + '_%04d' % args.run_time + '-.mp4', video_grids, fps=6)
- if args.force_images:
- save_image(video_grids / 255.0, os.path.join(args.save_img_path, f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'),
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, os.path.join(args.save_img_path, f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}{f"_{args.motion_score}" if args.motion_score is not None else ""}.{ext}'),
nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
else:
video_grids = save_video_grid(video_grids)
- imageio.mimwrite(os.path.join(args.save_img_path, f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'), video_grids, fps=args.fps, quality=9)
+ imageio.mimwrite(os.path.join(args.save_img_path, f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}{f"_{args.motion_score}" if args.motion_score is not None else ""}.{ext}'), video_grids, fps=args.fps, quality=6)
print('save path {}'.format(args.save_img_path))
- # save_videos_grid(video, f"./{prompt}.gif")
-
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
- parser.add_argument("--image_size", type=int, default=512)
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
parser.add_argument("--cache_dir", type=str, default='./cache_dir')
parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
- parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--guidance_scale", type=float, default=2.5)
+ parser.add_argument("--motion_score", type=float, default=None)
parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--max_sequence_length", type=int, default=300)
parser.add_argument("--num_sampling_steps", type=int, default=50)
parser.add_argument("--fps", type=int, default=24)
parser.add_argument("--run_time", type=int, default=0)
+ parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--text_prompt", nargs='+')
- parser.add_argument('--force_images', action='store_true')
- parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.125)
parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--compile', action='store_true')
+ parser.add_argument('--model_type', type=str, default="udit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--save_memory', action='store_true')
args = parser.parse_args()
main(args)
\ No newline at end of file
diff --git a/opensora/sample/sample_t2v_ddp.py b/opensora/sample/sample_t2v_ddp.py
new file mode 100644
index 000000000..0492ad9dd
--- /dev/null
+++ b/opensora/sample/sample_t2v_ddp.py
@@ -0,0 +1,399 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, MT5EncoderModel
+
+import os, sys
+
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V as SparseOpenSoraT2V
+# from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+# from opensora.models.captioner.refiner import model_gen
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.sample.pipeline_opensora import OpenSoraPipeline, OpenSoraFreeInitPipeline
+
+import imageio
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+    torch_npu = None
+    npu_config = None
+import time
+
+
+
+def load_t2v_checkpoint(model_path):
+ if args.model_type == 'udit':
+ transformer_model = UDiTT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'sparsedit':
+ transformer_model = SparseOpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'dit':
+ transformer_model = OpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ else:
+ transformer_model = LatteT2V.from_pretrained(model_path, cache_dir=args.cache_dir, low_cpu_mem_usage=False,
+ device_map=None, torch_dtype=weight_dtype)
+ # print(transformer_model.config)
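+    # NOTE: the LatteT2V import is commented out at the top of this file, so
+    # falling through to the 'latte' branch above would raise a NameError.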
+
+ # set eval mode
+ transformer_model.eval()
+ # if True:
+ # pipeline = OpenSoraFreeInitPipeline(
+ # vae=vae,
+ # text_encoder=text_encoder,
+ # tokenizer=tokenizer,
+ # scheduler=scheduler,
+ # transformer=transformer_model
+ # ).to(device)
+ # else:
+ pipeline = OpenSoraPipeline(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model
+ ).to(device)
+
+ if args.compile:
+ # 5% https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+ # options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \
+ # "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \
+ # "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # # options = '{"mode": "max-autotune", "memory_format": "channels_last", \
+ # # "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # from onediffx import compile_pipe
+ # pipeline = compile_pipe(
+ # pipeline, backend="nexfort", options=options, fuse_qkv_projections=True
+ # )
+
+ # 4%
+ pipeline.transformer = torch.compile(pipeline.transformer)
+ return pipeline
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
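+    # checkpoint dirs are assumed to be named 'checkpoint-<step>'; sort by step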
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
+
+
+def run_model_and_save_images(pipeline, model_path):
+ video_grids = []
+ if not isinstance(args.text_prompt, list):
+ args.text_prompt = [args.text_prompt]
+ if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+ text_prompt = open(args.text_prompt[0], 'r').readlines()
+ args.text_prompt = [i.strip() for i in text_prompt]
+
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = """
+ masterpiece, high quality, ultra-detailed,
+ {}.
+ emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
+ sharp focus, high budget, cinemascope, moody, epic, gorgeous
+ """
+
+ positive_prompt = """
+ high quality, high aesthetic, {}
+ """
+
+ negative_prompt = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
+
+
+ for index, prompt in enumerate(args.text_prompt):
+ if index % world_size != local_rank:
+ continue
+ if args.refine_caption:
+ q = f'Translate this brief generation prompt into a detailed caption: {prompt}'
+ query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
+ # print(query)
+ with torch.cuda.amp.autocast():
+ refine_prompt = model_gen(refiner, query, None)
+ refine_prompt = refine_prompt.replace('<|im_end|>', '').replace('', '')
+ input_prompt = positive_prompt.format(refine_prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'refine_prompt ({refine_prompt})\n input_prompt ({input_prompt})\n device ({device})')
+ else:
+ input_prompt = positive_prompt.format(prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'input_prompt ({input_prompt})\n device ({device})')
+ videos = pipeline(
+ input_prompt,
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ motion_score=args.motion_score,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_samples_per_prompt=args.num_samples_per_prompt,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=args.max_sequence_length,
+ ).images
+ print('videos.shape', videos.shape)
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f'{model_path}', f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}.{ext}'),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+ print('save done...')
+
+ else:
+ if args.num_samples_per_prompt == 1:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{model_path}', f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}.{ext}'
+ ), videos[0],
+ fps=args.fps, quality=6) # highest quality is 10, lowest is 0
+ print('save done...')
+ else:
+ for i in range(args.num_samples_per_prompt):
+ print(f'save i {i}')
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{model_path}', f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}_i{i}.{ext}'
+ ), videos[i],
+ fps=args.fps, quality=6) # highest quality is 10, lowest is 0
+ videos = save_video_grid(videos)
+ print(f'save total video_grids {videos.shape}')
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{model_path}', f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}.{ext}'
+ ), videos,
+                    fps=args.fps, quality=6) # highest quality is 10, lowest is 0
+
+ video_grids.append(videos)
+ dist.barrier()
+ video_grids = torch.cat(video_grids, dim=0).cuda()
+ shape = list(video_grids.shape)
+ shape[0] *= world_size
+ gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ video_grids = gathered_tensor.cpu()
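+    # NOTE: all_gather_into_tensor requires identical tensor shapes on every rank,
+    # i.e. the prompts must split evenly across world_size; uneven splits would fail.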
+
+ # video_grids = video_grids.repeat(world_size, 1, 1, 1)
+ # output = torch.zeros(video_grids.shape, dtype=video_grids.dtype, device=device)
+ # dist.all_to_all_single(output, video_grids)
+ # video_grids = output.cpu()
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}_{checkpoint_name}.{ext}')
+
+ if local_rank == 0:
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=6)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--max_sequence_length", type=int, default=512)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--num_samples_per_prompt", type=int, default=1)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--refine_caption', action='store_true')
+ parser.add_argument('--compile', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--motion_score', type=float, default=None)
+ parser.add_argument("--prediction_type", type=str, default='epsilon', help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.")
+ parser.add_argument('--rescale_betas_zero_snr', action='store_true')
+ args = parser.parse_args()
+
+ if torch_npu is not None:
+ npu_config.print_msg(args)
+
+    # Initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ print('world_size', world_size)
+ if torch_npu is not None and npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+ else:
+ torch.cuda.set_device(local_rank)
+ dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=local_rank)
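+    # env:// init expects RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT in the
+    # environment (e.g. set by torchrun). NOTE: 'RANK' is the global rank; using it
+    # as the device index assumes a single-node run.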
+
+ torch.manual_seed(args.seed)
+ weight_dtype = torch.bfloat16
+ device = torch.cuda.current_device()
+ vae = ae_wrapper[args.ae](args.ae_path)
+ print(args.ae)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ text_encoder = MT5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ cache_dir=args.cache_dir, low_cpu_mem_usage=True,
+ torch_dtype=weight_dtype).to(device)
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ cache_dir=args.cache_dir)
+ # text_encoder = T5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+ # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir)
+
+ # text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+ # low_cpu_mem_usage=True, torch_dtype=weight_dtype).to(device)
+ # tokenizer = T5Tokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+ if args.refine_caption:
+ from transformers import AutoModel, AutoTokenizer
+ new_path = '/storage/zhubin/ShareGPT4Video/sharegpt4video/sharecaptioner_v1'
+ refiner_tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
+ refiner = AutoModel.from_pretrained(new_path, torch_dtype=weight_dtype, trust_remote_code=True).eval()
+ refiner.to(device)
+ refiner.tokenizer = refiner_tokenizer
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+ if args.sample_method == 'DDIM': #########
+ scheduler = DDIMScheduler(clip_sample=False, prediction_type=args.prediction_type, rescale_betas_zero_snr=args.rescale_betas_zero_snr, timestep_spacing="trailing")
+ elif args.sample_method == 'EulerDiscrete':
+ scheduler = EulerDiscreteScheduler()
+ elif args.sample_method == 'DDPM': #############
+ scheduler = DDPMScheduler(clip_sample=False)
+ elif args.sample_method == 'DPMSolverMultistep':
+ scheduler = DPMSolverMultistepScheduler()
+ elif args.sample_method == 'DPMSolverSinglestep':
+ scheduler = DPMSolverSinglestepScheduler()
+ elif args.sample_method == 'PNDM':
+ scheduler = PNDMScheduler()
+ elif args.sample_method == 'HeunDiscrete': ########
+ scheduler = HeunDiscreteScheduler()
+ elif args.sample_method == 'EulerAncestralDiscrete':
+ scheduler = EulerAncestralDiscreteScheduler(prediction_type=args.prediction_type, rescale_betas_zero_snr=args.rescale_betas_zero_snr, timestep_spacing="trailing")
+ elif args.sample_method == 'DEISMultistep':
+ scheduler = DEISMultistepScheduler()
+ elif args.sample_method == 'KDPM2AncestralDiscrete': #########
+ scheduler = KDPM2AncestralDiscreteScheduler()
+
+ if not os.path.exists(args.save_img_path):
+ os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+ latest_path = None
+ save_img_path = args.save_img_path
+ # while True:
+ # cur_path = get_latest_path()
+ # # print(cur_path, latest_path)
+ # if cur_path == latest_path:
+ # time.sleep(5)
+ # continue
+
+ # time.sleep(1)
+ # latest_path = cur_path
+ # os.makedirs(os.path.join(args.save_img_path, latest_path), exist_ok=True)
+ # if npu_config is not None:
+ # npu_config.print_msg(f"The latest_path is {latest_path}")
+ # else:
+ # print(f"The latest_path is {latest_path}")
+ if latest_path is None:
+ latest_path = ''
+
+ full_path = f"{args.model_path}"
+ # full_path = f"{args.model_path}/{latest_path}/model_ema"
+ # full_path = f"{args.model_path}/{latest_path}/model"
+ try:
+ pipeline = load_t2v_checkpoint(full_path)
+    except Exception:
+        # the checkpoint may still be mid-write; wait and retry once
+        time.sleep(100)
+ pipeline = load_t2v_checkpoint(full_path)
+ # print('load model')
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = "/home/image_data/npu_profiling_t2v"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=10000, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ run_model_and_save_images(pipeline, latest_path)
+ prof.step()
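+            # NOTE: with schedule(wait=10000, ...) and a single prof.step() call,
+            # the active window is never reached; lower 'wait' to capture a trace.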
+ else:
+ # print('gpu')
+ run_model_and_save_images(pipeline, latest_path)
diff --git a/opensora/sample/sample_t2v_ddp_vbench_gpt.py b/opensora/sample/sample_t2v_ddp_vbench_gpt.py
new file mode 100644
index 000000000..30db76a04
--- /dev/null
+++ b/opensora/sample/sample_t2v_ddp_vbench_gpt.py
@@ -0,0 +1,350 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, MT5EncoderModel
+
+import os, sys
+
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V as SparseOpenSoraT2V
+# from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+# from opensora.models.captioner.refiner import model_gen
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.sample.pipeline_opensora import OpenSoraPipeline, OpenSoraFreeInitPipeline
+
+import imageio
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+    torch_npu = None
+    npu_config = None
+import time
+import pandas as pd
+global epoch
+epoch = 0
+
+def load_t2v_checkpoint(model_path):
+ if args.model_type == 'udit':
+ transformer_model = UDiTT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'sparsedit':
+ transformer_model = SparseOpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'dit':
+ transformer_model = OpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ else:
+ transformer_model = LatteT2V.from_pretrained(model_path, cache_dir=args.cache_dir, low_cpu_mem_usage=False,
+ device_map=None, torch_dtype=weight_dtype)
+ # print(transformer_model.config)
+
+ # set eval mode
+ transformer_model.eval()
+ # if True:
+ # pipeline = OpenSoraFreeInitPipeline(
+ # vae=vae,
+ # text_encoder=text_encoder,
+ # tokenizer=tokenizer,
+ # scheduler=scheduler,
+ # transformer=transformer_model
+ # ).to(device)
+ # else:
+ pipeline = OpenSoraPipeline(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model
+ ).to(device)
+
+ if args.compile:
+ # 5% https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+ # options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \
+ # "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \
+ # "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # # options = '{"mode": "max-autotune", "memory_format": "channels_last", \
+ # # "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # from onediffx import compile_pipe
+ # pipeline = compile_pipe(
+ # pipeline, backend="nexfort", options=options, fuse_qkv_projections=True
+ # )
+
+ # 4%
+ pipeline.transformer = torch.compile(pipeline.transformer)
+ return pipeline
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
+
+
+def run_model_and_save_images(pipeline, model_path):
+ video_grids = []
+ # if not isinstance(args.text_prompt, list):
+ # args.text_prompt = [args.text_prompt]
+ # if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+ # text_prompt = open(args.text_prompt[0], 'r').readlines()
+ # args.text_prompt = [i.strip() for i in text_prompt]
+ print(args.text_prompt[0])
+ df = pd.read_csv(args.text_prompt[0])
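+    # the VBench prompt CSV is assumed to carry a 'refine' column (rewritten caption,
+    # fed to the model) and an 'org' column (original prompt id, used as the filename).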
+ print("=============================================")
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = """
+ masterpiece, high quality, ultra-detailed,
+ {}.
+ emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
+ sharp focus, high budget, cinemascope, moody, epic, gorgeous
+ """
+
+ negative_prompt = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
+
+ for index, row in df.iterrows():
+ if index % world_size != local_rank:
+ continue
+        # hoisted out of the else-branch so the refine_caption path sees a defined
+        # prompt (it previously raised a NameError); assumes 'refine' is the source column
+        prompt = row['refine']
+        if args.refine_caption:
+ q = f'Translate this brief generation prompt into a detailed caption: {prompt}'
+ query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
+ # print(query)
+ with torch.cuda.amp.autocast():
+ refine_prompt = model_gen(refiner, query, None)
+ refine_prompt = refine_prompt.replace('<|im_end|>', '').replace('', '')
+ input_prompt = positive_prompt.format(refine_prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'refine_prompt ({refine_prompt})\n input_prompt ({input_prompt})\n device ({device})')
+        else:
+ input_prompt = positive_prompt.format(prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'input_prompt ({input_prompt})\n device ({device})')
+ videos = pipeline(
+ input_prompt,
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ motion_score=args.motion_score,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=args.max_sequence_length,
+ ).images
+ print('videos.shape', videos.shape)
+ try:
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f"{model_path}", f"{row['org']}-{epoch}.{ext}"),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+ print('save done...')
+
+ else:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f"{model_path}", f"{row['org']}-{epoch}.{ext}"
+ ), videos[0],
+ fps=args.fps, quality=6) # highest quality is 10, lowest is 0
+ print('save done...')
+        except Exception as e:
+            print('Error when saving {}: {}'.format(prompt, e))
+ video_grids.append(videos)
+ dist.barrier()
+ video_grids = torch.cat(video_grids, dim=0).cuda()
+ shape = list(video_grids.shape)
+ shape[0] *= world_size
+ gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ video_grids = gathered_tensor.cpu()
+
+ # video_grids = video_grids.repeat(world_size, 1, 1, 1)
+ # output = torch.zeros(video_grids.shape, dtype=video_grids.dtype, device=device)
+ # dist.all_to_all_single(output, video_grids)
+ # video_grids = output.cpu()
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}_{checkpoint_name}.{ext}')
+
+ if local_rank == 0:
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=6)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--max_sequence_length", type=int, default=512)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument("--seed", nargs='+', default=42, help="List of seed values")
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--refine_caption', action='store_true')
+ parser.add_argument('--compile', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--motion_score', type=float, default=None)
+ args = parser.parse_args()
+
+ if torch_npu is not None:
+ npu_config.print_msg(args)
+
+    # Initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ print('world_size', world_size)
+ if torch_npu is not None and npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+ else:
+ torch.cuda.set_device(local_rank)
+ dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=local_rank)
+
+ # torch.manual_seed(args.seed)
+ weight_dtype = torch.bfloat16
+ device = torch.cuda.current_device()
+ vae = ae_wrapper[args.ae](args.ae_path)
+ print(args.ae)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ text_encoder = MT5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ cache_dir=args.cache_dir, low_cpu_mem_usage=True,
+ torch_dtype=weight_dtype).to(device)
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ cache_dir=args.cache_dir)
+ # text_encoder = T5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+    # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir)
+
+ # text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+ # low_cpu_mem_usage=True, torch_dtype=weight_dtype).to(device)
+ # tokenizer = T5Tokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+ if args.refine_caption:
+ from transformers import AutoModel, AutoTokenizer
+ new_path = '/storage/zhubin/ShareGPT4Video/sharegpt4video/sharecaptioner_v1'
+ refiner_tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
+ refiner = AutoModel.from_pretrained(new_path, torch_dtype=weight_dtype, trust_remote_code=True).eval()
+ refiner.to(device)
+ refiner.tokenizer = refiner_tokenizer
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+ if args.sample_method == 'DDIM': #########
+ scheduler = DDIMScheduler(clip_sample=False)
+ elif args.sample_method == 'EulerDiscrete':
+ scheduler = EulerDiscreteScheduler()
+ elif args.sample_method == 'DDPM': #############
+ scheduler = DDPMScheduler(clip_sample=False)
+ elif args.sample_method == 'DPMSolverMultistep':
+ scheduler = DPMSolverMultistepScheduler()
+ elif args.sample_method == 'DPMSolverSinglestep':
+ scheduler = DPMSolverSinglestepScheduler()
+ elif args.sample_method == 'PNDM':
+ scheduler = PNDMScheduler()
+ elif args.sample_method == 'HeunDiscrete': ########
+ scheduler = HeunDiscreteScheduler()
+ elif args.sample_method == 'EulerAncestralDiscrete':
+ scheduler = EulerAncestralDiscreteScheduler()
+ elif args.sample_method == 'DEISMultistep':
+ scheduler = DEISMultistepScheduler()
+ elif args.sample_method == 'KDPM2AncestralDiscrete': #########
+ scheduler = KDPM2AncestralDiscreteScheduler()
+
+ if not os.path.exists(args.save_img_path):
+ os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+ latest_path = None
+ save_img_path = args.save_img_path
+
+ if latest_path is None:
+ latest_path = ''
+
+ full_path = f"{args.model_path}"
+
+ try:
+ pipeline = load_t2v_checkpoint(full_path)
+    except Exception:
+        # the checkpoint may still be mid-write; wait and retry once
+        time.sleep(100)
+ pipeline = load_t2v_checkpoint(full_path)
+ # print('load model')
+ print(args.seed)
+ for i in args.seed:
+ print("seed")
+ print(i)
+ torch.manual_seed(int(i))
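+        # re-seed before each full pass so every seed yields a reproducible,
+        # independent set of samples; 'epoch' tags the output filenames per seed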
+ # print('gpu')
+ run_model_and_save_images(pipeline, latest_path)
+ epoch += 1
diff --git a/opensora/sample/sample_t2v_ddp_vbench_org.py b/opensora/sample/sample_t2v_ddp_vbench_org.py
new file mode 100644
index 000000000..3d7692e82
--- /dev/null
+++ b/opensora/sample/sample_t2v_ddp_vbench_org.py
@@ -0,0 +1,347 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, MT5EncoderModel
+
+import os, sys
+
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V as SparseOpenSoraT2V
+# from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+# from opensora.models.captioner.refiner import model_gen
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.sample.pipeline_opensora import OpenSoraPipeline, OpenSoraFreeInitPipeline
+
+import imageio
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except ImportError:
+    torch_npu = None
+    npu_config = None
+import time
+
+global epoch
+epoch = 0
+
+def load_t2v_checkpoint(model_path):
+ if args.model_type == 'udit':
+ transformer_model = UDiTT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'sparsedit':
+ transformer_model = SparseOpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'dit':
+ transformer_model = OpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ else:
+ transformer_model = LatteT2V.from_pretrained(model_path, cache_dir=args.cache_dir, low_cpu_mem_usage=False,
+ device_map=None, torch_dtype=weight_dtype)
+ # print(transformer_model.config)
+
+ # set eval mode
+ transformer_model.eval()
+ # if True:
+ # pipeline = OpenSoraFreeInitPipeline(
+ # vae=vae,
+ # text_encoder=text_encoder,
+ # tokenizer=tokenizer,
+ # scheduler=scheduler,
+ # transformer=transformer_model
+ # ).to(device)
+ # else:
+ pipeline = OpenSoraPipeline(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model
+ ).to(device)
+
+ if args.compile:
+ # 5% https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+ # options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \
+ # "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \
+ # "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # # options = '{"mode": "max-autotune", "memory_format": "channels_last", \
+ # # "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}'
+ # from onediffx import compile_pipe
+ # pipeline = compile_pipe(
+ # pipeline, backend="nexfort", options=options, fuse_qkv_projections=True
+ # )
+
+ # 4%
+ pipeline.transformer = torch.compile(pipeline.transformer)
+ return pipeline
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
+
+
+def run_model_and_save_images(pipeline, model_path):
+ video_grids = []
+ if not isinstance(args.text_prompt, list):
+ args.text_prompt = [args.text_prompt]
+ if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+ text_prompt = open(args.text_prompt[0], 'r').readlines()
+ args.text_prompt = [i.strip() for i in text_prompt]
+
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = """
+ masterpiece, high quality, ultra-detailed,
+ {}.
+ emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
+ sharp focus, high budget, cinemascope, moody, epic, gorgeous
+ """
+
+ negative_prompt = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
+
+ for index, prompt in enumerate(args.text_prompt):
+ if index % world_size != local_rank:
+ continue
+ if args.refine_caption:
+ q = f'Translate this brief generation prompt into a detailed caption: {prompt}'
+ query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
+ # print(query)
+ with torch.cuda.amp.autocast():
+ refine_prompt = model_gen(refiner, query, None)
+ refine_prompt = refine_prompt.replace('<|im_end|>', '').replace('', '')
+ input_prompt = positive_prompt.format(refine_prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'refine_prompt ({refine_prompt})\n input_prompt ({input_prompt})\n device ({device})')
+ else:
+ input_prompt = positive_prompt.format(prompt)
+ print(f'Processing the origin prompt({prompt})\n '
+ f'input_prompt ({input_prompt})\n device ({device})')
+ videos = pipeline(
+ input_prompt,
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ motion_score=args.motion_score,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=args.max_sequence_length,
+ ).images
+ print('videos.shape', videos.shape)
+ try:
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f'{model_path}', f'{prompt}-{epoch}.{ext}'),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+ print('save done...')
+
+ else:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{model_path}', f'{prompt}-{epoch}.{ext}'
+ ), videos[0],
+ fps=args.fps, quality=6) # highest quality is 10, lowest is 0
+ print('save done...')
+        except Exception as e:
+            print('Error when saving {}: {}'.format(prompt, e))
+ video_grids.append(videos)
+ dist.barrier()
+ video_grids = torch.cat(video_grids, dim=0).cuda()
+ shape = list(video_grids.shape)
+ shape[0] *= world_size
+ gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ video_grids = gathered_tensor.cpu()
+
+ # video_grids = video_grids.repeat(world_size, 1, 1, 1)
+ # output = torch.zeros(video_grids.shape, dtype=video_grids.dtype, device=device)
+ # dist.all_to_all_single(output, video_grids)
+ # video_grids = output.cpu()
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_m{args.motion_score}_{checkpoint_name}.{ext}')
+
+ if local_rank == 0:
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=6)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--max_sequence_length", type=int, default=512)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument("--seed", nargs='+', default=42, help="List of seed values")
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--refine_caption', action='store_true')
+ parser.add_argument('--compile', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--motion_score', type=float, default=None)
+ args = parser.parse_args()
+
+ if torch_npu is not None:
+ npu_config.print_msg(args)
+
+    # Initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ print('world_size', world_size)
+ if torch_npu is not None and npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+ else:
+ torch.cuda.set_device(local_rank)
+ dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=local_rank)
+
+ # torch.manual_seed(args.seed)
+ weight_dtype = torch.bfloat16
+ device = torch.cuda.current_device()
+ vae = ae_wrapper[args.ae](args.ae_path)
+ print(args.ae)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ text_encoder = MT5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ cache_dir=args.cache_dir, low_cpu_mem_usage=True,
+ torch_dtype=weight_dtype).to(device)
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ cache_dir=args.cache_dir)
+ # text_encoder = T5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+    # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/models--DeepFloyd--t5-v1_1-xxl/snapshots/c9c625d2ec93667ec579ede125fd3811d1f81d37", cache_dir=args.cache_dir)
+
+ # text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+ # low_cpu_mem_usage=True, torch_dtype=weight_dtype).to(device)
+ # tokenizer = T5Tokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+ if args.refine_caption:
+ from transformers import AutoModel, AutoTokenizer
+ new_path = '/storage/zhubin/ShareGPT4Video/sharegpt4video/sharecaptioner_v1'
+ refiner_tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
+ refiner = AutoModel.from_pretrained(new_path, torch_dtype=weight_dtype, trust_remote_code=True).eval()
+ refiner.to(device)
+ refiner.tokenizer = refiner_tokenizer
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+ if args.sample_method == 'DDIM': #########
+ scheduler = DDIMScheduler(clip_sample=False)
+ elif args.sample_method == 'EulerDiscrete':
+ scheduler = EulerDiscreteScheduler()
+ elif args.sample_method == 'DDPM': #############
+ scheduler = DDPMScheduler(clip_sample=False)
+ elif args.sample_method == 'DPMSolverMultistep':
+ scheduler = DPMSolverMultistepScheduler()
+ elif args.sample_method == 'DPMSolverSinglestep':
+ scheduler = DPMSolverSinglestepScheduler()
+ elif args.sample_method == 'PNDM':
+ scheduler = PNDMScheduler()
+ elif args.sample_method == 'HeunDiscrete': ########
+ scheduler = HeunDiscreteScheduler()
+ elif args.sample_method == 'EulerAncestralDiscrete':
+ scheduler = EulerAncestralDiscreteScheduler()
+ elif args.sample_method == 'DEISMultistep':
+ scheduler = DEISMultistepScheduler()
+ elif args.sample_method == 'KDPM2AncestralDiscrete': #########
+ scheduler = KDPM2AncestralDiscreteScheduler()
+
+ if not os.path.exists(args.save_img_path):
+ os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+ latest_path = None
+ save_img_path = args.save_img_path
+
+ if latest_path is None:
+ latest_path = ''
+
+ full_path = f"{args.model_path}"
+
+ try:
+ pipeline = load_t2v_checkpoint(full_path)
+    except Exception:
+        # the checkpoint may still be mid-write; wait and retry once
+        time.sleep(100)
+ pipeline = load_t2v_checkpoint(full_path)
+ # print('load model')
+ print(args.seed)
+ for i in args.seed:
+ print("seed")
+ print(i)
+ torch.manual_seed(int(i))
+ # print('gpu')
+ run_model_and_save_images(pipeline, latest_path)
+ epoch += 1
diff --git a/opensora/sample/sample_t2v_on_npu.py b/opensora/sample/sample_t2v_on_npu.py
new file mode 100644
index 000000000..2ee531cb3
--- /dev/null
+++ b/opensora/sample/sample_t2v_on_npu.py
@@ -0,0 +1,296 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, MT5EncoderModel, T5Tokenizer, AutoTokenizer
+
+import os, sys
+
+from opensora.adaptor.modules import replace_with_fp32_forwards
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.models.diffusion.opensora2.modeling_opensora import OpenSoraT2V as SparseOpenSoraT2V
+# from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+# from opensora.models.diffusion.udit_ultra.modeling_udit_ultra import UDiTUltraT2V
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+
+import imageio
+
+try:
+ import torch_npu
+except ImportError:
+    pass
+import time
+from opensora.npu_config import npu_config
+
+
+def load_t2v_checkpoint(model_path):
+ if args.model_type == 'udit':
+ transformer_model = UDiTT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'sparsedit':
+ transformer_model = SparseOpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'dit':
+ transformer_model = OpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ else:
+ transformer_model = LatteT2V.from_pretrained(model_path, cache_dir=args.cache_dir, low_cpu_mem_usage=False,
+ device_map=None, torch_dtype=weight_dtype)
+
+ # set eval mode
+ transformer_model.eval()
+ pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model).to(device)
+
+ return pipeline
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
+
+
+def run_model_and_save_images(pipeline, model_path):
+ video_grids = []
+ if not isinstance(args.text_prompt, list):
+ args.text_prompt = [args.text_prompt]
+ if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+ text_prompt = open(args.text_prompt[0], 'r').readlines()
+ args.text_prompt = [i.strip() for i in text_prompt]
+
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = "(masterpiece), (best quality), (ultra-detailed), {}. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous"
+ negative_prompt = "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
+ for index, prompt in enumerate(args.text_prompt):
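+        # shard prompts across the node's devices: each rank only handles indices where index % N_NPU_PER_NODE == local_rank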
+ if index % npu_config.N_NPU_PER_NODE != local_rank:
+ continue
+        print('Processing prompt: {}'.format(prompt))
+ videos = pipeline(positive_prompt.format(prompt),
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ motion_score=args.motion_score,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=512,
+ ).images
+ print(videos.shape)
+ try:
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+
+ else:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{args.sample_method}_{index}_{checkpoint_name}__gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'
+ ), videos[0],
+ fps=args.fps, quality=9, codec='libx264',
+ output_params=['-threads', '20']) # highest quality is 10, lowest is 0
+        except Exception as e:
+            print('Error when saving {}: {}'.format(prompt, e))
+ video_grids.append(videos)
+
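+    # gather every rank's videos into one tensor (stacked along dim 0 in rank order) so the full grid can be assembled on CPU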
+    video_grids = torch.cat(video_grids, dim=0).to(device)
+ shape = list(video_grids.shape)
+ shape[0] *= world_size
+ gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ video_grids = gathered_tensor.cpu()
+
+ # video_grids = video_grids.repeat(world_size, 1, 1, 1)
+ # output = torch.zeros(video_grids.shape, dtype=video_grids.dtype, device=device)
+ # dist.all_to_all_single(output, video_grids)
+ # video_grids = output.cpu()
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_{checkpoint_name}.{ext}')
+
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=9)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--run_time", type=int, default=0)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+    parser.add_argument('--enable_tiling', action='store_true')
+    parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['sparsedit', 'dit', 'udit', 'latte'])
+ parser.add_argument('--motion_score', type=float, default=None)
+ args = parser.parse_args()
+
+ npu_config.print_msg(args)
+ npu_config.conv_dtype = torch.bfloat16
+ replace_with_fp32_forwards()
+
+    # Initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ if npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+        dist.init_process_group(backend='hccl', init_method='env://', world_size=world_size, rank=local_rank)
+
+ # torch.manual_seed(args.seed)
+ weight_dtype = torch.float16
+ device = torch.cuda.current_device()
+
+ # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir=args.cache_dir)
+ vae = ae_wrapper[args.ae](args.ae_path)
+ print(args.ae)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
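+        # tiled VAE decoding: the decoder runs over overlapping spatial/temporal tiles and blends them,
+        # bounding peak memory at the cost of some speed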
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
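+    # ae_stride_config maps the autoencoder name to its (t, h, w) compression stride (e.g. 4x8x8 for CausalVAEModel_4x8x8)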
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ text_encoder = MT5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=True, torch_dtype=torch.float16).to(device)
+ tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+    if args.sample_method == 'DDIM':
+        scheduler = DDIMScheduler()
+    elif args.sample_method == 'EulerDiscrete':
+        scheduler = EulerDiscreteScheduler()
+    elif args.sample_method == 'DDPM':
+        scheduler = DDPMScheduler()
+    elif args.sample_method == 'DPMSolverMultistep':
+        scheduler = DPMSolverMultistepScheduler()
+    elif args.sample_method == 'DPMSolverSinglestep':
+        scheduler = DPMSolverSinglestepScheduler()
+    elif args.sample_method == 'PNDM':
+        scheduler = PNDMScheduler()
+    elif args.sample_method == 'HeunDiscrete':
+        scheduler = HeunDiscreteScheduler()
+    elif args.sample_method == 'EulerAncestralDiscrete':
+        scheduler = EulerAncestralDiscreteScheduler()
+    elif args.sample_method == 'DEISMultistep':
+        scheduler = DEISMultistepScheduler()
+    elif args.sample_method == 'KDPM2AncestralDiscrete':
+        scheduler = KDPM2AncestralDiscreteScheduler()
+    else:
+        raise ValueError(f'Unknown sample_method: {args.sample_method}')
+
+    os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+    # The checkpoint-polling loop used while training is disabled in this script;
+    # sample once from the checkpoint pointed to by --model_path.
+    latest_path = ''
+    save_img_path = args.save_img_path
+
+ npu_config.print_msg(f"The latest_path is {latest_path}")
+ full_path = f"{args.model_path}"
+ # full_path = f"{args.model_path}/{latest_path}/model_ema"
+ # full_path = "/home/opensora/captions/240p_model_ema"
+ pipeline = load_t2v_checkpoint(full_path)
+
+ if npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = "/home/image_data/npu_profiling_t2v"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=10000, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ run_model_and_save_images(pipeline, latest_path)
+ prof.step()
+ else:
+ run_model_and_save_images(pipeline, latest_path)
diff --git a/opensora/sample/sample_t2v_sp.py b/opensora/sample/sample_t2v_sp.py
new file mode 100644
index 000000000..e1c7f3bf2
--- /dev/null
+++ b/opensora/sample/sample_t2v_sp.py
@@ -0,0 +1,319 @@
+import math
+import os
+import torch
+import argparse
+import torchvision
+import torch.distributed as dist
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder, Transformer2DModel
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, MT5EncoderModel
+
+
+from opensora.models.causalvideovae import ae_stride_config, ae_channel_config, ae_norm, ae_denorm, CausalVAEModelWrapper
+
+from opensora.models.diffusion.udit.modeling_udit import UDiTT2V
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.sample.pipeline_opensora_sp import OpenSoraPipeline
+
+import imageio
+
+try:
+    import torch_npu
+    from opensora.npu_config import npu_config
+    # on NPU, the hccl parallel state plays the role this script refers to as nccl_info
+    from opensora.acceleration.parallel_states import initialize_sequence_parallel_state, hccl_info as nccl_info
+except ImportError:
+    torch_npu = None
+    npu_config = None
+    from opensora.utils.parallel_states import initialize_sequence_parallel_state, nccl_info
+import time
+
+
+
+def load_t2v_checkpoint(model_path):
+ if args.model_type == 'udit':
+ transformer_model = UDiTT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+ elif args.model_type == 'dit':
+ transformer_model = OpenSoraT2V.from_pretrained(model_path, cache_dir=args.cache_dir,
+ low_cpu_mem_usage=False, device_map=None,
+ torch_dtype=weight_dtype)
+    else:
+        # no LatteT2V import in this script; fail with a clear message instead of a NameError
+        raise NotImplementedError(f"model_type '{args.model_type}' is not supported here")
+ # print(transformer_model.config)
+
+ # set eval mode
+ transformer_model.eval()
+ pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model).to(device)
+
+
+    if args.compile:
+        # Option A (~5% speedup): onediff's nexfort backend, see
+        # https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+        # options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", \
+        #     "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \
+        #     "triton.fuse_attention_allow_fp16_reduction": false}}'
+        # from onediffx import compile_pipe
+        # pipeline = compile_pipe(
+        #     pipeline, backend="nexfort", options=options, fuse_qkv_projections=True
+        # )
+
+        # Option B (~4% speedup): plain torch.compile on the transformer
+        pipeline.transformer = torch.compile(pipeline.transformer)
+    return pipeline
+
+
+def get_latest_path():
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.model_path)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ return path
+
+
+def run_model_and_save_images(pipeline, model_path):
+ video_grids = []
+ if not isinstance(args.text_prompt, list):
+ args.text_prompt = [args.text_prompt]
+ if len(args.text_prompt) == 1 and args.text_prompt[0].endswith('txt'):
+ text_prompt = open(args.text_prompt[0], 'r').readlines()
+ args.text_prompt = [i.strip() for i in text_prompt]
+
+ checkpoint_name = f"{os.path.basename(model_path)}"
+
+ positive_prompt = """
+ (masterpiece), (best quality), (ultra-detailed), (unwatermarked),
+ {}.
+ emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
+ sharp focus, high budget, cinemascope, moody, epic, gorgeous
+ """
+
+ negative_prompt = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
+
+ for index, prompt in enumerate(args.text_prompt):
+ # if index % world_size != local_rank:
+ # continue
+ # print('Processing the ({}) prompt, device ({})'.format(prompt, device))
+ videos = pipeline(positive_prompt.format(prompt),
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=args.device,
+ max_sequence_length=args.max_sequence_length,
+ ).images
+ print(videos.shape)
+
+ if nccl_info.rank <= 0:
+ try:
+ if args.num_frames == 1:
+ videos = videos[:, 0].permute(0, 3, 1, 2) # b t h w c -> b c h w
+ save_image(videos / 255.0, os.path.join(args.save_img_path,
+ f'{args.sample_method}_{index}_{checkpoint_name}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'),
+ nrow=1, normalize=True, value_range=(0, 1)) # t c h w
+
+ else:
+ imageio.mimwrite(
+ os.path.join(
+ args.save_img_path,
+ f'{args.sample_method}_{index}_{checkpoint_name}__gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'
+ ), videos[0],
+ fps=args.fps, quality=6, codec='libx264',
+ output_params=['-threads', '20']) # highest quality is 10, lowest is 0
+            except Exception as e:
+                print('Error when saving {}: {}'.format(prompt, e))
+ video_grids.append(videos)
+ if nccl_info.rank <= 0:
+ video_grids = torch.cat(video_grids, dim=0)
+
+ def get_file_name():
+ return os.path.join(args.save_img_path,
+ f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}_{checkpoint_name}.{ext}')
+
+ if args.num_frames == 1:
+ save_image(video_grids / 255.0, get_file_name(),
+ nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ else:
+ video_grids = save_video_grid(video_grids)
+ imageio.mimwrite(get_file_name(), video_grids, fps=args.fps, quality=6)
+
+ print('save path {}'.format(args.save_img_path))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=7.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--max_sequence_length", type=int, default=512)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--model_type', type=str, default="dit", choices=['dit', 'udit', 'latte'])
+ parser.add_argument('--compile', action='store_true')
+ parser.add_argument('--save_memory', action='store_true')
+ args = parser.parse_args()
+
+ if torch_npu is not None:
+ npu_config.print_msg(args)
+
+    # Initialize the distributed environment
+ local_rank = int(os.getenv('RANK', 0))
+ world_size = int(os.getenv('WORLD_SIZE', 1))
+ print('world_size', world_size)
+ if torch_npu is not None and npu_config.on_npu:
+ torch_npu.npu.set_device(local_rank)
+ else:
+ torch.cuda.set_device(local_rank)
+    backend = 'hccl' if torch_npu is not None and npu_config.on_npu else 'nccl'
+    dist.init_process_group(backend=backend, init_method='env://', world_size=world_size, rank=local_rank)
+ initialize_sequence_parallel_state(world_size)
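+    # sequence parallelism shards the attention sequence across ranks, so all ranks cooperate on the
+    # same sample; that is why only nccl_info.rank <= 0 writes output files below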
+ # torch.manual_seed(args.seed)
+ weight_dtype = torch.bfloat16
+ device = torch.cuda.current_device()
+ # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir=args.cache_dir)
+ vae = CausalVAEModelWrapper(args.ae_path)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+    text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir,
+                                                  low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+    tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir=args.cache_dir)
+
+ # set eval mode
+ vae.eval()
+ text_encoder.eval()
+
+    if args.sample_method == 'DDIM':
+        scheduler = DDIMScheduler(clip_sample=False)
+    elif args.sample_method == 'EulerDiscrete':
+        scheduler = EulerDiscreteScheduler()
+    elif args.sample_method == 'DDPM':
+        scheduler = DDPMScheduler(clip_sample=False)
+    elif args.sample_method == 'DPMSolverMultistep':
+        scheduler = DPMSolverMultistepScheduler()
+    elif args.sample_method == 'DPMSolverSinglestep':
+        scheduler = DPMSolverSinglestepScheduler()
+    elif args.sample_method == 'PNDM':
+        scheduler = PNDMScheduler()
+    elif args.sample_method == 'HeunDiscrete':
+        scheduler = HeunDiscreteScheduler()
+    elif args.sample_method == 'EulerAncestralDiscrete':
+        scheduler = EulerAncestralDiscreteScheduler()
+    elif args.sample_method == 'DEISMultistep':
+        scheduler = DEISMultistepScheduler()
+    elif args.sample_method == 'KDPM2AncestralDiscrete':
+        scheduler = KDPM2AncestralDiscreteScheduler()
+    else:
+        raise ValueError(f'Unknown sample_method: {args.sample_method}')
+
+    os.makedirs(args.save_img_path, exist_ok=True)
+
+ if args.num_frames == 1:
+ video_length = 1
+ ext = 'jpg'
+ else:
+ ext = 'mp4'
+
+ latest_path = None
+ save_img_path = args.save_img_path
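+    # poll --model_path for newly written "checkpoint-<step>" directories and sample
+    # from each new one as training progresses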
+ while True:
+ cur_path = get_latest_path()
+ if cur_path == latest_path:
+ time.sleep(5)
+ continue
+
+ time.sleep(1)
+ latest_path = cur_path
+ if npu_config is not None:
+ npu_config.print_msg(f"The latest_path is {latest_path}")
+ else:
+ print(f"The latest_path is {latest_path}")
+ full_path = f"{args.model_path}/{latest_path}/model_ema"
+ # full_path = f"{args.model_path}/{latest_path}/model"
+ pipeline = load_t2v_checkpoint(full_path)
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = "/home/image_data/npu_profiling_t2v"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=10000, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ run_model_and_save_images(pipeline, latest_path)
+ prof.step()
+ else:
+ run_model_and_save_images(pipeline, latest_path)
diff --git a/opensora/sample/sample_v2v.py b/opensora/sample/sample_v2v.py
new file mode 100644
index 000000000..2d19094ca
--- /dev/null
+++ b/opensora/sample/sample_v2v.py
@@ -0,0 +1,375 @@
+import os, sys
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.abspath(os.path.join(current_dir, '../..'))
+if project_root not in sys.path:
+ sys.path.append(project_root)
+
+import math
+from accelerate.utils import set_seed
+import torch
+import argparse
+import torchvision
+
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
+from omegaconf import OmegaConf
+from torchvision.utils import save_image
+from transformers import T5EncoderModel, MT5EncoderModel, UMT5EncoderModel, AutoTokenizer
+
+
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+
+from opensora.models.diffusion import Diffusion_models, Diffusion_models_class
+
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+
+from opensora.models.ae import ae_stride_config, getae, getae_wrapper
+from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
+from opensora.models.diffusion.opensora.modeling_inpaint import OpenSoraInpaint, VIPNet
+from opensora.models.diffusion.opensora.modeling_inpaint import STR_TO_TYPE, TYPE_TO_STR, ModelType
+import timm
+
+from opensora.models.text_encoder import get_text_enc
+from opensora.utils.utils import save_video_grid
+
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+from opensora.sample.pipeline_inpaint import hacked_pipeline_call, decode_latents
+# for validation
+import glob
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import Lambda
+from opensora.dataset.transform import ToTensorVideo, CenterCropResizeVideo, TemporalRandomCrop, LongSideResizeVideo, SpatialStrideCropVideo, ToTensorAfterResize
+import numpy as np
+from einops import rearrange
+
+import imageio
+import gc
+import time
+
+@torch.inference_mode()
+def validation(args):
+
+ # torch.manual_seed(args.seed)
+ weight_dtype = torch.bfloat16
+ device = torch.device(f'cuda:{args.rank}')
+
+ model_type = STR_TO_TYPE[args.model_type]
+
+ norm_fun = Lambda(lambda x: 2. * x - 1.)
+
+ resize_transform = CenterCropResizeVideo((args.height, args.width))
+ transform = transforms.Compose([
+ ToTensorAfterResize(),
+        # RandomHorizontalFlipVideo(p=0.5),  # disabled: captions may contain position descriptions
+ norm_fun
+ ])
+
+
+    # helper to resize/normalize conditioning images (currently unused below)
+    def preprocess_images(images, condition_images_indices):
+        condition_images = [resize_transform(image) for image in images]
+        condition_images = [transform(image).to(device=device, dtype=weight_dtype) for image in condition_images]
+        return dict(condition_images=condition_images, condition_images_indices=condition_images_indices)
+
+
+ # vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir=args.cache_dir)
+ vae = getae_wrapper(args.ae)(args.ae_path)
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
+ if args.enable_tiling:
+ vae.vae.enable_tiling()
+ vae.vae.tile_overlap_factor = args.tile_overlap_factor
+ vae.vae.tile_sample_min_size = 512
+ vae.vae.tile_latent_min_size = 64
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ if args.save_memory:
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config[args.ae]
+
+ text_encoder = MT5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl", cache_dir=args.cache_dir, low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl", cache_dir=args.cache_dir)
+
+ if os.path.exists(args.model_path):
+ transformer_model_path = args.model_path
+ else:
+ transformer_model_path = args.pretrained_transformer_model_path
+
+ if args.rank == 0:
+ print(transformer_model_path)
+
+ transformer_model = OpenSoraInpaint.from_pretrained(transformer_model_path, torch_dtype=weight_dtype)
+
+ if transformer_model_path != args.model_path:
+ pretrained_path = dict(transformer=args.pretrained_transformer_model_path)
+ transformer_model.custom_load_state_dict(pretrained_path)
+ transformer_model.to(dtype=weight_dtype)
+
+ transformer_model = transformer_model.to(device)
+ vae.vae = vae.vae.to(device)
+ text_encoder = text_encoder.to(device)
+
+ # set eval mode
+ transformer_model.eval()
+ vae.eval()
+ text_encoder.eval()
+
+
+    if args.sample_method == 'DDIM':
+        scheduler = DDIMScheduler(clip_sample=False)
+    elif args.sample_method == 'EulerDiscrete':
+        scheduler = EulerDiscreteScheduler()
+    elif args.sample_method == 'DDPM':
+        scheduler = DDPMScheduler(clip_sample=False)
+ elif args.sample_method == 'DPMSolverMultistep':
+ '''
+ DPM++ 2M DPMSolverMultistepScheduler
+ DPM++ 2M Karras DPMSolverMultistepScheduler init with use_karras_sigmas=True
+ DPM++ 2M SDE DPMSolverMultistepScheduler init with algorithm_type="sde-dpmsolver++"
+ DPM++ 2M SDE Karras DPMSolverMultistepScheduler init with use_karras_sigmas=True and algorithm_type="sde-dpmsolver++"
+
+ DPM++ SDE DPMSolverSinglestepScheduler
+ DPM++ SDE Karras DPMSolverSinglestepScheduler init with use_karras_sigmas=True
+ DPM2 KDPM2DiscreteScheduler
+ DPM2 Karras KDPM2DiscreteScheduler init with use_karras_sigmas=True
+ DPM2 a KDPM2AncestralDiscreteScheduler
+ DPM2 a Karras KDPM2AncestralDiscreteScheduler init with use_karras_sigmas=True
+ '''
+ # scheduler = DPMSolverMultistepScheduler(use_karras_sigmas=True)
+ scheduler = DPMSolverMultistepScheduler()
+ elif args.sample_method == 'DPMSolverSinglestep':
+ scheduler = DPMSolverSinglestepScheduler()
+ elif args.sample_method == 'PNDM':
+ scheduler = PNDMScheduler()
+    elif args.sample_method == 'HeunDiscrete':
+ scheduler = HeunDiscreteScheduler()
+ elif args.sample_method == 'EulerAncestralDiscrete':
+ scheduler = EulerAncestralDiscreteScheduler()
+ elif args.sample_method == 'DEISMultistep':
+ scheduler = DEISMultistepScheduler()
+    elif args.sample_method == 'KDPM2AncestralDiscrete':
+        scheduler = KDPM2AncestralDiscreteScheduler()
+    elif args.sample_method == 'EulerDiscreteSVD':
+        scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-video-diffusion-img2vid",
+                                                           subfolder="scheduler", cache_dir=args.cache_dir)
+    else:
+        raise ValueError(f'Unknown sample_method: {args.sample_method}')
+ # Save the generated videos
+ save_dir = args.save_img_path
+ os.makedirs(save_dir, exist_ok=True)
+
+ pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer_model)
+
+ pipeline.__call__ = hacked_pipeline_call.__get__(pipeline, OpenSoraPipeline)
+ pipeline.decode_latents = decode_latents.__get__(pipeline, OpenSoraPipeline)
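+    # Note: Python resolves special methods like __call__ on the class, not the instance,
+    # so the patched pipeline must be invoked explicitly as pipeline.__call__(...) (as done below);
+    # plain pipeline(...) would still use the original implementation.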
+
+ validation_dir = args.validation_dir if args.validation_dir is not None else "./validation"
+ prompt_file = os.path.join(validation_dir, "prompt.txt")
+
+    with open(prompt_file, 'r') as f:
+        validation_prompt = [line.strip() for line in f.readlines()]
+
+ index = 0
+ validation_images_list = []
+ img_ext = 'png'
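+    # collect conditioning images whose filenames embed a 4-digit sample index
+    # (e.g. "*_0003*.png"); stop at the first missing index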
+ while True:
+ temp = glob.glob(os.path.join(validation_dir, f"*_{index:04d}*.{img_ext}"))
+ print(temp)
+ if len(temp) > 0:
+ validation_images_list.append(sorted(temp))
+ index += 1
+ else:
+ break
+
+ positive_prompt = "(masterpiece), (best quality), (ultra-detailed), {}. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous"
+ negative_prompt = """nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry,
+ """
+
+ videos = []
+ max_sample_num = args.max_sample_num // args.world_size
+ current_sample_num = 0
+
+ v2v_epoch = 1
+ overlapping_ratio = 0.2
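+    # autoregressive extension: each extra pass reuses the last `overlapping` frames of the previous
+    # clip as conditioning frames (indices 0..overlapping-1) for the next clip, and the overlap is
+    # de-duplicated when concatenating; with v2v_epoch = 1 only a single clip is generated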
+
+ for idx, (prompt, images) in enumerate(zip(validation_prompt, validation_images_list)):
+
+        if current_sample_num >= max_sample_num:
+ break
+
+ if not isinstance(images, list):
+ images = [images]
+ if 'img' in images[0]:
+ continue
+
+ if idx % args.world_size != args.rank:
+ continue
+
+
+ video_list = []
+
+        condition_images, condition_images_indices = None, None
+
+ video_length = args.num_frames
+ overlapping = int(overlapping_ratio * video_length)
+
+ for epoch_id in range(v2v_epoch):
+
+ if epoch_id == 0:
+ if len(images) == 1:
+ condition_images_indices = [0]
+ elif len(images) == 2:
+ condition_images_indices = [0, -1]
+ condition_images = [Image.open(image).convert("RGB") for image in images]
+ condition_images = [torch.from_numpy(np.copy(np.array(image))) for image in condition_images]
+ condition_images = [rearrange(image, 'h w c -> c h w').unsqueeze(0) for image in condition_images]
+ condition_images = [resize_transform(image) for image in condition_images]
+ condition_images = [transform(image).to(device=device, dtype=weight_dtype) for image in condition_images]
+ else:
+ condition_images = rearrange(condition_images, 'f h w c -> f c h w')
+ condition_images = resize_transform(condition_images)
+ condition_images = transform(condition_images).to(device=device, dtype=weight_dtype)
+
+
+ video = pipeline.__call__(
+ prompt=prompt,
+ condition_images=condition_images,
+ condition_images_indices=condition_images_indices,
+ negative_prompt=negative_prompt,
+ clip_features=None,
+ negative_clip_features=None,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ device=device,
+ max_sequence_length=args.max_sequence_length,
+ model_type=model_type,
+ ).images
+
+ video = video[0]
+ start_idx = video_length - overlapping
+ condition_images = video[start_idx:]
+ condition_images_indices = list(range(0, overlapping))
+ video_list.append(video)
+
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ temp_video = [video_list[0]]
+ if v2v_epoch > 1:
+ for i in range(v2v_epoch - 1):
+ temp_video.append(video_list[i + 1][overlapping:])
+
+ video = torch.cat(temp_video, dim=0)
+
+ videos.append(video)
+
+ ext = 'mp4'
+ imageio.mimwrite(
+ os.path.join(save_dir, f'{idx}.{ext}'), video, fps=24, quality=6) # highest quality is 10, lowest is 0
+ current_sample_num += 1
+
+ dist.barrier()
+ # video_grids = torch.stack(videos, dim=0).to(device=device)
+ # shape = list(video_grids.shape)
+ # shape[0] *= world_size
+ # gathered_tensor = torch.zeros(shape, dtype=video_grids.dtype, device=device)
+ # dist.all_gather_into_tensor(gathered_tensor, video_grids.contiguous())
+ # video_grids = gathered_tensor.cpu()
+
+ # if args.rank == 0:
+ # # torchvision.io.write_video(args.save_img_path + '_%04d' % args.run_time + '-.mp4', video_grids, fps=6)
+ # if args.num_frames == 1:
+ # save_image(video_grids / 255.0, os.path.join(args.save_img_path, f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'),
+ # nrow=math.ceil(math.sqrt(len(video_grids))), normalize=True, value_range=(0, 1))
+ # else:
+ # video_grids = save_video_grid(video_grids)
+ # imageio.mimwrite(os.path.join(args.save_img_path, f'{args.sample_method}_gs{args.guidance_scale}_s{args.num_sampling_steps}.{ext}'), video_grids, fps=args.fps, quality=8)
+
+ # print('save path {}'.format(args.save_img_path))
+
+ del pipeline
+ del text_encoder
+ del vae
+ del transformer_model
+ gc.collect()
+ torch.cuda.empty_cache()
+
+def main(args):
+    # seed.txt holds one integer seed per line; each seed drives one full validation sweep
+    with open('seed.txt', 'r') as f:
+        rand_num = [int(i) for i in f.readlines()]
+
+    root_save_img_path = args.save_img_path
+
+    for idx, seed in enumerate(rand_num):
+        dist.barrier()
+        set_seed(seed)
+        args.save_img_path = os.path.join(root_save_img_path, f"{idx:09d}")
+        validation(args)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--version", type=str, default=None, choices=[None, '65x512x512', '65x256x256', '17x256x256'])
+ parser.add_argument("--num_frames", type=int, default=1)
+ parser.add_argument("--height", type=int, default=512)
+ parser.add_argument("--width", type=int, default=512)
+ parser.add_argument("--device", type=str, default='cuda:0')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--ae", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--ae_path", type=str, default='CausalVAEModel_4x8x8')
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--save_img_path", type=str, default="./sample_videos/t2v")
+ parser.add_argument("--guidance_scale", type=float, default=2.5)
+ parser.add_argument("--sample_method", type=str, default="PNDM")
+ parser.add_argument("--max_sequence_length", type=int, default=300)
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument("--fps", type=int, default=24)
+ parser.add_argument("--run_time", type=int, default=0)
+ parser.add_argument("--text_prompt", nargs='+')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.125)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument('--save_memory', action='store_true')
+ parser.add_argument('--model_3d', action='store_true')
+ parser.add_argument('--enable_stable_fp32', action='store_true')
+
+ parser.add_argument("--max_sample_num", type=int, default=8)
+ parser.add_argument("--validation_dir", type=str, default=None)
+ parser.add_argument("--model_type", type=str, default='inpaint_only', choices=['inpaint_only', 'vip_only', 'vip_inpaint'])
+ parser.add_argument("--image_encoder_name", type=str, default='laion/CLIP-ViT-H-14-laion2B-s32B-b79K')
+ parser.add_argument("--pretrained_transformer_model_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--pretrained_vipnet_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--image_encoder_path", type=str, default='LanguageBind/Open-Sora-Plan-v1.0.0')
+ parser.add_argument("--seed", type=int, default=42)
+ args = parser.parse_args()
+
+ if args.seed is not None:
+ set_seed(args.seed)
+
+ args.world_size = world_size = torch.cuda.device_count()
+ args.rank = int(os.environ["LOCAL_RANK"])
+
+ # Initialize the process group
+ dist.init_process_group("nccl", rank=args.rank, world_size=args.world_size)
+
+ main(args)
+
diff --git a/opensora/sample/transport_sample.py b/opensora/sample/transport_sample.py
deleted file mode 100644
index b88b55206..000000000
--- a/opensora/sample/transport_sample.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Sample new images from a pre-trained SiT.
-"""
-import os
-import sys
-
-from opensora.dataset import ae_denorm
-from opensora.models.ae import ae_channel_config, getae, ae_stride_config
-from opensora.models.diffusion import Diffusion_models
-from opensora.models.diffusion.transport import create_transport, Sampler
-from opensora.utils.utils import find_model
-
-import torch
-import argparse
-
-from einops import rearrange
-import imageio
-
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-
-
-
-def main(mode, args):
- # Setup PyTorch:
- # torch.manual_seed(args.seed)
- torch.set_grad_enabled(False)
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- using_cfg = args.cfg_scale > 1.0
-
- # Load model:
- latent_size = args.image_size // ae_stride_config[args.ae][1]
- args.latent_size = latent_size
- model = Diffusion_models[args.model](
- input_size=latent_size,
- num_classes=args.num_classes,
- in_channels=ae_channel_config[args.ae],
- extras=args.extras
- ).to(device)
-
- if args.use_compile:
- model = torch.compile(model)
-
- # a pre-trained model or load a custom Latte checkpoint from train.py:
- ckpt_path = args.ckpt
- state_dict = find_model(ckpt_path)
- model.load_state_dict(state_dict)
-
- model.eval() # important!
- transport = create_transport(
- args.path_type,
- args.prediction,
- args.loss_weight,
- args.train_eps,
- args.sample_eps
- )
- sampler = Sampler(transport)
- if mode == "ODE":
- if args.likelihood:
- assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
- sample_fn = sampler.sample_ode_likelihood(
- sampling_method=args.sampling_method,
- num_steps=args.num_sampling_steps,
- atol=args.atol,
- rtol=args.rtol,
- )
- else:
- sample_fn = sampler.sample_ode(
- sampling_method=args.sampling_method,
- num_steps=args.num_sampling_steps,
- atol=args.atol,
- rtol=args.rtol,
- reverse=args.reverse
- )
- elif mode == "SDE":
- sample_fn = sampler.sample_sde(
- sampling_method=args.sampling_method,
- diffusion_form=args.diffusion_form,
- diffusion_norm=args.diffusion_norm,
- last_step=args.last_step,
- last_step_size=args.last_step_size,
- num_steps=args.num_sampling_steps,
- )
-
- ae = getae(args).to(device)
-
- if args.use_fp16:
- print('WARNING: using half percision for inferencing!')
- ae.to(dtype=torch.float16)
- model.to(dtype=torch.float16)
-
- # Labels to condition the model with (feel free to change):
-
- # Create sampling noise:
- if args.use_fp16:
- z = torch.randn(1, args.num_frames // ae_stride_config[args.ae][0], model.in_channels, latent_size, latent_size, dtype=torch.float16, device=device) # b c f h w
- else:
- z = torch.randn(1, args.num_frames // ae_stride_config[args.ae][0], model.in_channels, latent_size, latent_size, device=device)
-
- # Setup classifier-free guidance:
- if using_cfg:
- z = torch.cat([z, z], 0)
- y = torch.randint(0, args.num_classes, (1,), device=device)
- y_null = torch.tensor([args.num_classes] * 1, device=device)
- y = torch.cat([y, y_null], dim=0)
- model_kwargs = dict(y=y, cfg_scale=args.cfg_scale, use_fp16=args.use_fp16)
- forward_fn = model.forward_with_cfg
- else:
- forward_fn = model.forward
- model_kwargs = dict(y=None, use_fp16=args.use_fp16)
-
- # Sample images:
- samples = sample_fn(z, forward_fn, **model_kwargs)[-1]
-
- if args.use_fp16:
- samples = samples.to(dtype=torch.float16)
- samples = ae.decode(samples)
-
- # Save and display images:
- if not os.path.exists(args.save_video_path):
- os.makedirs(args.save_video_path)
-
-
- video_ = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1).contiguous()
- video_save_path = os.path.join(args.save_video_path, 'sample' + '.mp4')
- print(video_save_path)
- imageio.mimwrite(video_save_path, video_, fps=args.fps, quality=9)
- print('save path {}'.format(args.save_video_path))
-
-
-def none_or_str(value):
- if value == 'None':
- return None
- return value
-
-def parse_transport_args(parser):
- group = parser.add_argument_group("Transport arguments")
- group.add_argument("--path-type", type=str, default="Linear", choices=["Linear", "GVP", "VP"])
- group.add_argument("--prediction", type=str, default="velocity", choices=["velocity", "score", "noise"])
- group.add_argument("--loss-weight", type=none_or_str, default=None, choices=[None, "velocity", "likelihood"])
- group.add_argument("--sample-eps", type=float)
- group.add_argument("--train-eps", type=float)
-
-def parse_ode_args(parser):
- group = parser.add_argument_group("ODE arguments")
- group.add_argument("--sampling-method", type=str, default="dopri5", help="blackbox ODE solver methods; for full list check https://github.com/rtqichen/torchdiffeq")
- group.add_argument("--atol", type=float, default=1e-6, help="Absolute tolerance")
- group.add_argument("--rtol", type=float, default=1e-3, help="Relative tolerance")
- group.add_argument("--reverse", action="store_true")
- group.add_argument("--likelihood", action="store_true")
-
-def parse_sde_args(parser):
- group = parser.add_argument_group("SDE arguments")
- group.add_argument("--sampling-method", type=str, default="Euler", choices=["Euler", "Heun"])
- group.add_argument("--diffusion-form", type=str, default="sigma", \
- choices=["constant", "SBDM", "sigma", "linear", "decreasing", "increasing-decreasing"],\
- help="form of diffusion coefficient in the SDE")
- group.add_argument("--diffusion-norm", type=float, default=1.0)
- group.add_argument("--last-step", type=none_or_str, default="Mean", choices=[None, "Mean", "Tweedie", "Euler"],\
- help="form of last step taken in the SDE")
- group.add_argument("--last-step-size", type=float, default=0.04, \
- help="size of the last step taken")
-
-if __name__ == "__main__":
- if len(sys.argv) < 2:
- print("Usage: program.py [options]")
- sys.exit(1)
-
- mode = sys.argv[1]
-
- assert mode[:2] != "--", "Usage: program.py [options]"
- assert mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"
-
- parser = argparse.ArgumentParser()
- parser.add_argument("--ckpt", type=str, default="")
- parser.add_argument("--model", type=str, default='Latte-XL/122')
- parser.add_argument("--ae", type=str, default='stabilityai/sd-vae-ft-mse')
- parser.add_argument("--save-video-path", type=str, default="./sample_videos/")
- parser.add_argument("--fps", type=int, default=10)
- parser.add_argument("--num-classes", type=int, default=101)
- parser.add_argument("--num-frames", type=int, default=16)
- parser.add_argument("--image-size", type=int, default=256, choices=[256, 512])
- parser.add_argument("--extras", type=int, default=1)
- parser.add_argument("--num-sampling-steps", type=int, default=250)
- parser.add_argument("--cfg-scale", type=float, default=1.0)
- parser.add_argument("--use-fp16", action="store_true")
- parser.add_argument("--use-compile", action="store_true")
- parser.add_argument("--sample-method", type=str, default='ddpm')
-
- parse_transport_args(parser)
- if mode == "ODE":
- parse_ode_args(parser)
- # Further processing for ODE
- elif mode == "SDE":
- parse_sde_args(parser)
- # Further processing for SDE
-
- args = parser.parse_known_args()[0]
- main(mode, args)
diff --git a/opensora/serve/gradio_utils.py b/opensora/serve/gradio_utils.py
index 1935fe9e9..7367dec36 100644
--- a/opensora/serve/gradio_utils.py
+++ b/opensora/serve/gradio_utils.py
@@ -1,65 +1,89 @@
import random
+import imageio
+import uuid
import torch
+import numpy as np
-def set_env(seed=0):
- torch.manual_seed(seed)
- torch.set_grad_enabled(False)
+MAX_SEED = np.iinfo(np.int32).max
+DESCRIPTION = """
+
+
+ # Open-Sora Plan 93x720p
+ #### [Open-Sora Plan 93x720p](https://github.com/PKU-YuanGroup/Open-Sora-Plan) is a transformer-based text-to-video diffusion system trained on text embeddings from mT5. This demo uses the []() checkpoint.
+ #### Multilingual prompts SUPPORT; 支持多语言
+ #### Welcome to Star🌟 our [GitHub](https://github.com/PKU-YuanGroup/Open-Sora-Plan)
+ ### You may change frame to 29 if you're not satisfied with the speed, as generating short video is significantly faster than long one.
+ """
+NEG_PROMPT = """
+ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
+ low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
+ """
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
- if randomize_seed:
- seed = random.randint(0, 203279)
- return seed
-title_markdown = ("""
-
-
-
-""")
-DESCRIPTION = """
-# Open-Sora-Plan v1.0.0
-## If Open-Sora-Plan is helpful, please help to ✨ the [Github Repo](https://github.com/PKU-YuanGroup/Open-Sora-Plan) and recommend it to your friends 😊'
-#### [Open-Sora-Plan v1.0.0](https://github.com/PKU-YuanGroup/Open-Sora-Plan) is a transformer-based text-to-video diffusion system trained on text embeddings from T5.
-#### This demo is only trained on 40k videos, when creating videos, please be aware that it has the potential to generate harmful videos. For more details read our [report]().
-#### Image generation is typically 50 steps, video generation maybe 150 steps will yield good results, but this may take 3-4 minutes.
-#### Feel free to enjoy the examples.
-#### English prompts ONLY; 提示词仅限英文
-####
-"""
+style_list = [
+ {
+ "name": "(Default)",
+ "prompt": "(masterpiece), (best quality), (ultra-detailed), (unwatermarked), {prompt}",
+ "negative_prompt": NEG_PROMPT,
+ },
+ {
+ "name": "Cinematic",
+ "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
+ "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured. ",
+ },
+ {
+ "name": "Photographic",
+ "prompt": "cinematic photo, a close-up of {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
+ "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly. ",
+ },
+ {
+ "name": "Anime",
+ "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
+ "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast. ",
+ },
+ {
+ "name": "Manga",
+ "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
+ "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style. ",
+ },
+ {
+ "name": "Digital Art",
+ "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
+ "negative_prompt": "photo, photorealistic, realism, ugly. ",
+ },
+ {
+ "name": "Pixel art",
+ "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
+ "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic. ",
+ },
+ {
+ "name": "Fantasy art",
+ "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
+ "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white. ",
+ },
+ {
+ "name": "Neonpunk",
+ "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
+ "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured. ",
+ },
+ {
+ "name": "3D Model",
+ "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
+ "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting. ",
+ },
+]
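+# Each style is a prompt template: "{prompt}" is replaced with the user's prompt, and the
+# style's negative prompt is combined with the user's (see apply_style in gradio_web_server.py).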
-#
-#
-#
-#
-#
-#
-#
-# """)
-block_css = """
-#buttons button {
- min-width: min(120px,100%);
-}
-"""
+def save_video(video):
+ unique_name = str(uuid.uuid4()) + ".mp4"
+ imageio.mimwrite(unique_name, video, fps=23, quality=6)
+ return unique_name
-examples = [
- ["A quiet beach at dawn, the waves gently lapping at the shore and the sky painted in pastel hues.", 50, 10.0],
- ["A quiet beach at dawn, the waves softly lapping at the shore, pink and orange hues painting the sky, offering a moment of solitude and reflection.", 50, 10.0],
- ["The majestic beauty of a waterfall cascading down a cliff into a serene lake.", 50, 10.0],
- ["Sunset over the sea.", 50, 10.0],
- ["a cat wearing sunglasses and working as a lifeguard at pool.", 50, 10.0],
- ["Slow pan upward of blazing oak fire in an indoor fireplace.", 50, 10.0],
- ["Yellow and black tropical fish dart through the sea.", 50, 10.0],
- ["a serene winter scene in a forest. The forest is blanketed in a thick layer of snow, which has settled on the branches of the trees, creating a canopy of white. The trees, a mix of evergreens and deciduous, stand tall and silent, their forms partially obscured by the snow. The ground is a uniform white, with no visible tracks or signs of human activity. The sun is low in the sky, casting a warm glow that contrasts with the cool tones of the snow. The light filters through the trees, creating a soft, diffused illumination that highlights the texture of the snow and the contours of the trees. The overall style of the scene is naturalistic, with a focus on the tranquility and beauty of the winter landscape.", 50, 10.0],
- ["a dynamic interaction between the ocean and a large rock. The rock, with its rough texture and jagged edges, is partially submerged in the water, suggesting it is a natural feature of the coastline. The water around the rock is in motion, with white foam and waves crashing against the rock, indicating the force of the ocean's movement. The background is a vast expanse of the ocean, with small ripples and waves, suggesting a moderate sea state. The overall style of the scene is a realistic depiction of a natural landscape, with a focus on the interplay between the rock and the water.", 50, 10.0],
- ["A serene waterfall cascading down moss-covered rocks, its soothing sound creating a harmonious symphony with nature.", 50, 10.0],
- ["A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures.", 50, 10.0],
- ["The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.", 50, 10.0],
- ["A vibrant scene of a snowy mountain landscape. The sky is filled with a multitude of colorful hot air balloons, each floating at different heights, creating a dynamic and lively atmosphere. The balloons are scattered across the sky, some closer to the viewer, others further away, adding depth to the scene. Below, the mountainous terrain is blanketed in a thick layer of snow, with a few patches of bare earth visible here and there. The snow-covered mountains provide a stark contrast to the colorful balloons, enhancing the visual appeal of the scene.", 50, 10.0],
- ["A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene.", 50, 10.0],
- ["A snowy forest landscape with a dirt road running through it. The road is flanked by trees covered in snow, and the ground is also covered in snow. The sun is shining, creating a bright and serene atmosphere. The road appears to be empty, and there are no people or animals visible in the video. The style of the video is a natural landscape shot, with a focus on the beauty of the snowy forest and the peacefulness of the road.", 50, 10.0],
- ["The dynamic movement of tall, wispy grasses swaying in the wind. The sky above is filled with clouds, creating a dramatic backdrop. The sunlight pierces through the clouds, casting a warm glow on the scene. The grasses are a mix of green and brown, indicating a change in seasons. The overall style of the video is naturalistic, capturing the beauty of the landscape in a realistic manner. The focus is on the grasses and their movement, with the sky serving as a secondary element. The video does not contain any human or animal elements.", 50, 10.0],
-]
\ No newline at end of file
+def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+ if randomize_seed:
+ seed = random.randint(0, MAX_SEED)
+ return seed
diff --git a/opensora/serve/gradio_web_server.py b/opensora/serve/gradio_web_server.py
index 99dbeff0b..1c932c8fa 100644
--- a/opensora/serve/gradio_web_server.py
+++ b/opensora/serve/gradio_web_server.py
@@ -1,124 +1,292 @@
-
-
+#!/usr/bin/env python
+from __future__ import annotations
import argparse
-import sys
import os
-import random
+import sys
+import gradio as gr
+from diffusers import ConsistencyDecoderVAE, DPMSolverMultistepScheduler, Transformer2DModel, AutoencoderKL, SASolverScheduler
-import imageio
import torch
-from diffusers import PNDMScheduler
-from huggingface_hub import hf_hub_download
-from torchvision.utils import save_image
-from diffusers.models import AutoencoderKL
+from typing import Tuple
from datetime import datetime
-from typing import List, Union
-import gradio as gr
-import numpy as np
-from gradio.components import Textbox, Video, Image
-from transformers import T5Tokenizer, T5EncoderModel
-
+from peft import PeftModel
+# import spaces
from opensora.models.ae import ae_stride_config, getae, getae_wrapper
-from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
-from opensora.models.diffusion.latte.modeling_latte import LatteT2V
-from opensora.sample.pipeline_videogen import VideoGenPipeline
-from opensora.serve.gradio_utils import block_css, title_markdown, randomize_seed_fn, set_env, examples, DESCRIPTION
+from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, MT5EncoderModel
+from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
+from opensora.models.diffusion.opensora.modeling_opensora import OpenSoraT2V
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+from opensora.serve.gradio_utils import DESCRIPTION, MAX_SEED, style_list, randomize_seed_fn, save_video
+if not torch.cuda.is_available():
+    DESCRIPTION += "\nRunning on CPU 🥶 This demo does not work on CPU."
-@torch.inference_mode()
-def generate_img(prompt, sample_steps, scale, seed=0, randomize_seed=False, force_images=False):
- seed = int(randomize_seed_fn(seed, randomize_seed))
- set_env(seed)
- video_length = transformer_model.config.video_length if not force_images else 1
- height, width = int(args.version.split('x')[1]), int(args.version.split('x')[2])
- num_frames = 1 if video_length == 1 else int(args.version.split('x')[0])
- videos = videogen_pipeline(prompt,
- video_length=video_length,
- height=height,
- width=width,
- num_inference_steps=sample_steps,
- guidance_scale=scale,
- enable_temporal_attentions=not force_images,
- num_images_per_prompt=1,
- mask_feature=True,
- ).video
-
- torch.cuda.empty_cache()
- videos = videos[0]
- tmp_save_path = 'tmp.mp4'
- imageio.mimwrite(tmp_save_path, videos, fps=24, quality=9) # highest quality is 10, lowest is 0
- display_model_info = f"Video size: {num_frames}×{height}×{width}, \nSampling Step: {sample_steps}, \nGuidance Scale: {scale}"
- return tmp_save_path, prompt, display_model_info, seed
-
-if __name__ == '__main__':
- args = type('args', (), {
- 'ae': 'CausalVAEModel_4x8x8',
- 'force_images': False,
- 'model_path': 'LanguageBind/Open-Sora-Plan-v1.0.0',
- 'text_encoder_name': 'DeepFloyd/t5-v1_1-xxl',
- 'version': '65x512x512'
- })
- device = torch.device('cuda:0')
-
- # Load model:
- transformer_model = LatteT2V.from_pretrained(args.model_path, subfolder=args.version, torch_dtype=torch.float16, cache_dir='cache_dir').to(device)
-
- vae = getae_wrapper(args.ae)(args.model_path, subfolder="vae", cache_dir='cache_dir').to(device)
- vae = vae.half()
+CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
+CACHE_EXAMPLES = False  # example caching is force-disabled for this demo, overriding the env var above
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "6000"))
+MAX_VIDEO_FRAME = int(os.getenv("MAX_VIDEO_FRAME", "93"))
+SPEED_UP_T5 = os.getenv("SPEED_UP_T5", "0") == "1"
+USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
+ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+PORT = int(os.getenv("DEMO_PORT", "15432"))
+
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
+STYLE_NAMES = list(styles.keys())
+DEFAULT_STYLE_NAME = "(Default)"
+SCHEDULE_NAME = [
+ "PNDM-Solver", "EulerA-Solver", "DPM-Solver", "SA-Solver",
+ "DDIM-Solver", "Euler-Solver", "DDPM-Solver", "DEISM-Solver"]
+DEFAULT_SCHEDULE_NAME = "PNDM-Solver"
+NUM_IMAGES_PER_PROMPT = 1
+
+def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
+ p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
+ if not negative:
+ negative = ""
+ return p.replace("{prompt}", positive), n + negative
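+
+# Illustrative example (the actual templates live in style_list): if a style's
+# prompt template were "cinematic still {prompt}, film grain", then
+#   apply_style(style, "a red fox") ->
+#   ("cinematic still a red fox, film grain", style_negatives + user_negatives)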
+
+if torch.cuda.is_available():
+ weight_dtype = torch.bfloat16
+ T5_token_max_length = 512
+
+ vae = getae_wrapper('CausalVAEModel_4x8x8')("/storage/dataset/test140k")
+ vae.vae = vae.vae.to(device=device, dtype=weight_dtype)
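+    # NOTE: the checkpoint paths above and below are machine-specific. Tiling
+    # splits VAE encode/decode into overlapping spatial (and temporal) tiles so
+    # long, high-resolution videos fit in GPU memory; tile_overlap_factor
+    # controls how much neighboring tiles are blended to hide seams.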
vae.vae.enable_tiling()
- image_size = int(args.version.split('x')[1])
- latent_size = (image_size // ae_stride_config[args.ae][1], image_size // ae_stride_config[args.ae][2])
- vae.latent_size = latent_size
- transformer_model.force_images = args.force_images
- tokenizer = T5Tokenizer.from_pretrained(args.text_encoder_name, cache_dir="cache_dir")
- text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_name, cache_dir="cache_dir",
- torch_dtype=torch.float16).to(device)
-
- # set eval mode
- transformer_model.eval()
- vae.eval()
- text_encoder.eval()
+ vae.vae.tile_overlap_factor = 0.125
+ vae.vae.tile_sample_min_size = 256
+ vae.vae.tile_latent_min_size = 32
+ vae.vae.tile_sample_min_size_t = 29
+ vae.vae.tile_latent_min_size_t = 8
+ vae.vae_scale_factor = ae_stride_config['CausalVAEModel_4x8x8']
+
+ text_encoder = MT5EncoderModel.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl",
+ low_cpu_mem_usage=True, torch_dtype=weight_dtype)
+ tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl")
+ transformer = OpenSoraT2V.from_pretrained("/storage/dataset/hw29/model_ema", low_cpu_mem_usage=False,
+ device_map=None, torch_dtype=weight_dtype)
scheduler = PNDMScheduler()
- videogen_pipeline = VideoGenPipeline(vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- scheduler=scheduler,
- transformer=transformer_model).to(device)
-
-
- demo = gr.Interface(
- fn=generate_img,
- inputs=[Textbox(label="",
- placeholder="Please enter your prompt. \n"),
- gr.Slider(
- label='Sample Steps',
+ pipe = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=transformer)
+ pipe.to(device)
+ print("Loaded on Device!")
+
+ # speed-up T5
+ if SPEED_UP_T5:
+ pipe.text_encoder.to_bettertransformer()
+
+ if USE_TORCH_COMPILE:
+ pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
+ print("Model Compiled!")
+
+# @spaces.GPU(duration=120)
+@torch.inference_mode()
+def generate(
+ prompt: str,
+ negative_prompt: str = "",
+ style: str = DEFAULT_STYLE_NAME,
+ use_negative_prompt: bool = False,
+ seed: int = 0,
+ frame: int = 29,
+ schedule: str = 'DPM-Solver',
+ guidance_scale: float = 4.5,
+ num_inference_steps: int = 25,
+ randomize_seed: bool = False,
+ progress=gr.Progress(track_tqdm=True),
+):
+ seed = int(randomize_seed_fn(seed, randomize_seed))
+ generator = torch.Generator().manual_seed(seed)
+
+ if schedule == 'DPM-Solver':
+ if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler):
+ pipe.scheduler = DPMSolverMultistepScheduler()
+ elif schedule == "PNDM-Solver":
+ if not isinstance(pipe.scheduler, PNDMScheduler):
+ pipe.scheduler = PNDMScheduler()
+ elif schedule == "DDIM-Solver":
+ if not isinstance(pipe.scheduler, DDIMScheduler):
+ pipe.scheduler = DDIMScheduler()
+ elif schedule == "Euler-Solver":
+ if not isinstance(pipe.scheduler, EulerDiscreteScheduler):
+ pipe.scheduler = EulerDiscreteScheduler()
+ elif schedule == "DDPM-Solver":
+ if not isinstance(pipe.scheduler, DDPMScheduler):
+ pipe.scheduler = DDPMScheduler()
+ elif schedule == "EulerA-Solver":
+ if not isinstance(pipe.scheduler, EulerAncestralDiscreteScheduler):
+ pipe.scheduler = EulerAncestralDiscreteScheduler()
+ elif schedule == "DEISM-Solver":
+ if not isinstance(pipe.scheduler, DEISMultistepScheduler):
+ pipe.scheduler = DEISMultistepScheduler()
+ elif schedule == "SA-Solver":
+ if not isinstance(pipe.scheduler, SASolverScheduler):
+ pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction', tau_func=lambda t: 1 if 200 <= t <= 800 else 0, predictor_order=2, corrector_order=2)
+ else:
+ raise ValueError(f"Unknown schedule: {schedule}")
+
+ if not use_negative_prompt:
+ negative_prompt = None # type: ignore
+ prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
+ print(prompt, negative_prompt)
+ videos = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ num_frames=frame,
+ # width=1280,
+ # height=720,
+ width=640,
+ height=480,
+ guidance_scale=guidance_scale,
+ num_inference_steps=num_inference_steps,
+ generator=generator,
+ num_images_per_prompt=1, # num_imgs
+ max_sequence_length=T5_token_max_length,
+ ).images
+
+ video_paths = [save_video(vid) for vid in videos]
+ print(video_paths)
+ return video_paths[0], seed
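+
+# Minimal illustrative call (assumes the models above loaded; the arguments
+# mirror the Gradio controls):
+#   video_path, used_seed = generate("a red fox in the snow", seed=42, randomize_seed=False)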
+
+
+examples = [
+ "A small cactus with a happy face in the Sahara desert.",
+ "Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture.",
+ "3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.",
+ "Color photo of a corgi made of transparent glass, standing on the riverside in Yosemite National Park.",
+ "A close-up photo of a person. The subject is a woman. She wore a blue coat with a gray dress underneath. She has blue eyes and blond hair, and wears a pair of earrings. Behind are blurred city buildings and streets.",
+ "A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.",
+ "a handsome young boy in the middle with sky color background wearing eye glasses, it's super detailed with anime style, it's a portrait with delicated eyes and nice looking face",
+ "an astronaut sitting in a diner, eating fries, cinematic, analog film",
+ "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
+ "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
+ "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
+]
+
+with gr.Blocks(css="style.css") as demo:
+ gr.Markdown(DESCRIPTION)
+ gr.DuplicateButton(
+ value="Duplicate Space for private use",
+ elem_id="duplicate-button",
+ visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+ )
+ with gr.Row(equal_height=False):
+ with gr.Group():
+ with gr.Row():
+ use_negative_prompt = gr.Checkbox(label="Use additional negative prompt", value=False, visible=True)
+ negative_prompt = gr.Text(
+ label="Negative prompt",
+ max_lines=1,
+                    placeholder="Enter an additional negative prompt",
+ visible=True,
+ )
+ with gr.Row(visible=True):
+ schedule = gr.Radio(
+ show_label=True,
+ container=True,
+ interactive=True,
+ choices=SCHEDULE_NAME,
+ value=DEFAULT_SCHEDULE_NAME,
+ label="Sampler Schedule",
+ visible=True,
+ )
+ style_selection = gr.Radio(
+ show_label=True,
+ container=True,
+ interactive=True,
+ choices=STYLE_NAMES,
+ value=DEFAULT_STYLE_NAME,
+ label="Video Style",
+ )
+ seed = gr.Slider(
+ label="Seed",
+ minimum=0,
+ maximum=MAX_SEED,
+ step=1,
+ value=0,
+ )
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+ with gr.Row(visible=True):
+ frame = gr.Slider(
+ label="Frame",
+ minimum=29,
+ maximum=MAX_VIDEO_FRAME,
+ step=16,
+ value=29,
+ )
+ with gr.Row():
+ guidance_scale = gr.Slider(
+ label="Guidance scale",
minimum=1,
- maximum=500,
- value=50,
- step=10
- ),
- gr.Slider(
- label='Guidance Scale',
- minimum=0.1,
- maximum=30.0,
- value=10.0,
- step=0.1
- ),
- gr.Slider(
- label="Seed",
- minimum=0,
- maximum=203279,
+ maximum=10,
+ step=0.1,
+ value=5.0,
+ )
+ inference_steps = gr.Slider(
+ label="inference steps",
+ minimum=10,
+ maximum=200,
step=1,
- value=0,
- ),
- gr.Checkbox(label="Randomize seed", value=True),
- gr.Checkbox(label="Generate image (1 frame video)", value=False),
- ],
- outputs=[Video(label="Vid", width=512, height=512),
- Textbox(label="input prompt"),
- Textbox(label="model info"),
- gr.Slider(label='seed')],
- title=title_markdown, description=DESCRIPTION, theme=gr.themes.Default(), css=block_css,
+ value=50,
+ )
+ with gr.Group():
+ with gr.Row():
+ prompt = gr.Text(
+ label="Prompt",
+ show_label=False,
+ max_lines=1,
+ placeholder="Enter your prompt",
+ container=False,
+ )
+ run_button = gr.Button("Run", scale=0)
+ result = gr.Video(label="Result")
+
+ gr.Examples(
examples=examples,
+ inputs=prompt,
+ outputs=[result, seed],
+ fn=generate,
+ cache_examples=CACHE_EXAMPLES,
)
- demo.launch()
\ No newline at end of file
+
+ use_negative_prompt.change(
+ fn=lambda x: gr.update(visible=x),
+ inputs=use_negative_prompt,
+ outputs=negative_prompt,
+ api_name=False,
+ )
+
+ gr.on(
+ triggers=[
+ prompt.submit,
+ negative_prompt.submit,
+ run_button.click,
+ ],
+ fn=generate,
+ inputs=[
+ prompt,
+ negative_prompt,
+ style_selection,
+ use_negative_prompt,
+ seed,
+ frame,
+ schedule,
+ guidance_scale,
+ inference_steps,
+ randomize_seed,
+ ],
+ outputs=[result, seed],
+ api_name="run",
+ )
+
+if __name__ == "__main__":
+ # demo.queue(max_size=20).launch(server_name='0.0.0.0', share=True)
+ demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=11900, debug=True)
\ No newline at end of file
diff --git a/opensora/serve/style.css b/opensora/serve/style.css
new file mode 100644
index 000000000..73054b14b
--- /dev/null
+++ b/opensora/serve/style.css
@@ -0,0 +1 @@
+.gradio-container{width:1280px!important}
\ No newline at end of file
diff --git a/opensora/train/train_inpaint.py b/opensora/train/train_inpaint.py
new file mode 100644
index 000000000..596d73cf9
--- /dev/null
+++ b/opensora/train/train_inpaint.py
@@ -0,0 +1,1063 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+A minimal training script for DiT using PyTorch DDP.
+"""
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+from typing import Optional
+import gc
+import numpy as np
+from einops import rearrange
+import torch.utils
+import torch.utils.data
+from tqdm import tqdm
+import torch.multiprocessing as mp
+import dill
+
+
+from opensora.adaptor.modules import replace_with_fp32_forwards
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+ from opensora.acceleration.parallel_states import initialize_sequence_parallel_state, \
+ destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+ from opensora.acceleration.communications import prepare_parallel_data, broadcast
+except Exception:  # torch_npu unavailable; fall back to the GPU utilities
+    torch_npu = None
+    npu_config = None
+    from opensora.utils.parallel_states import initialize_sequence_parallel_state, \
+        destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+    from opensora.utils.communications import prepare_parallel_data, broadcast
+import time
+# import cv2
+from ultralytics import YOLO
+
+from dataclasses import field, dataclass
+from torch.utils.data import DataLoader
+from copy import deepcopy
+import accelerate
+import torch
+from torch.nn import functional as F
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from packaging import version
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import DDPMScheduler, PNDMScheduler, DPMSolverMultistepScheduler
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+# from torchvision import transforms
+
+from opensora.models.causalvideovae import ae_stride_config, ae_channel_config
+from opensora.models.causalvideovae import ae_norm, ae_denorm
+from opensora.models.text_encoder import get_text_enc, get_text_warpper
+from opensora.dataset import getdataset
+from opensora.models.diffusion import Diffusion_models, Diffusion_models_class
+from opensora.utils.dataset_utils import Collate, LengthGroupedSampler
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+from opensora.models.causalvideovae import ae_stride_config, ae_wrapper
+from opensora.utils.ema import EMAModel
+# from opensora.sample.pipeline_inpaint import OpenSoraInpaintPipeline
+# from opensora.dataset.transform import CenterCropResizeVideo
+
+# Will raise an error if the minimal version of diffusers is not installed. Remove at your own risk.
+
+
+
+check_min_version("0.24.0")
+logger = get_logger(__name__)
+
+# def get_video(video_path):
+# cap = cv2.VideoCapture(video_path)
+# frames = []
+# while True:
+# ret, frame = cap.read()
+# if not ret:
+#             break  # end of the video stream
+
+#         # frame is an (H, W, C) numpy array representing one frame
+#         frames.append(frame)
+
+#     # release the video file
+#     cap.release()
+
+#     # convert the frame list into a numpy array of shape (T, H, W, C)
+# video_numpy = np.array(frames)
+
+# return video_numpy
+
+class ProgressInfo:
+ def __init__(self, global_step, train_loss=0.0):
+ self.global_step = global_step
+ self.train_loss = train_loss
+
+
+# def log_validation(args, model, vae, text_encoder, tokenizer, accelerator, weight_dtype, global_step, motion_score, ema=False):
+# positive_prompt = "(masterpiece), (best quality), (ultra-detailed), {}. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous"
+# negative_prompt = """nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry,
+# """
+
+
+# input_prompts = ["A man is riding a horse"]
+# videos_path = ["/home/image_data/hxy/data/video/output_video.mp4"]
+# masks_path = ["/home/image_data/hxy/data/video/output_mask.mp4"]
+# model = accelerator.unwrap_model(model)
+# scheduler = DPMSolverMultistepScheduler()
+
+# inpaint_pipeline = OpenSoraInpaintPipeline(
+# vae=vae,
+# text_encoder=text_encoder,
+# tokenizer=tokenizer,
+# scheduler=scheduler,
+# transformer=model
+# ).to(device=accelerator.device)
+
+# resize = [CenterCropResizeVideo((args.height, args.width)), ]
+# resize_transform = transforms.Compose([*resize])
+
+# inference_videos = []
+
+
+# for index, (prompt, video_path, mask_path) in enumerate(zip(input_prompts, videos_path, masks_path)):
+# video = get_video((video_path))
+# mask = get_video(mask_path)
+
+# video_tensor = resize_transform(torch.from_numpy(video.transpose(0,3,1,2)))
+# mask_tensor = resize_transform(torch.from_numpy(mask.transpose(0,3,1,2)))
+
+# video_resize = video_tensor.numpy()
+# mask_resize = mask_tensor.numpy()
+
+
+# inference_video = inpaint_pipeline(
+# video = video_resize,
+# masks = mask_resize,
+# prompt=prompt,
+# negative_prompt=negative_prompt,
+# num_frames=args.num_frames,
+# height=args.height,
+# width=args.width,
+# motion_score=motion_score,
+# num_inference_steps=args.num_sampling_steps,
+# guidance_scale=args.guidance_scale,
+# num_images_per_prompt=1,
+# mask_feature=True,
+# device=accelerator.device,
+# max_sequence_length=args.max_sequence_length,
+# ).images
+# inference_videos.append(inference_video[0])
+# videos = torch.stack(inference_videos).numpy()
+# videos = rearrange(videos, 'b t h w c -> b t c h w')
+
+# for tracker in accelerator.trackers:
+# if tracker.name == "wandb":
+# import wandb
+# if videos.shape[1] == 1:
+# images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+# logs = {
+# f"{'ema_' if ema else ''}validation": [
+# wandb.Image(image, caption=f"{i}: {prompt}")
+# for i, (image, prompt) in enumerate(zip(images, input_prompts))
+# ]
+# }
+# else:
+# logs = {
+# f"{'ema_' if ema else ''}validation": [
+# wandb.Video(video, caption=f"{i}: {prompt}", fps=24)
+# for i, (video, prompt) in enumerate(zip(videos, input_prompts))
+# ]
+# }
+# tracker.log(logs, step=global_step)
+
+
+#################################################################################
+# Training Loop #
+#################################################################################
+
+def main(args):
+ logging_dir = Path(args.output_dir, args.logging_dir)
+
+ # use LayerNorm, GeLu, SiLu always as fp32 mode
+ if args.enable_stable_fp32:
+ replace_with_fp32_forwards()
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(args)
+ npu_config.seed_everything(args.seed)
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+ accelerator = Accelerator(
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
+ mixed_precision=args.mixed_precision,
+ log_with=args.report_to,
+ project_config=accelerator_project_config,
+ )
+
+ if args.num_frames != 1 and args.use_image_num == 0:
+ initialize_sequence_parallel_state(args.sp_size)
+
+ if args.report_to == "wandb":
+ if not is_wandb_available():
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+ # Make one log on every process with the configuration for debugging.
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+ transformers.utils.logging.set_verbosity_warning()
+ diffusers.utils.logging.set_verbosity_info()
+ else:
+ transformers.utils.logging.set_verbosity_error()
+ diffusers.utils.logging.set_verbosity_error()
+
+ # If passed along, set the training seed now.
+ if args.seed is not None:
+ set_seed(args.seed, device_specific=True)
+
+ # Handle the repository creation
+ if accelerator.is_main_process:
+ if args.output_dir is not None:
+ os.makedirs(args.output_dir, exist_ok=True)
+
+    # For mixed precision training we cast all non-trainable weights to half-precision
+ # as these weights are only used for inference, keeping weights in full precision is not required.
+ weight_dtype = torch.float32
+ if accelerator.mixed_precision == "fp16":
+ weight_dtype = torch.float16
+ elif accelerator.mixed_precision == "bf16":
+ weight_dtype = torch.bfloat16
+
+ # Create model:
+ kwargs = {}
+ ae = ae_wrapper[args.ae](args.ae_path, cache_dir=args.cache_dir, **kwargs).eval()
+ if args.enable_tiling:
+ ae.vae.enable_tiling()
+ ae.vae.tile_overlap_factor = args.tile_overlap_factor
+
+ kwargs = {'load_in_8bit': args.enable_8bit_t5, 'torch_dtype': weight_dtype, 'low_cpu_mem_usage': True}
+ text_enc = get_text_warpper(args.text_encoder_name)(args, **kwargs).eval()
+
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ ae.vae_scale_factor = (ae_stride_t, ae_stride_h, ae_stride_w)
+ assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
+ args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
+ args.ae_stride = args.ae_stride_h
+ patch_size = args.model[-3:]
+ patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
+ args.patch_size = patch_size_h
+ args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
+ assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
+ # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
+ assert args.max_height % ae_stride_h == 0, f"Height must be divisible by ae_stride_h, but found Height ({args.max_height}), ae_stride_h ({ae_stride_h})."
+ assert args.max_width % ae_stride_h == 0, f"Width size must be divisible by ae_stride_h, but found Width ({args.max_width}), ae_stride_h ({ae_stride_h})."
+
+ args.stride_t = ae_stride_t * patch_size_t
+ args.stride = ae_stride_h * patch_size_h
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ ae.latent_size = latent_size
+
+ if args.num_frames % 2 == 1:
+ args.latent_size_t = latent_size_t = (args.num_frames - 1) // ae_stride_t + 1
+ else:
+        args.latent_size_t = latent_size_t = args.num_frames // ae_stride_t
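+    # e.g. a causal VAE with ae_stride_t=4 maps num_frames=29 to
+    # latent_size_t = (29 - 1) // 4 + 1 = 8 latent frames.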
+
+ model_kwargs = {'vae_scale_factor_t': ae_stride_t}
+
+ model = Diffusion_models[args.model](
+ in_channels=ae_channel_config[args.ae],
+ out_channels=ae_channel_config[args.ae],
+ # caption_channels=4096,
+ # cross_attention_dim=1152,
+ attention_bias=True,
+ sample_size=latent_size,
+ sample_size_t=latent_size_t,
+ num_vector_embeds=None,
+ activation_fn="gelu-approximate",
+ num_embeds_ada_norm=1000,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ double_self_attention=False,
+ upcast_attention=False,
+ # norm_type="ada_norm_single",
+ norm_elementwise_affine=False,
+ norm_eps=1e-6,
+ attention_type='default',
+ attention_mode=args.attention_mode,
+ interpolation_scale_h=args.interpolation_scale_h,
+ interpolation_scale_w=args.interpolation_scale_w,
+ interpolation_scale_t=args.interpolation_scale_t,
+ downsampler=args.downsampler,
+ # compress_kv_factor=args.compress_kv_factor,
+ use_rope=args.use_rope,
+ # model_max_length=args.model_max_length,
+ use_stable_fp32=args.enable_stable_fp32,
+ sparse1d=args.sparse1d,
+ sparse2d=args.sparse2d,
+ sparse_n=args.sparse_n,
+ use_motion=args.use_motion,
+ **model_kwargs,
+ )
+ model.gradient_checkpointing = args.gradient_checkpointing
+
+ pretrained_transformer_model_path = args.pretrained_transformer_model_path
+ pretrained_model_path = dict(transformer_model=pretrained_transformer_model_path)
+ if pretrained_transformer_model_path is not None:
+ model.custom_load_state_dict(pretrained_model_path)
+
+ # Freeze main models
+ ae.vae.requires_grad_(False)
+ text_enc.requires_grad_(False)
+ # Set model as trainable.
+ model.train()
+
+ ae.vae.tile_sample_min_size = args.tile_sample_min_size
+ ae.vae.tile_sample_min_size_t = args.tile_sample_min_size_t
+
+ noise_scheduler = DDPMScheduler(rescale_betas_zero_snr=args.rescale_betas_zero_snr)
+    # Move the VAE and text encoder to the device and cast to weight_dtype.
+ # The VAE is in float32 to avoid NaN losses.
+ ae.vae.to(accelerator.device, dtype=torch.float32)
+ # ae.vae.to(accelerator.device, dtype=weight_dtype)
+ text_enc.to(accelerator.device, dtype=weight_dtype)
+
+    # Create EMA for the model.
+ if args.use_ema:
+ ema_model = deepcopy(model)
+ ema_model = EMAModel(ema_model.parameters(), decay=args.ema_decay, update_after_step=args.ema_start_step,
+ model_cls=Diffusion_models_class[args.model], model_config=ema_model.config)
+
+ # `accelerate` 0.16.0 will have better support for customized saving
+ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+ def save_model_hook(models, weights, output_dir):
+ if accelerator.is_main_process:
+ if args.use_ema:
+ ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
+
+ for i, model in enumerate(models):
+ model.save_pretrained(os.path.join(output_dir, "model"))
+ if weights: # Don't pop if empty
+ # make sure to pop weight so that corresponding model is not saved again
+ weights.pop()
+
+
+ def load_model_hook(models, input_dir):
+            # loading EMA with a custom 'from_pretrained' function
+ if args.use_ema:
+ load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), Diffusion_models_class[args.model])
+ ema_model.load_state_dict(load_model.state_dict())
+ ema_model.to(accelerator.device)
+ del load_model
+
+ for i in range(len(models)):
+ # pop models so that they are not loaded again
+ model = models.pop()
+
+ # load diffusers style into model
+ load_model = Diffusion_models_class[args.model].from_pretrained(input_dir, subfolder="model")
+ model.register_to_config(**load_model.config)
+
+ model.load_state_dict(load_model.state_dict())
+ del load_model
+
+ accelerator.register_save_state_pre_hook(save_model_hook)
+ accelerator.register_load_state_pre_hook(load_model_hook)
+
+ # Enable TF32 for faster training on Ampere GPUs,
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+ if args.allow_tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ if args.scale_lr:
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+ )
+
+ params_to_optimize = list(filter(lambda p: p.requires_grad, model.parameters()))
+ # Optimizer creation
+ if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+ logger.warning(
+ f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+ "Defaulting to adamW"
+ )
+ args.optimizer = "adamw"
+
+ if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+ logger.warning(
+ f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+ f"set to {args.optimizer.lower()}"
+ )
+
+ if args.optimizer.lower() == "adamw":
+ if args.use_8bit_adam:
+ try:
+ import bitsandbytes as bnb
+ except ImportError:
+ raise ImportError(
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+ )
+
+ optimizer_class = bnb.optim.AdamW8bit
+ else:
+ optimizer_class = torch.optim.AdamW
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ )
+
+ if args.optimizer.lower() == "prodigy":
+ try:
+ import prodigyopt
+ except ImportError:
+ raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+ optimizer_class = prodigyopt.Prodigy
+
+ if args.learning_rate <= 0.1:
+ logger.warning(
+ "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+ )
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ beta3=args.prodigy_beta3,
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ decouple=args.prodigy_decouple,
+ use_bias_correction=args.prodigy_use_bias_correction,
+ safeguard_warmup=args.prodigy_safeguard_warmup,
+ )
+ logger.info(f"optimizer: {optimizer}")
+
+ if args.trained_data_global_step is not None:
+ initial_global_step_for_sampler = args.trained_data_global_step
+ else:
+ initial_global_step_for_sampler = 0
+
+ train_dataset = getdataset(args)
+ sampler = LengthGroupedSampler(
+ args.train_batch_size,
+ world_size=accelerator.num_processes,
+ gradient_accumulation_size=args.gradient_accumulation_steps,
+ initial_global_step=initial_global_step_for_sampler,
+ lengths=train_dataset.lengths,
+ group_data=args.group_data,
+ )
+
+ Yolomodel = YOLO(args.yolomodel_pathorname).to(accelerator.device)
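+    # Segmentation model handed to Collate below; presumably used to derive
+    # object masks for the semantic/bbox inpainting objectives configured via
+    # the *_ratio arguments at the bottom of this file.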
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ shuffle=False,
+ # pin_memory=True,
+        collate_fn=Collate(args, Yolomodel),
+ batch_size=args.train_batch_size,
+ num_workers=args.dataloader_num_workers,
+ sampler=sampler,
+ drop_last=True,
+ # prefetch_factor=4
+ )
+ logger.info(f'after train_dataloader')
+
+ # Scheduler and math around the number of training steps.
+ overrode_max_train_steps = False
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if args.max_train_steps is None:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ overrode_max_train_steps = True
+
+ lr_scheduler = get_scheduler(
+ args.lr_scheduler,
+ optimizer=optimizer,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+ )
+
+ # Prepare everything with our `accelerator`.
+ # model.requires_grad_(False)
+ # model.pos_embed.requires_grad_(True)
+ if args.adapt_vae:
+ model.requires_grad_(False)
+ for name, param in model.named_parameters():
+ if 'pos_embed' in name or 'proj_out' in name:
+ param.requires_grad = True
+ logger.info(f'before accelerator.prepare')
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+ model, optimizer, train_dataloader, lr_scheduler
+ )
+ logger.info(f'after accelerator.prepare')
+ if args.use_ema:
+ ema_model.to(accelerator.device)
+
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if overrode_max_train_steps:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ # Afterwards we recalculate our number of training epochs
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+ # We need to initialize the trackers we use, and also store our configuration.
+ # The trackers initializes automatically on the main process.
+ # NOTE wandb
+ if accelerator.is_main_process:
+ logger.info("init trackers...")
+ project_name = os.getenv('PROJECT', os.path.basename(args.output_dir))
+ entity = os.getenv('ENTITY', None)
+ run_name = os.getenv('WANDB_NAME', None)
+ init_kwargs = {
+ "entity": entity,
+ "run_name": run_name,
+ }
+ accelerator.init_trackers(project_name=project_name, config=vars(args), init_kwargs=init_kwargs)
+
+ # Setup data:
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+ total_batch_size = total_batch_size // args.sp_size * args.train_sp_batch_size
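+    # e.g. 8 processes, train_batch_size=2, gradient_accumulation_steps=1,
+    # sp_size=4, train_sp_batch_size=1 -> 2 * 8 * 1 // 4 * 1 = 4 samples per optimizer step.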
+
+ # Train!
+ logger.info("***** Running training *****")
+ logger.info(f" Model = {model}")
+ logger.info(f" Num examples = {len(train_dataset)}")
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
+ logger.info(f" Total optimization steps (num_update_steps_per_epoch) = {num_update_steps_per_epoch}")
+ logger.info(f" Total trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B")
+
+ global_step = 0
+ first_epoch = 0
+ # Potentially load in the weights and states from a previous save
+ if args.resume_from_checkpoint:
+ if args.resume_from_checkpoint != "latest":
+ path = os.path.basename(args.resume_from_checkpoint)
+ else:
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.output_dir)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ if path is None:
+ accelerator.print(
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+ )
+ args.resume_from_checkpoint = None
+ initial_global_step = 0
+ else:
+ accelerator.print(f"Resuming from checkpoint {path}")
+ accelerator.load_state(os.path.join(args.output_dir, path))
+ global_step = int(path.split("-")[1])
+
+ initial_global_step = global_step
+ first_epoch = global_step // num_update_steps_per_epoch
+
+ else:
+ initial_global_step = 0
+
+
+ progress_bar = tqdm(
+ range(0, args.max_train_steps),
+ initial=initial_global_step,
+ desc="Steps",
+ # Only show the progress bar once on each machine.
+ disable=not accelerator.is_local_main_process,
+ )
+ progress_info = ProgressInfo(global_step, train_loss=0.0)
+
+ def sync_gradients_info(loss):
+ # Checks if the accelerator has performed an optimization step behind the scenes
+ if args.use_ema:
+ ema_model.step(model.parameters())
+ progress_bar.update(1)
+ progress_info.global_step += 1
+ end_time = time.time()
+ one_step_duration = end_time - start_time
+ accelerator.log({"train_loss": progress_info.train_loss}, step=progress_info.global_step)
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(f"Step: [{progress_info.global_step}], local_loss={loss.detach().item()}, "
+ f"train_loss={progress_info.train_loss}, time_cost={one_step_duration}",
+ rank=0)
+ progress_info.train_loss = 0.0
+
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+ if progress_info.global_step % args.checkpointing_steps == 0 or progress_info.global_step == args.after_one_epoch_global_step:
+ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+ if accelerator.is_main_process and args.checkpoints_total_limit is not None:
+ checkpoints = os.listdir(args.output_dir)
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+ if len(checkpoints) >= args.checkpoints_total_limit:
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+ removing_checkpoints = checkpoints[0:num_to_remove]
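+                        # e.g. with checkpoints_total_limit=2 and 3 existing
+                        # checkpoints: num_to_remove = 3 - 2 + 1 = 2, leaving one
+                        # old checkpoint plus the checkpoint about to be saved.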
+
+ logger.info(
+ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+ )
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+ for removing_checkpoint in removing_checkpoints:
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+ shutil.rmtree(removing_checkpoint)
+
+ save_path = os.path.join(args.output_dir, f"checkpoint-{progress_info.global_step}")
+ accelerator.save_state(save_path)
+ logger.info(f"Saved state to {save_path}")
+
+ logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+ progress_bar.set_postfix(**logs)
+
+ def run(model_input, model_kwargs, prof):
+ global start_time
+ start_time = time.time()
+
+ try:
+ in_channels = ae_channel_config[args.ae]
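+            # model_input arrives packed channel-wise as [latents | masked latents | mask]
+            # (built in train_one_step via preprocess_x_for_inpaint); split it back here.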
+ model_input, masked_input, video_mask = model_input[:, 0:in_channels], model_input[:, in_channels:2 * in_channels], model_input[:, 2 * in_channels:]
+        except Exception:
+            raise ValueError("model_input is missing the masked_x and video_mask channels!")
+
+ noise = torch.randn_like(model_input)
+ if args.noise_offset:
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+ noise += args.noise_offset * torch.randn((model_input.shape[0], model_input.shape[1], 1, 1, 1),
+ device=model_input.device)
+
+ bsz = model_input.shape[0]
+ current_step_frame = model_input.shape[2]
+ # Sample a random timestep for each image without bias.
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device)
+ # print('accelerator.process_index, timesteps', accelerator.process_index, timesteps)
+        if current_step_frame != 1 and get_sequence_parallel_state():  # images do not need sp
+ broadcast(timesteps)
+ motion_score = model_kwargs.pop('motion_score', None)
+ if motion_score is not None:
+ raise NotImplementedError
+
+ # Add noise to the model input according to the noise magnitude at each timestep
+ # (this is the forward diffusion process)
+
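+        # i.e. x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise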
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+ model_pred = model(
+ torch.cat([noisy_model_input, masked_input, video_mask], dim=1),
+ timesteps,
+ **model_kwargs,
+ )[0]
+
+ # Get the target for loss depending on the prediction type
+ if args.prediction_type is not None:
+ # set prediction_type of scheduler if defined
+ noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+ if noise_scheduler.config.prediction_type == "epsilon":
+ target = noise
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+ elif noise_scheduler.config.prediction_type == "sample":
+ # We set the target to latents here, but the model_pred will return the noise sample prediction.
+ target = model_input
+ # We will have to subtract the noise residual from the prediction to get the target sample.
+ model_pred = model_pred - noise
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+ mask = model_kwargs.get('attention_mask', None)
+        if mask is not None and torch.all(mask.bool()):
+ mask = None
+ if get_sequence_parallel_state():
+ assert mask is None
+ b, c, _, _, _ = model_pred.shape
+ if mask is not None:
+ mask = mask.unsqueeze(1).repeat(1, c, 1, 1, 1).float() # b t h w -> b c t h w
+ mask = mask.reshape(b, -1)
+ if args.snr_gamma is None:
+ # model_pred: b c t h w, attention_mask: b t h w
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.reshape(b, -1)
+ if mask is not None:
+ loss = (loss * mask).sum() / mask.sum() # mean loss on unpad patches
+ else:
+ loss = loss.mean()
+ else:
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+ # This is discussed in Section 4.2 of the same paper.
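+            # Illustrative: with snr_gamma=5 and SNR(t) = [0.5, 20], the
+            # epsilon-prediction weights are min(SNR, 5) / SNR = [1.0, 0.25],
+            # down-weighting the easy (high-SNR, low-noise) timesteps.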
+ snr = compute_snr(noise_scheduler, timesteps)
+ mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+ dim=1
+ )[0]
+ if noise_scheduler.config.prediction_type == "epsilon":
+ mse_loss_weights = mse_loss_weights / snr
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ mse_loss_weights = mse_loss_weights / (snr + 1)
+ else:
+ raise NameError(f'{noise_scheduler.config.prediction_type}')
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.reshape(b, -1)
+ mse_loss_weights = mse_loss_weights.reshape(b, 1)
+ if mask is not None:
+ loss = (loss * mask * mse_loss_weights).sum() / mask.sum() # mean loss on unpad patches
+ else:
+ loss = (loss * mse_loss_weights).mean()
+
+ # Gather the losses across all processes for logging (if we use distributed training).
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+ progress_info.train_loss += avg_loss.detach().item() / args.gradient_accumulation_steps
+
+ # Backpropagate
+ accelerator.backward(loss)
+ if accelerator.sync_gradients:
+ params_to_clip = params_to_optimize
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ if accelerator.sync_gradients:
+ sync_gradients_info(loss)
+
+ # if accelerator.is_main_process:
+
+ # if progress_info.global_step % 1 == 0:
+
+ # if args.enable_tracker:
+ # log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ # weight_dtype, progress_info.global_step)
+
+ # if args.use_ema and npu_config is None:
+ # # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+ # ema_model.store(model.parameters())
+ # ema_model.copy_to(model.parameters())
+ # log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ # weight_dtype, progress_info.global_step, motion_score,ema=True)
+ # # Switch back to the original UNet parameters.
+ # ema_model.restore(model.parameters())
+
+ if prof is not None:
+ prof.step()
+
+
+ return loss
+
+ def train_one_step(step_, data_item_, prof_=None):
+ train_loss = 0.0
+ x, attn_mask, input_ids, cond_mask, motion_score = data_item_
+        assert not torch.any(torch.isnan(x)), 'NaN detected in the input video tensor x'
+ x = x.to(accelerator.device, dtype=ae.vae.dtype) # B C T+num_images H W, 16 + 4
+
+ attn_mask = attn_mask.to(accelerator.device) # B T+num_images H W
+ input_ids = input_ids.to(accelerator.device) # B 1+num_images L
+ cond_mask = cond_mask.to(accelerator.device) # B 1+num_images L
+ motion_score = motion_score.to(accelerator.device) if motion_score is not None else motion_score # B 1
+ # if accelerator.process_index == 0:
+ # logger.info(f'rank: {accelerator.process_index}, x: {x.shape}, attn_mask: {attn_mask.shape}')
+
+ with torch.no_grad():
+ # import ipdb;ipdb.set_trace()
+ # use for loop to avoid OOM, because T5 is too huge...
+ B, N, L = input_ids.shape # B 1 L
+ # cond_ = torch.stack([text_enc(input_ids[i], cond_mask[i]) for i in range(B)]) # B 1 L D
+
+ # use batch inference
+ input_ids_ = input_ids.reshape(-1, L)
+ cond_mask_ = cond_mask.reshape(-1, L)
+ cond = text_enc(input_ids_, cond_mask_) # B 1 L D
+ cond = cond.reshape(B, N, L, -1)
+
+
+ def preprocess_x_for_inpaint(x):
+
+ x, masked_x, mask = x[:, :3], x[:, 3:6], x[:, 6:7]
+ x, masked_x = ae.encode(x), ae.encode(masked_x)
+ batch_size, channels, frame = mask.shape[:3]
+ new_frame, new_height, new_width = x.shape[2:]
+ mask = rearrange(mask, 'b c t h w -> (b c t) 1 h w')
+ mask = F.interpolate(mask, size=(new_height, new_width), mode='bilinear')
+ mask = rearrange(mask, '(b c t) 1 h w -> b c t h w', t=frame, b=batch_size)
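+            # the causal VAE encodes the first frame alone and each subsequent
+            # group of ae_stride_t frames into one latent frame, so repeat the
+            # first mask frame ae_stride_t times to make the time axis divisible,
+            # then regroup into ae_stride_t mask channels per latent frame.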
+ mask_first_frame = mask[:, :, 0:1].repeat(1, 1, ae_stride_t, 1, 1).contiguous()
+ mask = torch.cat([mask_first_frame, mask[:, :, 1:]], dim=2)
+ mask = mask.view(batch_size, new_frame, ae_stride_t, new_height, new_width)
+ mask = mask.transpose(1, 2).contiguous()
+
+ return x, masked_x, mask
+
+
+ # Map input images to latent space + normalize latents
+ x, masked_x, mask = preprocess_x_for_inpaint(x)
+ x = torch.cat([x, masked_x, mask], dim=1)
+
+ current_step_frame = x.shape[2]
+ current_step_sp_state = get_sequence_parallel_state()
+ if args.sp_size != 1: # enable sp
+            if current_step_frame == 1:  # but images do not need sp
+ set_sequence_parallel_state(False)
+ else:
+ set_sequence_parallel_state(True)
+ if get_sequence_parallel_state():
+ x, cond, attn_mask, cond_mask, use_image_num = prepare_parallel_data(x, cond, attn_mask, cond_mask,
+ args.use_image_num)
+            for sp_step in range(args.train_batch_size * args.sp_size // args.train_sp_batch_size):
+                with accelerator.accumulate(model):
+                    st_idx = sp_step * args.train_sp_batch_size
+                    ed_idx = (sp_step + 1) * args.train_sp_batch_size
+ model_kwargs = dict(encoder_hidden_states=cond[st_idx: ed_idx],
+ attention_mask=attn_mask[st_idx: ed_idx],
+ encoder_attention_mask=cond_mask[st_idx: ed_idx], use_image_num=use_image_num)
+ run(x[st_idx: ed_idx], model_kwargs, prof_)
+
+ else:
+ with accelerator.accumulate(model):
+                assert not torch.any(torch.isnan(x)), 'NaN detected in latents after VAE encoding'
+ x = x.to(weight_dtype)
+ model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask, motion_score=motion_score,
+ encoder_attention_mask=cond_mask, use_image_num=args.use_image_num,)
+ run(x, model_kwargs, prof_)
+
+ set_sequence_parallel_state(current_step_sp_state) # in case the next step use sp, which need broadcast(timesteps)
+
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ return False
+
+ def train_one_epoch(prof_=None):
+ # for epoch in range(first_epoch, args.num_train_epochs):
+ progress_info.train_loss = 0.0
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ args.after_one_epoch_global_step = progress_info.global_step + len(train_dataloader) // args.gradient_accumulation_steps - 1
+
+ for step, data_item in enumerate(train_dataloader):
+ if train_one_step(step, data_item, prof_):
+ break
+
+ if step >= 2 and torch_npu is not None and npu_config is not None:
+ npu_config.free_mm()
+
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = f"/home/image_data/npu_profiling_t2v/{os.getenv('PROJECT_NAME', 'local')}"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=npu_config.profiling_step, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ train_one_epoch(prof)
+ else:
+ train_one_epoch()
+ accelerator.wait_for_everyone()
+ accelerator.end_training()
+ if npu_config is not None and get_sequence_parallel_state():
+ destroy_sequence_parallel_group()
+
+
+if __name__ == "__main__":
+ # mp.set_start_method('spawn', force=True)
+ # mp.set_start_method('spawn',force=True)
+ # dill.settings['recurse'] = True
+    # replace multiprocessing's serialization mechanism with dill
+ # mp.get_context().reduction.dump = dill.dump
+ parser = argparse.ArgumentParser()
+
+ # dataset & dataloader
+ parser.add_argument("--dataset", type=str, required=True)
+ parser.add_argument("--data", type=str, required='')
+ parser.add_argument("--sample_rate", type=int, default=1)
+ parser.add_argument("--train_fps", type=int, default=24)
+ parser.add_argument("--drop_short_ratio", type=float, default=1.0)
+ parser.add_argument("--speed_factor", type=float, default=1.0)
+ parser.add_argument("--num_frames", type=int, default=65)
+ parser.add_argument("--max_height", type=int, default=320)
+ parser.add_argument("--max_width", type=int, default=240)
+ parser.add_argument("--use_img_from_vid", action="store_true")
+ parser.add_argument("--use_image_num", type=int, default=0)
+ parser.add_argument("--model_max_length", type=int, default=512)
+ parser.add_argument('--cfg', type=float, default=0.1)
+ parser.add_argument("--dataloader_num_workers", type=int, default=10, help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.")
+ parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader.")
+ parser.add_argument("--group_data", action="store_true")
+ parser.add_argument("--hw_stride", type=int, default=32)
+ parser.add_argument("--skip_low_resolution", action="store_true")
+ parser.add_argument("--force_resolution", action="store_true")
+ parser.add_argument("--trained_data_global_step", type=int, default=None)
+ parser.add_argument("--use_decord", action="store_true")
+
+ # text encoder & vae & diffusion model
+ parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="Latte-XL/122")
+ parser.add_argument('--enable_8bit_t5', action='store_true')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.125)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument("--compress_kv", action="store_true")
+ parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="xformers")
+ parser.add_argument('--use_rope', action='store_true')
+ parser.add_argument('--compress_kv_factor', type=int, default=1)
+ parser.add_argument('--interpolation_scale_h', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_w', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_t', type=float, default=1.0)
+ parser.add_argument("--downsampler", type=str, default=None)
+ parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--pretrained", type=str, default=None)
+ parser.add_argument('--enable_stable_fp32', action='store_true')
+ parser.add_argument('--sparse1d', action='store_true')
+ parser.add_argument('--sparse2d', action='store_true')
+ parser.add_argument('--sparse_n', type=int, default=2)
+ parser.add_argument('--tile_sample_min_size', type=int, default=512)
+ parser.add_argument('--tile_sample_min_size_t', type=int, default=33)
+ parser.add_argument('--adapt_vae', action='store_true')
+ parser.add_argument('--use_motion', action='store_true')
+ parser.add_argument("--gradient_checkpointing", action="store_true", help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.")
+
+ # diffusion setting
+ parser.add_argument("--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556.")
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+ parser.add_argument("--ema_decay", type=float, default=0.999)
+ parser.add_argument("--ema_start_step", type=int, default=0)
+ parser.add_argument("--noise_offset", type=float, default=0.02, help="The scale of noise offset.")
+ parser.add_argument("--prediction_type", type=str, default=None, help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.")
+ parser.add_argument('--rescale_betas_zero_snr', action='store_true')
+
+ # validation & logs
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument('--guidance_scale', type=float, default=2.5)
+ parser.add_argument("--enable_tracker", action="store_true")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+ parser.add_argument("--output_dir", type=str, default=None, help="The output directory where the model predictions and checkpoints will be written.")
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None, help=("Max number of checkpoints to store."))
+ parser.add_argument("--checkpointing_steps", type=int, default=500,
+ help=(
+ "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+ " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+ " training using `--resume_from_checkpoint`."
+ ),
+ )
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None,
+ help=(
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+ ),
+ )
+ parser.add_argument("--logging_dir", type=str, default="logs",
+ help=(
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+ ),
+ )
+ parser.add_argument("--report_to", type=str, default="tensorboard",
+ help=(
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+ ),
+ )
+ # optimizer & scheduler
+ parser.add_argument("--num_train_epochs", type=int, default=100)
+ parser.add_argument("--max_train_steps", type=int, default=None, help="Total number of training steps to perform. If provided, overrides num_train_epochs.")
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
+ parser.add_argument("--optimizer", type=str, default="adamW", help='The optimizer type to use. Choose between ["AdamW", "prodigy"]')
+ parser.add_argument("--learning_rate", type=float, default=1e-4, help="Initial learning rate (after the potential warmup period) to use.")
+ parser.add_argument("--scale_lr", action="store_true", default=False, help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.")
+ parser.add_argument("--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler.")
+ parser.add_argument("--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW")
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-02, help="Weight decay to use for unet params")
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder")
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer and Prodigy optimizers.")
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True, help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True, help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ parser.add_argument("--prodigy_beta3", type=float, default=None,
+ help="coefficients for computing the Prodidy stepsize using running averages. If set to None, "
+ "uses the value of square root of beta2. Ignored if optimizer is adamW",
+ )
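+ # Editor's note (assumption, not part of this patch): Prodigy
+ # (https://github.com/konstmish/prodigy) estimates its own step size, so
+ # --learning_rate acts as a multiplier that is typically left near 1.0,
+ # and the adam_beta1/adam_beta2 flags above double as its momentum terms.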
+ parser.add_argument("--lr_scheduler", type=str, default="constant",
+ help=(
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
+ parser.add_argument("--allow_tf32", action="store_true",
+ help=(
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+ ),
+ )
+ parser.add_argument("--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"],
+ help=(
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+ ),
+ )
+
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+ parser.add_argument("--sp_size", type=int, default=1, help="For sequence parallel")
+ parser.add_argument("--train_sp_batch_size", type=int, default=1, help="Batch size for sequence parallel training")
+
+ # inpaint
+ parser.add_argument("--t2v_ratio", type=float, default=0.0) # for inpainting mode
+ parser.add_argument("--i2v_ratio", type=float, default=0.0) # for inpainting mode
+ parser.add_argument("--transition_ratio", type=float, default=0.05) # for inpainting mode
+ parser.add_argument("--v2v_ratio", type=float, default=0.05) # for inpainting mode
+ parser.add_argument("--clear_video_ratio", type=float, default=0.0) # for inpainting mode
+ parser.add_argument("--Semantic_ratio", type=float, default=0.2) # for inpainting mode
+ parser.add_argument("--bbox_ratio", type=float, default=0.2) # for inpainting mode
+ parser.add_argument("--background_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--fixed_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--Semantic_expansion_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--fixed_bg_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--min_clear_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--default_text_ratio", type=float, default=0.5) # for inpainting mode
+ parser.add_argument("--pretrained_transformer_model_path", type=str, default=None)
+ parser.add_argument("--yolomodel_pathorname",type=str,default="/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt")
+
+ # visualize
+ parser.add_argument("--max_sequence_length", type=int, default=512)
+
+ args = parser.parse_args()
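+ # Editor's sketch (assumption): the inpaint mode ratios above read like
+ # sampling probabilities for mutually exclusive mask modes, so a cheap
+ # sanity check can surface misconfigured runs before training starts.
+ _mode_ratios = [args.t2v_ratio, args.i2v_ratio, args.transition_ratio,
+ args.v2v_ratio, args.clear_video_ratio]
+ if sum(_mode_ratios) > 1.0:
+ print(f"Warning: inpaint mode ratios sum to {sum(_mode_ratios):.3f} > 1.0; "
+ "check the --*_ratio flags if these modes are meant to be exclusive.")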
+ main(args)
diff --git a/opensora/train/train_t2v.py b/opensora/train/train_t2v.py
deleted file mode 100644
index b18828f6b..000000000
--- a/opensora/train/train_t2v.py
+++ /dev/null
@@ -1,846 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-A minimal training script for DiT using PyTorch DDP.
-"""
-import argparse
-import logging
-import math
-import os
-import shutil
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-from einops import rearrange
-from tqdm import tqdm
-from dataclasses import field, dataclass
-from torch.utils.data import DataLoader
-from copy import deepcopy
-
-import accelerate
-import torch
-from torch.nn import functional as F
-import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo
-from packaging import version
-from tqdm.auto import tqdm
-from transformers import HfArgumentParser, TrainingArguments, AutoTokenizer
-
-import diffusers
-from diffusers import DDPMScheduler, PNDMScheduler
-from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel, compute_snr
-from diffusers.utils import check_min_version, is_wandb_available
-
-from opensora.dataset import getdataset, ae_denorm
-from opensora.models.ae import getae, getae_wrapper
-from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
-from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
-from opensora.models.diffusion.latte.modeling_latte import LatteT2V
-from opensora.models.text_encoder import get_text_enc, get_text_warpper
-from opensora.utils.dataset_utils import Collate
-from opensora.models.ae import ae_stride_config, ae_channel_config
-from opensora.models.diffusion import Diffusion_models
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.24.0")
-logger = get_logger(__name__)
-
-
-def generate_timestep_weights(args, num_timesteps):
- weights = torch.ones(num_timesteps)
-
- # Determine the indices to bias
- num_to_bias = int(args.timestep_bias_portion * num_timesteps)
-
- if args.timestep_bias_strategy == "later":
- bias_indices = slice(-num_to_bias, None)
- elif args.timestep_bias_strategy == "earlier":
- bias_indices = slice(0, num_to_bias)
- elif args.timestep_bias_strategy == "range":
- # Out of the possible 1000 timesteps, we might want to focus on eg. 200-500.
- range_begin = args.timestep_bias_begin
- range_end = args.timestep_bias_end
- if range_begin < 0:
- raise ValueError(
- "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
- )
- if range_end > num_timesteps:
- raise ValueError(
- "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
- )
- bias_indices = slice(range_begin, range_end)
- else: # 'none' or any other string
- return weights
- if args.timestep_bias_multiplier <= 0:
- return ValueError(
- "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
- " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
- " A timestep bias multiplier less than or equal to 0 is not allowed."
- )
-
- # Apply the bias
- weights[bias_indices] *= args.timestep_bias_multiplier
-
- # Normalize
- weights /= weights.sum()
-
- return weights
-
-
-#################################################################################
-# Training Loop #
-#################################################################################
-
-def main(args):
- logging_dir = Path(args.output_dir, args.logging_dir)
-
- accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
-
- accelerator = Accelerator(
- gradient_accumulation_steps=args.gradient_accumulation_steps,
- mixed_precision=args.mixed_precision,
- log_with=args.report_to,
- project_config=accelerator_project_config,
- )
-
- if args.report_to == "wandb":
- if not is_wandb_available():
- raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
- import wandb
-
- # Make one log on every process with the configuration for debugging.
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO,
- )
- logger.info(accelerator.state, main_process_only=False)
- if accelerator.is_local_main_process:
- transformers.utils.logging.set_verbosity_warning()
- diffusers.utils.logging.set_verbosity_info()
- else:
- transformers.utils.logging.set_verbosity_error()
- diffusers.utils.logging.set_verbosity_error()
-
- # If passed along, set the training seed now.
- if args.seed is not None:
- set_seed(args.seed)
-
- # Handle the repository creation
- if accelerator.is_main_process:
- if args.output_dir is not None:
- os.makedirs(args.output_dir, exist_ok=True)
-
- # if args.push_to_hub:
- # repo_id = create_repo(
- # repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
- # ).repo_id
-
- # For mixed precision training we cast all non-trainable weigths to half-precision
- # as these weights are only used for inference, keeping weights in full precision is not required.
- weight_dtype = torch.float32
- if accelerator.mixed_precision == "fp16":
- weight_dtype = torch.float16
- elif accelerator.mixed_precision == "bf16":
- weight_dtype = torch.bfloat16
-
- # Create model:
- diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule
- kwargs = {}
- ae = getae_wrapper(args.ae)(args.ae_path, cache_dir=args.cache_dir, **kwargs).eval()
- if args.enable_tiling:
- ae.vae.enable_tiling()
- ae.vae.tile_overlap_factor = args.tile_overlap_factor
-
- kwargs = {'load_in_8bit': args.enable_8bit_t5, 'torch_dtype': weight_dtype, 'low_cpu_mem_usage': True}
- text_enc = get_text_warpper(args.text_encoder_name)(args, **kwargs).eval()
-
- ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
- assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
- args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
- args.ae_stride = args.ae_stride_h
- patch_size = args.model[-3:]
- patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
- args.patch_size = patch_size_h
- args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
- assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
- # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
- assert args.max_image_size % ae_stride_h == 0, f"Image size must be divisible by ae_stride_h, but found max_image_size ({args.max_image_size}), ae_stride_h ({ae_stride_h})."
-
- args.stride_t = ae_stride_t * patch_size_t
- args.stride = ae_stride_h * patch_size_h
- latent_size = (args.max_image_size // ae_stride_h, args.max_image_size // ae_stride_w)
-
- if getae_wrapper(args.ae) == CausalVQVAEModelWrapper or getae_wrapper(args.ae) == CausalVAEModelWrapper:
- args.video_length = video_length = args.num_frames // ae_stride_t + 1
- else:
- video_length = args.num_frames // ae_stride_t
- model = Diffusion_models[args.model](
- in_channels=ae_channel_config[args.ae],
- out_channels=ae_channel_config[args.ae] * 2,
- # caption_channels=4096,
- # cross_attention_dim=1152,
- attention_bias=True,
- sample_size=latent_size,
- num_vector_embeds=None,
- activation_fn="gelu-approximate",
- num_embeds_ada_norm=1000,
- use_linear_projection=False,
- only_cross_attention=False,
- double_self_attention=False,
- upcast_attention=False,
- # norm_type="ada_norm_single",
- norm_elementwise_affine=False,
- norm_eps=1e-6,
- attention_type='default',
- video_length=video_length,
- attention_mode=args.attention_mode,
- compress_kv_factor=args.compress_kv_factor,
- use_rope=args.use_rope,
- model_max_length=args.model_max_length,
- )
- model.gradient_checkpointing = args.gradient_checkpointing
-
- # # use pretrained model?
- if args.pretrained:
- if 'safetensors' in args.pretrained:
- from safetensors.torch import load_file as safe_load
- checkpoint = safe_load(args.pretrained, device="cpu")
- else:
- checkpoint = torch.load(args.pretrained, map_location='cpu')['model']
- model_state_dict = model.state_dict()
- missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
- logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
- logger.info(f'Successfully load {len(model.state_dict()) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')
-
- # Freeze vae and text encoders.
- ae.requires_grad_(False)
- text_enc.requires_grad_(False)
- # Set model as trainable.
- model.train()
-
-
- # Move unet, vae and text_encoder to device and cast to weight_dtype
- # The VAE is in float32 to avoid NaN losses.
- # ae.to(accelerator.device, dtype=torch.float32)
- ae.to(accelerator.device, dtype=weight_dtype)
- # ae.to(accelerator.device)
- text_enc.to(accelerator.device, dtype=weight_dtype)
- # text_enc.to(accelerator.device)
-
- # Create EMA for the unet.
- if args.use_ema:
- ema_model = deepcopy(model)
- ema_model = EMAModel(ema_model.parameters(), model_cls=LatteT2V, model_config=ema_model.config)
-
- # `accelerate` 0.16.0 will have better support for customized saving
- if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
- # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
- def save_model_hook(models, weights, output_dir):
- if accelerator.is_main_process:
- if args.use_ema:
- ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
-
- for i, model in enumerate(models):
- model.save_pretrained(os.path.join(output_dir, "model"))
- if weights: # Don't pop if empty
- # make sure to pop weight so that corresponding model is not saved again
- weights.pop()
-
- def load_model_hook(models, input_dir):
- if args.use_ema:
- load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), LatteT2V)
- ema_model.load_state_dict(load_model.state_dict())
- ema_model.to(accelerator.device)
- del load_model
-
- for i in range(len(models)):
- # pop models so that they are not loaded again
- model = models.pop()
-
- # load diffusers style into model
- load_model = LatteT2V.from_pretrained(input_dir, subfolder="model")
- model.register_to_config(**load_model.config)
-
- model.load_state_dict(load_model.state_dict())
- del load_model
-
- accelerator.register_save_state_pre_hook(save_model_hook)
- accelerator.register_load_state_pre_hook(load_model_hook)
-
- # Enable TF32 for faster training on Ampere GPUs,
- # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
- if args.allow_tf32:
- torch.backends.cuda.matmul.allow_tf32 = True
-
- if args.scale_lr:
- args.learning_rate = (
- args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
- )
-
- # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
- if args.use_8bit_adam:
- try:
- import bitsandbytes as bnb
- except ImportError:
- raise ImportError(
- "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
- )
-
- optimizer_class = bnb.optim.AdamW8bit
- else:
- optimizer_class = torch.optim.AdamW
-
- # Optimizer creation
- params_to_optimize = model.parameters()
- optimizer = optimizer_class(
- params_to_optimize,
- lr=args.learning_rate,
- betas=(args.adam_beta1, args.adam_beta2),
- weight_decay=args.adam_weight_decay,
- eps=args.adam_epsilon,
- )
-
- # Setup data:
- train_dataset = getdataset(args)
- train_dataloader = torch.utils.data.DataLoader(
- train_dataset,
- shuffle=True,
- collate_fn=Collate(args),
- batch_size=args.train_batch_size,
- num_workers=args.dataloader_num_workers,
- )
-
- # Scheduler and math around the number of training steps.
- overrode_max_train_steps = False
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
- if args.max_train_steps is None:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- overrode_max_train_steps = True
-
- lr_scheduler = get_scheduler(
- args.lr_scheduler,
- optimizer=optimizer,
- num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
- )
-
- # Prepare everything with our `accelerator`.
- model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
- model, optimizer, train_dataloader, lr_scheduler
- )
-
- # We need to recalculate our total training steps as the size of the training dataloader may have changed.
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
- if overrode_max_train_steps:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- # Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
- # We need to initialize the trackers we use, and also store our configuration.
- # The trackers initializes automatically on the main process.
- if accelerator.is_main_process:
- accelerator.init_trackers(args.output_dir, config=vars(args))
-
- # Train!
- total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
- logger.info("***** Running training *****")
- logger.info(f" Num examples = {len(train_dataset)}")
- logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
- logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
- logger.info(f" Total optimization steps = {args.max_train_steps}")
- global_step = 0
- first_epoch = 0
-
- # Potentially load in the weights and states from a previous save
- if args.resume_from_checkpoint:
- if args.resume_from_checkpoint != "latest":
- path = os.path.basename(args.resume_from_checkpoint)
- else:
- # Get the most recent checkpoint
- dirs = os.listdir(args.output_dir)
- dirs = [d for d in dirs if d.startswith("checkpoint")]
- dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
- path = dirs[-1] if len(dirs) > 0 else None
-
- if path is None:
- accelerator.print(
- f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
- )
- args.resume_from_checkpoint = None
- initial_global_step = 0
- else:
- accelerator.print(f"Resuming from checkpoint {path}")
- accelerator.load_state(os.path.join(args.output_dir, path))
- global_step = int(path.split("-")[1])
-
- initial_global_step = global_step
- first_epoch = global_step // num_update_steps_per_epoch
-
- else:
- initial_global_step = 0
-
- progress_bar = tqdm(
- range(0, args.max_train_steps),
- initial=initial_global_step,
- desc="Steps",
- # Only show the progress bar once on each machine.
- disable=not accelerator.is_local_main_process,
- )
-
- for epoch in range(first_epoch, args.num_train_epochs):
- train_loss = 0.0
- for step, (x, attn_mask, input_ids, cond_mask) in enumerate(train_dataloader):
- with accelerator.accumulate(model):
- # Sample noise that we'll add to the latents
-
-
- x = x.to(accelerator.device, dtype=weight_dtype) # B C T+num_images H W, 16 + 4
- attn_mask = attn_mask.to(accelerator.device) # B L or B 1+num_images L
- input_ids = input_ids.to(accelerator.device) # B L or B 1+num_images L
- cond_mask = cond_mask.to(accelerator.device) # B L or B 1+num_images L
- print('x.shape, attn_mask.shape, input_ids.shape, cond_mask.shape', x.shape, attn_mask.shape, input_ids.shape, cond_mask.shape)
-
- with torch.no_grad():
- # use for loop to avoid OOM, because T5 is too huge...
- B, _, _ = input_ids.shape # B T+num_images L b 1+4, L
- cond = torch.stack([text_enc(input_ids[i], cond_mask[i]) for i in range(B)]) # B 1+num_images L D
-
- # Map input images to latent space + normalize latents
- if args.use_image_num == 0:
- x = ae.encode(x) # B C T H W
- cond = text_enc(input_ids, cond_mask) # B L -> B L D
- else:
- videos, images = x[:, :, :-args.use_image_num], x[:, :, -args.use_image_num:]
- videos = ae.encode(videos) # B C T H W
-
-
- def custom_to_video(x: torch.Tensor, fps: float = 2.0, output_file: str = 'output_video.mp4') -> None:
- from examples.rec_imvi_vae import array_to_video
- x = x.detach().cpu()
- x = torch.clamp(x, -1, 1)
- x = (x + 1) / 2
- x = x.permute(1, 2, 3, 0).numpy()
- x = (255*x).astype(np.uint8)
- array_to_video(x, fps=fps, output_file=output_file)
- return
-
- # videos = ae.decode(videos.to(dtype=weight_dtype))[0]
- # videos = videos.transpose(0, 1)
- # custom_to_video(videos.to(torch.float32), fps=24, output_file='tmp.mp4')
- # sys.exit()
-
- images = rearrange(images, 'b c t h w -> (b t) c 1 h w')
- images = ae.encode(images)
-
- # import ipdb;ipdb.set_trace()
- # images = ae.decode(images.to(dtype=weight_dtype))
- # for idx in range(args.use_image_num):
- # x = images[idx, 0, :, :, :].to(torch.float32)
- # x = x.squeeze()
- # x = x.detach().cpu().numpy()
- # x = np.clip(x, -1, 1)
- # x = (x + 1) / 2
- # x = (255 * x).astype(np.uint8)
- # x = x.transpose(1, 2, 0)
- # from PIL import Image
- # image = Image.fromarray(x)
- # image.save(f'tmp{idx}.jpg')
- # import sys
- # sys.exit()
-
-
- images = rearrange(images, '(b t) c 1 h w -> b c t h w', t=args.use_image_num)
- x = torch.cat([videos, images], dim=2) # b c 17+4, h, w
-
-
-
- # print('(x.shape, attn_mask.shape, cond.shape, cond_mask.shape', x.shape, attn_mask.shape, cond.shape, cond_mask.shape)
- model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
- encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
- t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=accelerator.device)
- loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
- loss = loss_dict["loss"].mean()
-
- # Gather the losses across all processes for logging (if we use distributed training).
- avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
- train_loss += avg_loss.item() / args.gradient_accumulation_steps
-
- # Backpropagate
- accelerator.backward(loss)
- if accelerator.sync_gradients:
- params_to_clip = model.parameters()
- accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
- optimizer.step()
- lr_scheduler.step()
- optimizer.zero_grad()
-
- # Checks if the accelerator has performed an optimization step behind the scenes
- if accelerator.sync_gradients:
- progress_bar.update(1)
- global_step += 1
- accelerator.log({"train_loss": train_loss}, step=global_step)
- train_loss = 0.0
-
- if args.use_deepspeed or accelerator.is_main_process:
- if global_step % args.checkpointing_steps == 0:
- # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
- if args.checkpoints_total_limit is not None:
- checkpoints = os.listdir(args.output_dir)
- checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
- checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
-
- # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
- if len(checkpoints) >= args.checkpoints_total_limit:
- num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
- removing_checkpoints = checkpoints[0:num_to_remove]
-
- logger.info(
- f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
- )
- logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
-
- for removing_checkpoint in removing_checkpoints:
- removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
- shutil.rmtree(removing_checkpoint)
-
- save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
- accelerator.save_state(save_path)
- logger.info(f"Saved state to {save_path}")
-
- logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
- progress_bar.set_postfix(**logs)
-
- if global_step >= args.max_train_steps:
- break
-
- if accelerator.is_main_process:
- if global_step % args.checkpointing_steps == 0:
- if args.use_ema:
- # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
- ema_model.store(model.parameters())
- ema_model.copy_to(model.parameters())
-
- if args.enable_tracker:
- validation_prompt = "The majestic beauty of a waterfall cascading down a cliff into a serene lake."
- logger.info(f"Running validation...Using DDPM naive sampling...\n"
- f"Generating {args.num_validation_videos} videos with prompt: {validation_prompt}")
- with torch.no_grad():
- # create pipeline
- ae_ = getae_wrapper(args.ae)(args.ae_path).to(accelerator.device).eval()
- if args.enable_tiling:
- ae_.vae.enable_tiling()
- ae_.vae.tile_overlap_factor = args.tile_overlap_factor
- # text_enc_ = get_text_enc(args).to(accelerator.device).eval()
- model_ = LatteT2V.from_pretrained(save_path, subfolder="model").to(accelerator.device).eval()
- diffusion_ = create_diffusion(str(250))
- tokenizer_ = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir='./cache_dir')
- videos = []
- for idx in range(args.num_validation_videos):
- with torch.autocast(device_type='cuda', dtype=weight_dtype):
- z = torch.randn(1, model_.in_channels, video_length,
- latent_size[0], latent_size[1], device=accelerator.device)
- text_tokens_and_mask = tokenizer_(
- validation_prompt,
- max_length=args.model_max_length,
- padding='max_length',
- truncation=True,
- return_attention_mask=True,
- add_special_tokens=True,
- return_tensors='pt'
- )
- input_ids = text_tokens_and_mask['input_ids'].to(accelerator.device)
- cond_mask = text_tokens_and_mask['attention_mask'].to(accelerator.device)
- # cond = text_enc_(input_ids, cond_mask) # B L D
- cond = text_enc(input_ids, cond_mask) # B L D
- model_kwargs = dict(encoder_hidden_states=cond, attention_mask=None, encoder_attention_mask=cond_mask)
- sample_fn = model_.forward
- # Sample images:
- samples = diffusion_.p_sample_loop(
- sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
- device=accelerator.device
- )
- samples = ae_.decode(samples)
- # Save and display images:
- video = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(
- dtype=torch.uint8).cpu().contiguous() # t c h w
- videos.append(video)
-
- videos = torch.stack(videos).numpy()
- for tracker in accelerator.trackers:
- if tracker.name == "tensorboard":
- np_videos = np.stack([np.asarray(vid) for vid in videos])
- tracker.writer.add_video("validation", np_videos, global_step, fps=10)
- if tracker.name == "wandb":
- tracker.log(
- {
- "validation": [
- wandb.Video(video, caption=f"{i}: {validation_prompt}", fps=10)
- for i, video in enumerate(videos)
- ]
- }
- )
-
- # del ae_, text_enc_, model_, diffusion_, tokenizer_
- del ae_, model_, diffusion_, tokenizer_
- torch.cuda.empty_cache()
-
- accelerator.wait_for_everyone()
- accelerator.end_training()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--dataset", type=str, required=True)
- parser.add_argument("--video_data_path", type=str, required=True)
- parser.add_argument("--video_folder", type=str, default='')
- parser.add_argument("--image_data_path", type=str, default='')
- parser.add_argument("--image_folder", type=str, default='')
- parser.add_argument("--sample_rate", type=int, default=1)
- parser.add_argument("--num_frames", type=int, default=17)
- parser.add_argument("--max_image_size", type=int, default=512)
- parser.add_argument("--use_img_from_vid", action="store_true")
- parser.add_argument("--use_image_num", type=int, default=0)
- parser.add_argument("--model_max_length", type=int, default=300)
-
- parser.add_argument('--enable_8bit_t5', action='store_true')
- parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
- parser.add_argument('--enable_tiling', action='store_true')
- parser.add_argument("--compress_kv", action="store_true")
- parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="xformers")
- parser.add_argument('--use_rope', action='store_true')
- parser.add_argument('--compress_kv_factor', type=int, default=1)
-
- parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="Latte-XL/122")
- parser.add_argument("--pretrained", type=str, default=None)
- parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
- parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
- parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
- parser.add_argument("--cache_dir", type=str, default='./cache_dir')
-
- parser.add_argument("--enable_tracker", action="store_true")
- parser.add_argument("--use_deepspeed", action="store_true")
- parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
- parser.add_argument(
- "--num_validation_videos",
- type=int,
- default=2,
- help="Number of images that should be generated during validation with `validation_prompt`.",
- )
- parser.add_argument(
- "--output_dir",
- type=str,
- default=None,
- help="The output directory where the model predictions and checkpoints will be written.",
- )
- parser.add_argument(
- "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
- )
- parser.add_argument("--num_train_epochs", type=int, default=100)
- parser.add_argument(
- "--max_train_steps",
- type=int,
- default=None,
- help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
- )
- parser.add_argument(
- "--checkpointing_steps",
- type=int,
- default=500,
- help=(
- "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
- " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
- " training using `--resume_from_checkpoint`."
- ),
- )
- parser.add_argument(
- "--checkpoints_total_limit",
- type=int,
- default=None,
- help=("Max number of checkpoints to store."),
- )
- parser.add_argument(
- "--resume_from_checkpoint",
- type=str,
- default=None,
- help=(
- "Whether training should be resumed from a previous checkpoint. Use a path saved by"
- ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
- ),
- )
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument(
- "--gradient_checkpointing",
- action="store_true",
- help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
- )
- parser.add_argument(
- "--learning_rate",
- type=float,
- default=1e-4,
- help="Initial learning rate (after the potential warmup period) to use.",
- )
- parser.add_argument(
- "--scale_lr",
- action="store_true",
- default=False,
- help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
- )
- parser.add_argument(
- "--lr_scheduler",
- type=str,
- default="constant",
- help=(
- 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'
- ),
- )
- parser.add_argument(
- "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
- )
- parser.add_argument(
- "--timestep_bias_strategy",
- type=str,
- default="none",
- choices=["earlier", "later", "range", "none"],
- help=(
- "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
- " Choices: ['earlier', 'later', 'range', 'none']."
- " The default is 'none', which means no bias is applied, and training proceeds normally."
- " The value of 'later' will increase the frequency of the model's final training timesteps."
- ),
- )
- parser.add_argument(
- "--timestep_bias_multiplier",
- type=float,
- default=1.0,
- help=(
- "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
- " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
- ),
- )
- parser.add_argument(
- "--timestep_bias_begin",
- type=int,
- default=0,
- help=(
- "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
- " Defaults to zero, which equates to having no specific bias."
- ),
- )
- parser.add_argument(
- "--timestep_bias_end",
- type=int,
- default=1000,
- help=(
- "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
- " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
- ),
- )
- parser.add_argument(
- "--timestep_bias_portion",
- type=float,
- default=0.25,
- help=(
- "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased."
- " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
- " whether the biased portions are in the earlier or later timesteps."
- ),
- )
- parser.add_argument(
- "--snr_gamma",
- type=float,
- default=None,
- help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
- "More details here: https://arxiv.org/abs/2303.09556.",
- )
- parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
- parser.add_argument(
- "--allow_tf32",
- action="store_true",
- help=(
- "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
- " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
- ),
- )
- parser.add_argument(
- "--dataloader_num_workers",
- type=int,
- default=10,
- help=(
- "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ),
- )
- parser.add_argument(
- "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
- )
- parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
- parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
- parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
- parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
- parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
- parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
- parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
- parser.add_argument(
- "--prediction_type",
- type=str,
- default=None,
- help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
- )
- parser.add_argument(
- "--hub_model_id",
- type=str,
- default=None,
- help="The name of the repository to keep in sync with the local `output_dir`.",
- )
- parser.add_argument(
- "--logging_dir",
- type=str,
- default="logs",
- help=(
- "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
- " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
- ),
- )
- parser.add_argument(
- "--report_to",
- type=str,
- default="tensorboard",
- help=(
- 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
- ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
- ),
- )
- parser.add_argument(
- "--mixed_precision",
- type=str,
- default=None,
- choices=["no", "fp16", "bf16"],
- help=(
- "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
- " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
- " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
- ),
- )
- parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
- parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
-
- args = parser.parse_args()
- main(args)
diff --git a/opensora/train/train_t2v_diffusers.py b/opensora/train/train_t2v_diffusers.py
new file mode 100644
index 000000000..f9c7c6f05
--- /dev/null
+++ b/opensora/train/train_t2v_diffusers.py
@@ -0,0 +1,992 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+A minimal training script for DiT using PyTorch DDP.
+"""
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+from typing import Optional
+import gc
+import numpy as np
+from einops import rearrange
+import torch.utils
+import torch.utils.data
+from tqdm import tqdm
+
+from opensora.adaptor.modules import replace_with_fp32_forwards
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+ from opensora.acceleration.parallel_states import initialize_sequence_parallel_state, \
+ destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+ from opensora.acceleration.communications import prepare_parallel_data, broadcast
+except ImportError:
+ torch_npu = None
+ npu_config = None
+ from opensora.utils.parallel_states import initialize_sequence_parallel_state, \
+ destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+ from opensora.utils.communications import prepare_parallel_data, broadcast
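+# Editor's note: the try/except above picks the Ascend NPU helpers when
+# torch_npu is importable and otherwise falls back to the generic GPU/CPU
+# utilities; downstream code branches on `torch_npu is not None`.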
+import time
+from dataclasses import field, dataclass
+from torch.utils.data import DataLoader
+from copy import deepcopy
+import accelerate
+import torch
+from torch.nn import functional as F
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from packaging import version
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import DDPMScheduler, PNDMScheduler, DPMSolverMultistepScheduler
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+
+from opensora.models.causalvideovae import ae_stride_config, ae_channel_config
+from opensora.models.causalvideovae import ae_norm, ae_denorm
+from opensora.models import CausalVAEModelWrapper
+from opensora.models.text_encoder import get_text_enc, get_text_warpper
+from opensora.dataset import getdataset
+from opensora.models.diffusion import Diffusion_models, Diffusion_models_class
+from opensora.utils.dataset_utils import Collate, LengthGroupedSampler
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+from opensora.models.causalvideovae import ae_wrapper
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.24.0")
+logger = get_logger(__name__)
+from torch.utils.data import _utils
+_utils.MP_STATUS_CHECK_INTERVAL = 1800.0 # dataloader timeout (default is 5.0s), we increase it to 1800s.
+
+@torch.inference_mode()
+def log_validation(args, model, vae, text_encoder, tokenizer, accelerator, weight_dtype, global_step, ema=False):
+ positive_prompt = "(masterpiece), (best quality), (ultra-detailed), {}. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous"
+ negative_prompt = """nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry,
+ """
+ validation_prompt = [
+ "a cat wearing sunglasses and working as a lifeguard at pool.",
+ "A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene."
+ ]
+ if 'mt5' in args.text_encoder_name:
+ validation_prompt_cn = [
+ "一只戴着墨镜在泳池当救生员的猫咪。",
+ "这是一个宁静的水下场景,一只海龟游过珊瑚礁。海龟带着绿褐色的龟壳,优雅地游向画面右侧,成为视频的焦点。背景中的珊瑚礁生机盎然,为海龟的旅程提供了生动多彩的背景。几条小鱼在海龟周围穿梭,为画面增添了动感和活力。"
+ ]
+ validation_prompt += validation_prompt_cn
+ logger.info(f"Running validation....\n")
+ model = accelerator.unwrap_model(model)
+ scheduler = DPMSolverMultistepScheduler()
+ opensora_pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=model).to(device=accelerator.device)
+ videos = []
+ for prompt in validation_prompt:
+ logger.info('Processing prompt: {}'.format(prompt))
+ video = opensora_pipeline(
+ positive_prompt.format(prompt),
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.max_height,
+ width=args.max_width,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ enable_temporal_attentions=True,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ max_sequence_length=args.model_max_length,
+ ).images
+ videos.append(video[0])
+ gc.collect()
+ torch.cuda.empty_cache()
+ videos = torch.stack(videos).numpy()
+ videos = rearrange(videos, 'b t h w c -> b t c h w')
+ for tracker in accelerator.trackers:
+ if tracker.name == "tensorboard":
+ if videos.shape[1] == 1:
+ assert args.num_frames == 1
+ images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+ np_images = np.stack([np.asarray(img) for img in images])
+ tracker.writer.add_images(f"{'ema_' if ema else ''}validation", np_images, global_step, dataformats="NHWC")
+ else:
+ np_videos = np.stack([np.asarray(vid) for vid in videos])
+ tracker.writer.add_video(f"{'ema_' if ema else ''}validation", np_videos, global_step, fps=24)
+ if tracker.name == "wandb":
+ import wandb
+ if videos.shape[1] == 1:
+ images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+ logs = {
+ f"{'ema_' if ema else ''}validation": [
+ wandb.Image(image, caption=f"{i}: {prompt}")
+ for i, (image, prompt) in enumerate(zip(images, validation_prompt))
+ ]
+ }
+ else:
+ logs = {
+ f"{'ema_' if ema else ''}validation": [
+ wandb.Video(video, caption=f"{i}: {prompt}", fps=24)
+ for i, (video, prompt) in enumerate(zip(videos, validation_prompt))
+ ]
+ }
+ tracker.log(logs, step=global_step)
+
+ del opensora_pipeline
+ gc.collect()
+ torch.cuda.empty_cache()
+
+
+class ProgressInfo:
+ def __init__(self, global_step, train_loss=0.0):
+ self.global_step = global_step
+ self.train_loss = train_loss
+
+
+#################################################################################
+# Training Loop #
+#################################################################################
+
+def main(args):
+ logging_dir = Path(args.output_dir, args.logging_dir)
+
+ # always run LayerNorm, GELU and SiLU in fp32 under mixed precision
+ if args.enable_stable_fp32:
+ replace_with_fp32_forwards()
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(args)
+ npu_config.seed_everything(args.seed)
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+ accelerator = Accelerator(
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
+ mixed_precision=args.mixed_precision,
+ log_with=args.report_to,
+ project_config=accelerator_project_config,
+ )
+
+ if args.num_frames != 1 and args.use_image_num == 0:
+ initialize_sequence_parallel_state(args.sp_size)
+
+ if args.report_to == "wandb":
+ if not is_wandb_available():
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+ # Make one log on every process with the configuration for debugging.
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+ transformers.utils.logging.set_verbosity_warning()
+ diffusers.utils.logging.set_verbosity_info()
+ else:
+ transformers.utils.logging.set_verbosity_error()
+ diffusers.utils.logging.set_verbosity_error()
+
+ # If passed along, set the training seed now.
+ if args.seed is not None:
+ set_seed(args.seed, device_specific=True)
+
+ # Handle the repository creation
+ if accelerator.is_main_process:
+ if args.output_dir is not None:
+ os.makedirs(args.output_dir, exist_ok=True)
+
+ # For mixed precision training we cast all non-trainable weights to half-precision
+ # as these weights are only used for inference, keeping weights in full precision is not required.
+ weight_dtype = torch.float32
+ if accelerator.mixed_precision == "fp16":
+ weight_dtype = torch.float16
+ elif accelerator.mixed_precision == "bf16":
+ weight_dtype = torch.bfloat16
+
+ # Create model:
+ kwargs = {}
+ ae = ae_wrapper[args.ae](args.ae_path, cache_dir=args.cache_dir, **kwargs).eval()
+ if args.enable_tiling:
+ ae.vae.enable_tiling()
+ ae.vae.tile_overlap_factor = args.tile_overlap_factor
+
+ kwargs = {'load_in_8bit': args.enable_8bit_t5, 'torch_dtype': weight_dtype, 'low_cpu_mem_usage': True}
+ text_enc = get_text_warpper(args.text_encoder_name)(args, **kwargs).eval()
+
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ ae.vae_scale_factor = (ae_stride_t, ae_stride_h, ae_stride_w)
+ assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
+ args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
+ args.ae_stride = args.ae_stride_h
+ patch_size = args.model[-3:]
+ patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
+ args.patch_size = patch_size_h
+ args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
+ assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
+ # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
+ assert args.max_height % ae_stride_h == 0, f"Height must be divisible by ae_stride_h, but found Height ({args.max_height}), ae_stride_h ({ae_stride_h})."
+ assert args.max_width % ae_stride_h == 0, f"Width must be divisible by ae_stride_h, but found Width ({args.max_width}), ae_stride_h ({ae_stride_h})."
+
+ args.stride_t = ae_stride_t * patch_size_t
+ args.stride = ae_stride_h * patch_size_h
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ ae.latent_size = latent_size
+
+ if args.num_frames % 2 == 1:
+ args.latent_size_t = latent_size_t = (args.num_frames - 1) // ae_stride_t + 1
+ else:
+ latent_size_t = args.num_frames // ae_stride_t
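+ # Editor's note: with a causal VAE the first frame is encoded on its own,
+ # so with ae_stride_t=4 an odd 93-frame clip yields (93 - 1) // 4 + 1 = 24
+ # latent frames, while an even 92-frame clip yields 92 // 4 = 23.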
+ model = Diffusion_models[args.model](
+ in_channels=ae_channel_config[args.ae],
+ out_channels=ae_channel_config[args.ae],
+ # caption_channels=4096,
+ # cross_attention_dim=1152,
+ attention_bias=True,
+ sample_size=latent_size,
+ sample_size_t=latent_size_t,
+ num_vector_embeds=None,
+ activation_fn="gelu-approximate",
+ num_embeds_ada_norm=1000,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ double_self_attention=False,
+ upcast_attention=False,
+ # norm_type="ada_norm_single",
+ norm_elementwise_affine=False,
+ norm_eps=1e-6,
+ attention_type='default',
+ attention_mode=args.attention_mode,
+ interpolation_scale_h=args.interpolation_scale_h,
+ interpolation_scale_w=args.interpolation_scale_w,
+ interpolation_scale_t=args.interpolation_scale_t,
+ downsampler=args.downsampler,
+ # compress_kv_factor=args.compress_kv_factor,
+ use_rope=args.use_rope,
+ # model_max_length=args.model_max_length,
+ use_stable_fp32=args.enable_stable_fp32,
+ sparse1d=args.sparse1d,
+ sparse2d=args.sparse2d,
+ sparse_n=args.sparse_n,
+ use_motion=args.use_motion
+ )
+ model.gradient_checkpointing = args.gradient_checkpointing
+
+ # use pretrained model?
+ if args.pretrained:
+ model_state_dict = model.state_dict()
+ if 'safetensors' in args.pretrained: # pixart series
+ from safetensors.torch import load_file as safe_load
+ pretrained_checkpoint = safe_load(args.pretrained, device="cpu")
+ pretrained_keys = set(list(pretrained_checkpoint.keys()))
+ model_keys = set(list(model_state_dict.keys()))
+ common_keys = list(pretrained_keys & model_keys)
+ checkpoint = {k: pretrained_checkpoint[k] for k in common_keys if model_state_dict[k].numel() == pretrained_checkpoint[k].numel()}
+ # if checkpoint['pos_embed.proj.weight'].shape != model.pos_embed.proj.weight.shape and checkpoint['pos_embed.proj.weight'].ndim == 4:
+ # logger.info(f"Resize pos_embed, {checkpoint['pos_embed.proj.weight'].shape} -> {model.pos_embed.proj.weight.shape}")
+ # repeat = model.pos_embed.proj.weight.shape[2]
+ # checkpoint['pos_embed.proj.weight'] = checkpoint['pos_embed.proj.weight'].unsqueeze(2).repeat(1, 1, repeat, 1, 1) / float(repeat)
+ # del checkpoint['proj_out.weight'], checkpoint['proj_out.bias']
+ else: # latest stage training weight
+ checkpoint = torch.load(args.pretrained, map_location='cpu')
+ if 'model' in checkpoint:
+ checkpoint = checkpoint['model']
+ missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
+ logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
+ logger.info(f'Successfully loaded {len(model_state_dict) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')
+
+ # Freeze vae and text encoders.
+ ae.vae.requires_grad_(False)
+ text_enc.requires_grad_(False)
+ # Set model as trainable.
+ model.train()
+
+ ae.vae.tile_sample_min_size = args.tile_sample_min_size
+ ae.vae.tile_sample_min_size_t = args.tile_sample_min_size_t
+
+ noise_scheduler = DDPMScheduler(rescale_betas_zero_snr=args.rescale_betas_zero_snr)
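+ # Editor's note: `rescale_betas_zero_snr` is diffusers' implementation of the
+ # zero terminal SNR fix from "Common Diffusion Noise Schedules and Sample
+ # Steps are Flawed" (https://arxiv.org/abs/2305.08891), rescaling the beta
+ # schedule so the last timestep is pure noise.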
+ # Move the vae and text_encoder to device and cast to weight_dtype
+ # The VAE is in float32 to avoid NaN losses.
+ ae.vae.to(accelerator.device, dtype=torch.float32)
+ # ae.vae.to(accelerator.device, dtype=weight_dtype)
+ text_enc.to(accelerator.device, dtype=weight_dtype)
+
+ # Create EMA for the unet.
+ if args.use_ema:
+ ema_model = deepcopy(model)
+ ema_model = EMAModel(ema_model.parameters(), decay=args.ema_decay, update_after_step=args.ema_start_step,
+ model_cls=Diffusion_models_class[args.model], model_config=ema_model.config)
+
+ # `accelerate` 0.16.0 will have better support for customized saving
+ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+ def save_model_hook(models, weights, output_dir):
+ if accelerator.is_main_process:
+ if args.use_ema:
+ ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
+
+ for i, model in enumerate(models):
+ model.save_pretrained(os.path.join(output_dir, "model"))
+ if weights: # Don't pop if empty
+ # make sure to pop weight so that corresponding model is not saved again
+ weights.pop()
+
+ def load_model_hook(models, input_dir):
+ if args.use_ema:
+ load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), Diffusion_models_class[args.model])
+ ema_model.load_state_dict(load_model.state_dict())
+ ema_model.to(accelerator.device)
+ del load_model
+
+ for i in range(len(models)):
+ # pop models so that they are not loaded again
+ model = models.pop()
+
+ # load diffusers style into model
+ load_model = Diffusion_models_class[args.model].from_pretrained(input_dir, subfolder="model")
+ model.register_to_config(**load_model.config)
+
+ model.load_state_dict(load_model.state_dict())
+ del load_model
+
+ accelerator.register_save_state_pre_hook(save_model_hook)
+ accelerator.register_load_state_pre_hook(load_model_hook)
+
+ # Enable TF32 for faster training on Ampere GPUs,
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+ if args.allow_tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ if args.scale_lr:
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+ )
+
+ params_to_optimize = model.parameters()
+ # Optimizer creation
+ if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+ logger.warning(
+ f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+ "Defaulting to adamW"
+ )
+ args.optimizer = "adamw"
+
+ if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+ logger.warning(
+ f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+ f"set to {args.optimizer.lower()}"
+ )
+
+ if args.optimizer.lower() == "adamw":
+ if args.use_8bit_adam:
+ try:
+ import bitsandbytes as bnb
+ except ImportError:
+ raise ImportError(
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+ )
+
+ optimizer_class = bnb.optim.AdamW8bit
+ else:
+ optimizer_class = torch.optim.AdamW
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ )
+
+ if args.optimizer.lower() == "prodigy":
+ try:
+ import prodigyopt
+ except ImportError:
+ raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+ optimizer_class = prodigyopt.Prodigy
+
+ if args.learning_rate <= 0.1:
+ logger.warning(
+ "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+ )
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ beta3=args.prodigy_beta3,
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ decouple=args.prodigy_decouple,
+ use_bias_correction=args.prodigy_use_bias_correction,
+ safeguard_warmup=args.prodigy_safeguard_warmup,
+ )
+ logger.info(f"optimizer: {optimizer}")
+
+ # Setup data:
+
+ if args.trained_data_global_step is not None:
+ initial_global_step_for_sampler = args.trained_data_global_step
+ else:
+ initial_global_step_for_sampler = 0
+ train_dataset = getdataset(args)
+ sampler = LengthGroupedSampler(
+ args.train_batch_size,
+ world_size=accelerator.num_processes,
+ gradient_accumulation_size=args.gradient_accumulation_steps,
+ initial_global_step=initial_global_step_for_sampler,
+ lengths=train_dataset.lengths,
+ group_data=args.group_data,
+ )
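+ # Editor's note (assumption): grouping samples of similar length cuts padding
+ # waste inside a batch, and `initial_global_step` lets a resumed run skip the
+ # data the sampler had already served before the checkpoint.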
+ train_dataloader = DataLoader(
+ train_dataset,
+ shuffle=False,
+ # pin_memory=True,
+ collate_fn=Collate(args),
+ batch_size=args.train_batch_size,
+ num_workers=args.dataloader_num_workers,
+ sampler=sampler,
+ drop_last=True,
+ prefetch_factor=4
+ )
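+ # Editor's note: PyTorch accepts `prefetch_factor` only when num_workers > 0,
+ # so this DataLoader raises a ValueError if --dataloader_num_workers is 0.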
+ logger.info(f'after train_dataloader')
+
+ # Scheduler and math around the number of training steps.
+ overrode_max_train_steps = False
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if args.max_train_steps is None:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ overrode_max_train_steps = True
+
+ lr_scheduler = get_scheduler(
+ args.lr_scheduler,
+ optimizer=optimizer,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+ )
+
+ # Prepare everything with our `accelerator`.
+ if args.adapt_vae:
+ model.requires_grad_(False)
+ for name, param in model.named_parameters():
+ if 'pos_embed' in name or 'proj_out' in name:
+ param.requires_grad = True
+ logger.info(f'before accelerator.prepare')
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+ model, optimizer, train_dataloader, lr_scheduler
+ )
+ logger.info(f'after accelerator.prepare')
+ if args.use_ema:
+ ema_model.to(accelerator.device)
+
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if overrode_max_train_steps:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ # Afterwards we recalculate our number of training epochs
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+ # We need to initialize the trackers we use, and also store our configuration.
+ # The trackers initializes automatically on the main process.
+ if accelerator.is_main_process:
+ accelerator.init_trackers(os.path.basename(args.output_dir), config=vars(args))
+
+ # Train!
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+ total_batch_size = total_batch_size // args.sp_size * args.train_sp_batch_size
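+    # With sequence parallelism, a group of sp_size ranks works on the same samples, so the number
+    # of independent samples per optimizer step is scaled by train_sp_batch_size / sp_size.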
+
+ logger.info("***** Running training *****")
+ logger.info(f" Model = {model}")
+ logger.info(f" Num examples = {len(train_dataset)}")
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
+    logger.info(f"  Num update steps per epoch = {num_update_steps_per_epoch}")
+    logger.info(f"  Total trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B")
+
+ global_step = 0
+ first_epoch = 0
+ # Potentially load in the weights and states from a previous save
+ if args.resume_from_checkpoint:
+ if args.resume_from_checkpoint != "latest":
+ path = os.path.basename(args.resume_from_checkpoint)
+ else:
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.output_dir)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ if path is None:
+ accelerator.print(
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+ )
+ args.resume_from_checkpoint = None
+ initial_global_step = 0
+ else:
+ accelerator.print(f"Resuming from checkpoint {path}")
+ accelerator.load_state(os.path.join(args.output_dir, path))
+ global_step = int(path.split("-")[1])
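+            # Checkpoint directories are named "checkpoint-<global_step>", so the step counter is
+            # recovered directly from the directory name.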
+
+ initial_global_step = global_step
+ first_epoch = global_step // num_update_steps_per_epoch
+
+ else:
+ initial_global_step = 0
+
+ progress_bar = tqdm(
+ range(0, args.max_train_steps),
+ initial=initial_global_step,
+ desc="Steps",
+ # Only show the progress bar once on each machine.
+ disable=not accelerator.is_local_main_process,
+ )
+ progress_info = ProgressInfo(global_step, train_loss=0.0)
+
+ def sync_gradients_info(loss):
+ # Checks if the accelerator has performed an optimization step behind the scenes
+ if args.use_ema:
+ ema_model.step(model.parameters())
+ progress_bar.update(1)
+ progress_info.global_step += 1
+ end_time = time.time()
+ one_step_duration = end_time - start_time
+ accelerator.log({"train_loss": progress_info.train_loss}, step=progress_info.global_step)
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(f"Step: [{progress_info.global_step}], local_loss={loss.detach().item()}, "
+ f"train_loss={progress_info.train_loss}, time_cost={one_step_duration}",
+ rank=0)
+ progress_info.train_loss = 0.0
+
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+ if progress_info.global_step % args.checkpointing_steps == 0:
+ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+ if accelerator.is_main_process and args.checkpoints_total_limit is not None:
+ checkpoints = os.listdir(args.output_dir)
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+ if len(checkpoints) >= args.checkpoints_total_limit:
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+ removing_checkpoints = checkpoints[0:num_to_remove]
+
+ logger.info(
+ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+ )
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+ for removing_checkpoint in removing_checkpoints:
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+ shutil.rmtree(removing_checkpoint)
+
+ save_path = os.path.join(args.output_dir, f"checkpoint-{progress_info.global_step}")
+ accelerator.save_state(save_path)
+ logger.info(f"Saved state to {save_path}")
+
+ logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+ progress_bar.set_postfix(**logs)
+
+ def run(model_input, model_kwargs, prof):
+ global start_time
+ start_time = time.time()
+
+ noise = torch.randn_like(model_input)
+ if args.noise_offset:
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+ noise += args.noise_offset * torch.randn((model_input.shape[0], model_input.shape[1], 1, 1, 1),
+ device=model_input.device)
+
+ bsz = model_input.shape[0]
+ current_step_frame = model_input.shape[2]
+ # Sample a random timestep for each image without bias.
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device)
+ # print('accelerator.process_index, timesteps', accelerator.process_index, timesteps)
+        if current_step_frame != 1 and get_sequence_parallel_state():  # images do not need sp
+ broadcast(timesteps)
+ motion_score = model_kwargs.pop('motion_score', None)
+ if motion_score is not None:
+ raise NotImplementedError
+
+ # Add noise to the model input according to the noise magnitude at each timestep
+ # (this is the forward diffusion process)
+
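+        # For DDPM, add_noise computes x_t = sqrt(alphas_cumprod[t]) * x_0 + sqrt(1 - alphas_cumprod[t]) * noise.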
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+ model_pred = model(
+ noisy_model_input,
+ timesteps,
+ **model_kwargs
+ )[0]
+ # Get the target for loss depending on the prediction type
+ if args.prediction_type is not None:
+ # set prediction_type of scheduler if defined
+ noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+ if noise_scheduler.config.prediction_type == "epsilon":
+ target = noise
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+ elif noise_scheduler.config.prediction_type == "sample":
+ # We set the target to latents here, but the model_pred will return the noise sample prediction.
+ target = model_input
+ # We will have to subtract the noise residual from the prediction to get the target sample.
+ model_pred = model_pred - noise
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+        mask = model_kwargs.get('attention_mask', None)
+        if mask is not None and torch.all(mask.bool()):
+            mask = None  # an all-ones mask is equivalent to no mask, so take the cheaper unmasked path
+ if get_sequence_parallel_state():
+ assert mask is None
+ b, c, _, _, _ = model_pred.shape
+ if mask is not None:
+ mask = mask.unsqueeze(1).repeat(1, c, 1, 1, 1).float() # b t h w -> b c t h w
+ mask = mask.reshape(b, -1)
+ if args.snr_gamma is None:
+ # model_pred: b c t h w, attention_mask: b t h w
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.reshape(b, -1)
+ if mask is not None:
+ loss = (loss * mask).sum() / mask.sum() # mean loss on unpad patches
+ else:
+ loss = loss.mean()
+ else:
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+ # This is discussed in Section 4.2 of the same paper.
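+            # Concretely, the weight is w(t) = min(SNR(t), gamma) / SNR(t) for epsilon-prediction
+            # and w(t) = min(SNR(t), gamma) / (SNR(t) + 1) for v-prediction, as implemented below.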
+ snr = compute_snr(noise_scheduler, timesteps)
+ mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+ dim=1
+ )[0]
+ if noise_scheduler.config.prediction_type == "epsilon":
+ mse_loss_weights = mse_loss_weights / snr
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ mse_loss_weights = mse_loss_weights / (snr + 1)
+ else:
+                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.reshape(b, -1)
+ mse_loss_weights = mse_loss_weights.reshape(b, 1)
+ if mask is not None:
+ loss = (loss * mask * mse_loss_weights).sum() / mask.sum() # mean loss on unpad patches
+ else:
+ loss = (loss * mse_loss_weights).mean()
+
+ # Gather the losses across all processes for logging (if we use distributed training).
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+ progress_info.train_loss += avg_loss.detach().item() / args.gradient_accumulation_steps
+
+ # Backpropagate
+ accelerator.backward(loss)
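+        # Gradients are only complete on steps where accelerate syncs them (the last micro-batch
+        # of an accumulation window), so clipping and the optimizer step are gated on sync_gradients.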
+ if accelerator.sync_gradients:
+ params_to_clip = model.parameters()
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ if accelerator.sync_gradients:
+ sync_gradients_info(loss)
+
+ if accelerator.is_main_process:
+
+ if progress_info.global_step % args.checkpointing_steps == 0:
+
+ if args.enable_tracker:
+ log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ weight_dtype, progress_info.global_step)
+
+ if args.use_ema and npu_config is None:
+ # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+ ema_model.store(model.parameters())
+ ema_model.copy_to(model.parameters())
+ log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ weight_dtype, progress_info.global_step, ema=True)
+ # Switch back to the original UNet parameters.
+ ema_model.restore(model.parameters())
+
+ if prof is not None:
+ prof.step()
+
+
+ return loss
+
+ def train_one_step(step_, data_item_, prof_=None):
+ train_loss = 0.0
+ x, attn_mask, input_ids, cond_mask, motion_score = data_item_
+ # print(f'step: {step_}, rank: {accelerator.process_index}, x: {x.shape}')
+ assert not torch.any(torch.isnan(x)), 'torch.any(torch.isnan(x))'
+ x = x.to(accelerator.device, dtype=ae.vae.dtype) # B C T+num_images H W, 16 + 4
+
+ attn_mask = attn_mask.to(accelerator.device) # B T+num_images H W
+ input_ids = input_ids.to(accelerator.device) # B 1+num_images L
+ cond_mask = cond_mask.to(accelerator.device) # B 1+num_images L
+ motion_score = motion_score.to(accelerator.device) if motion_score is not None else motion_score # B 1
+ # if accelerator.process_index == 0:
+ # logger.info(f'rank: {accelerator.process_index}, x: {x.shape}, attn_mask: {attn_mask.shape}')
+
+ with torch.no_grad():
+ B, N, L = input_ids.shape # B 1+num_images L
+ # use batch inference
+ input_ids_ = input_ids.reshape(-1, L)
+ cond_mask_ = cond_mask.reshape(-1, L)
+ cond = text_enc(input_ids_, cond_mask_) # B 1+num_images L D
+ cond = cond.reshape(B, N, L, -1)
+ # Map input images to latent space + normalize latents
+ x = ae.encode(x) # B C T H W
+
+ # def custom_to_video(x: torch.Tensor, fps: float = 2.0, output_file: str = 'output_video.mp4') -> None:
+ # from examples.rec_video import array_to_video
+ # x = x.detach().cpu()
+ # x = torch.clamp(x, -1, 1)
+ # x = (x + 1) / 2
+ # x = x.permute(1, 2, 3, 0).numpy()
+ # x = (255*x).astype(np.uint8)
+ # array_to_video(x, fps=fps, output_file=output_file)
+ # return
+ # videos = ae.decode(x)[0]
+ # videos = videos.transpose(0, 1)
+ # custom_to_video(videos.to(torch.float32), fps=24, output_file='tmp.mp4')
+ # import sys;sys.exit()
+ current_step_frame = x.shape[2]
+ current_step_sp_state = get_sequence_parallel_state()
+ if args.sp_size != 1: # enable sp
+            if current_step_frame == 1:  # but images do not need sp
+ set_sequence_parallel_state(False)
+ else:
+ set_sequence_parallel_state(True)
+ if get_sequence_parallel_state():
+ x, cond, attn_mask, cond_mask, use_image_num = prepare_parallel_data(x, cond, attn_mask, cond_mask,
+ args.use_image_num)
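+            # Under sequence parallelism the sp_size ranks each hold a slice of the same sample,
+            # which is why run() broadcasts timesteps: every rank must denoise at the same t.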
+            for sp_step in range(args.train_batch_size * args.sp_size // args.train_sp_batch_size):
+                with accelerator.accumulate(model):
+                    st_idx = sp_step * args.train_sp_batch_size
+                    ed_idx = (sp_step + 1) * args.train_sp_batch_size
+ model_kwargs = dict(encoder_hidden_states=cond[st_idx: ed_idx],
+ attention_mask=attn_mask[st_idx: ed_idx],
+ encoder_attention_mask=cond_mask[st_idx: ed_idx], use_image_num=use_image_num)
+ run(x[st_idx: ed_idx], model_kwargs, prof_)
+
+ else:
+ with accelerator.accumulate(model):
+ assert not torch.any(torch.isnan(x)), 'after vae'
+ x = x.to(weight_dtype)
+ # print(f'rank: {accelerator.process_index}, x: {x.shape}')
+ model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask, motion_score=motion_score,
+ encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
+ run(x, model_kwargs, prof_)
+
+        set_sequence_parallel_state(current_step_sp_state)  # restore, in case the next step uses sp (which needs broadcast(timesteps))
+
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ return False
+
+ def train_one_epoch(prof_=None):
+ # for epoch in range(first_epoch, args.num_train_epochs):
+ progress_info.train_loss = 0.0
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ for step, data_item in enumerate(train_dataloader):
+ if train_one_step(step, data_item, prof_):
+ break
+
+ if step >= 2 and torch_npu is not None and npu_config is not None:
+ npu_config.free_mm()
+
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = f"/home/image_data/npu_profiling_t2v/{os.getenv('PROJECT_NAME', 'local')}"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=npu_config.profiling_step, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ train_one_epoch(prof)
+ else:
+ train_one_epoch()
+ accelerator.wait_for_everyone()
+ accelerator.end_training()
+ if get_sequence_parallel_state():
+ destroy_sequence_parallel_group()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # dataset & dataloader
+ parser.add_argument("--dataset", type=str, required=True)
+    parser.add_argument("--data", type=str, required=True)
+ parser.add_argument("--sample_rate", type=int, default=1)
+ parser.add_argument("--train_fps", type=int, default=24)
+ parser.add_argument("--drop_short_ratio", type=float, default=1.0)
+ parser.add_argument("--speed_factor", type=float, default=1.0)
+ parser.add_argument("--num_frames", type=int, default=65)
+ parser.add_argument("--max_height", type=int, default=320)
+ parser.add_argument("--max_width", type=int, default=240)
+ parser.add_argument("--use_img_from_vid", action="store_true")
+ parser.add_argument("--use_image_num", type=int, default=0)
+ parser.add_argument("--model_max_length", type=int, default=512)
+ parser.add_argument('--cfg', type=float, default=0.1)
+ parser.add_argument("--dataloader_num_workers", type=int, default=10, help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.")
+ parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader.")
+ parser.add_argument("--group_data", action="store_true")
+ parser.add_argument("--hw_stride", type=int, default=32)
+ parser.add_argument("--skip_low_resolution", action="store_true")
+ parser.add_argument("--force_resolution", action="store_true")
+ parser.add_argument("--trained_data_global_step", type=int, default=None)
+ parser.add_argument("--use_decord", action="store_true")
+
+ # text encoder & vae & diffusion model
+ parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="Latte-XL/122")
+ parser.add_argument('--enable_8bit_t5', action='store_true')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.125)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument("--compress_kv", action="store_true")
+ parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="xformers")
+ parser.add_argument('--use_rope', action='store_true')
+ parser.add_argument('--compress_kv_factor', type=int, default=1)
+ parser.add_argument('--interpolation_scale_h', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_w', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_t', type=float, default=1.0)
+ parser.add_argument("--downsampler", type=str, default=None)
+ parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--pretrained", type=str, default=None)
+ parser.add_argument('--enable_stable_fp32', action='store_true')
+ parser.add_argument('--sparse1d', action='store_true')
+ parser.add_argument('--sparse2d', action='store_true')
+ parser.add_argument('--sparse_n', type=int, default=2)
+ parser.add_argument('--tile_sample_min_size', type=int, default=512)
+ parser.add_argument('--tile_sample_min_size_t', type=int, default=33)
+ parser.add_argument('--adapt_vae', action='store_true')
+ parser.add_argument('--use_motion', action='store_true')
+ parser.add_argument("--gradient_checkpointing", action="store_true", help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.")
+
+ # diffusion setting
+ parser.add_argument("--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556.")
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+ parser.add_argument("--ema_decay", type=float, default=0.999)
+ parser.add_argument("--ema_start_step", type=int, default=0)
+ parser.add_argument("--noise_offset", type=float, default=0.0, help="The scale of noise offset.")
+    parser.add_argument("--prediction_type", type=str, default=None, help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.")
+ parser.add_argument('--rescale_betas_zero_snr', action='store_true')
+
+ # validation & logs
+ parser.add_argument("--num_sampling_steps", type=int, default=20)
+ parser.add_argument('--guidance_scale', type=float, default=4.5)
+ parser.add_argument("--enable_tracker", action="store_true")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+ parser.add_argument("--output_dir", type=str, default=None, help="The output directory where the model predictions and checkpoints will be written.")
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None, help=("Max number of checkpoints to store."))
+ parser.add_argument("--checkpointing_steps", type=int, default=500,
+ help=(
+ "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+ " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+ " training using `--resume_from_checkpoint`."
+ ),
+ )
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None,
+ help=(
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+ ),
+ )
+ parser.add_argument("--logging_dir", type=str, default="logs",
+ help=(
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+ ),
+ )
+ parser.add_argument("--report_to", type=str, default="tensorboard",
+ help=(
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+ ),
+ )
+ # optimizer & scheduler
+ parser.add_argument("--num_train_epochs", type=int, default=100)
+ parser.add_argument("--max_train_steps", type=int, default=None, help="Total number of training steps to perform. If provided, overrides num_train_epochs.")
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
+ parser.add_argument("--optimizer", type=str, default="adamW", help='The optimizer type to use. Choose between ["AdamW", "prodigy"]')
+ parser.add_argument("--learning_rate", type=float, default=1e-4, help="Initial learning rate (after the potential warmup period) to use.")
+ parser.add_argument("--scale_lr", action="store_true", default=False, help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.")
+ parser.add_argument("--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler.")
+ parser.add_argument("--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW")
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-02, help="Weight decay to use for unet params")
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder")
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer and Prodigy optimizers.")
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True, help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True, help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ parser.add_argument("--prodigy_beta3", type=float, default=None,
+                        help="Coefficient for computing the Prodigy stepsize using running averages. If set to None, "
+                        "uses the value of the square root of beta2. Ignored if optimizer is adamW",
+ )
+ parser.add_argument("--lr_scheduler", type=str, default="constant",
+ help=(
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
+ parser.add_argument("--allow_tf32", action="store_true",
+ help=(
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+ ),
+ )
+ parser.add_argument("--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"],
+ help=(
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+                            " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+ ),
+ )
+
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+ parser.add_argument("--sp_size", type=int, default=1, help="For sequence parallel")
+ parser.add_argument("--train_sp_batch_size", type=int, default=1, help="Batch size for sequence parallel training")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/opensora/train/train_t2v_diffusers_lora copy.py b/opensora/train/train_t2v_diffusers_lora copy.py
new file mode 100644
index 000000000..7b1098a75
--- /dev/null
+++ b/opensora/train/train_t2v_diffusers_lora copy.py
@@ -0,0 +1,960 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+A minimal training script for DiT using PyTorch DDP.
+"""
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+from typing import Optional
+import gc
+import numpy as np
+from einops import rearrange
+from tqdm import tqdm
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+except Exception:
+    torch_npu = None
+    npu_config = None
+    from opensora.utils.parallel_states import initialize_sequence_parallel_state, \
+        destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+    from opensora.utils.communications import prepare_parallel_data, broadcast
+import time
+from dataclasses import field, dataclass
+from torch.utils.data import DataLoader
+from copy import deepcopy
+import accelerate
+import torch
+from torch.nn import functional as F
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo
+from packaging import version
+from tqdm.auto import tqdm
+from transformers import HfArgumentParser, TrainingArguments, AutoTokenizer
+
+from peft import LoraConfig, PeftModel, get_peft_model
+
+import diffusers
+from diffusers import DDPMScheduler, PNDMScheduler, DPMSolverMultistepScheduler
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+
+from opensora.dataset import getdataset, ae_denorm
+from opensora.models.ae import getae, getae_wrapper
+from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
+from opensora.models.diffusion.latte.modeling_latte import LatteT2V
+from opensora.models.text_encoder import get_text_enc, get_text_warpper
+from opensora.utils.dataset_utils import Collate
+from opensora.utils.lora_utils import EMAModel_LoRA, maybe_zero_3, get_peft_state_maybe_zero_3
+from opensora.models.ae import ae_stride_config, ae_channel_config
+from opensora.models.diffusion import Diffusion_models, Diffusion_models_class
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.24.0")
+logger = get_logger(__name__)
+
+
+@torch.inference_mode()
+def log_validation(args, model, vae, text_encoder, tokenizer, accelerator, weight_dtype, global_step, ema=False):
+ validation_prompt = [
+ "a cat wearing sunglasses and working as a lifeguard at pool.",
+ "A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene."
+ ]
+ if 'mt5' in args.text_encoder_name:
+ validation_prompt_cn = [
+ "一只戴着墨镜在泳池当救生员的猫咪。",
+ "这是一个宁静的水下场景,一只海龟游过珊瑚礁。海龟带着绿褐色的龟壳,优雅地游向画面右侧,成为视频的焦点。背景中的珊瑚礁生机盎然,为海龟的旅程提供了生动多彩的背景。几条小鱼在海龟周围穿梭,为画面增添了动感和活力。"
+ ]
+ validation_prompt += validation_prompt_cn
+    logger.info("Running validation...\n")
+ model = accelerator.unwrap_model(model)
+ scheduler = DPMSolverMultistepScheduler()
+ opensora_pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=model).to(device=accelerator.device)
+ videos = []
+ for prompt in validation_prompt:
+        logger.info('Processing prompt: {}'.format(prompt))
+ video = opensora_pipeline(prompt,
+ num_frames=args.num_frames,
+ # num_frames=1,
+ height=args.max_height,
+ width=args.max_width,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ enable_temporal_attentions=True,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ max_sequence_length=150,
+ ).images
+ videos.append(video[0])
+ # import ipdb;ipdb.set_trace()
+ gc.collect()
+ torch.cuda.empty_cache()
+ videos = torch.stack(videos).numpy()
+ videos = rearrange(videos, 'b t h w c -> b t c h w')
+ for tracker in accelerator.trackers:
+ if tracker.name == "tensorboard":
+ if videos.shape[1] == 1:
+ assert args.num_frames == 1
+ images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+ np_images = np.stack([np.asarray(img) for img in images])
+ tracker.writer.add_images(f"{'ema_' if ema else ''}validation", np_images, global_step, dataformats="NHWC")
+ else:
+ np_videos = np.stack([np.asarray(vid) for vid in videos])
+ tracker.writer.add_video(f"{'ema_' if ema else ''}validation", np_videos, global_step, fps=30)
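+                # SummaryWriter.add_video expects (N, T, C, H, W), which the rearrange above produces.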
+ if tracker.name == "wandb":
+ import wandb
+ if videos.shape[1] == 1:
+ # assert args.num_frames == 1
+ images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+ # import ipdb;ipdb.set_trace()
+ logs = {
+ f"{'ema_' if ema else ''}validation": [
+ wandb.Image(image, caption=f"{i}: {prompt}")
+ for i, (image, prompt) in enumerate(zip(images, validation_prompt))
+ ]
+ }
+ else:
+ logs = {
+ f"{'ema_' if ema else ''}validation": [
+ wandb.Video(video, caption=f"{i}: {prompt}", fps=30)
+ for i, (video, prompt) in enumerate(zip(videos, validation_prompt))
+ ]
+ }
+ # import ipdb;ipdb.set_trace()
+ if hasattr(model.pos_embed, 'temp_embed_gate'):
+ logs.update({'temp_embed_gate (tanh)': float(model.pos_embed.temp_embed_gate.tanh().item())})
+ tracker.log(logs, step=global_step)
+
+ del opensora_pipeline
+ gc.collect()
+ torch.cuda.empty_cache()
+
+
+class ProgressInfo:
+ def __init__(self, global_step, train_loss=0.0):
+ self.global_step = global_step
+ self.train_loss = train_loss
+
+
+#################################################################################
+# Training Loop #
+#################################################################################
+
+def main(args):
+ logging_dir = Path(args.output_dir, args.logging_dir)
+
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(args)
+ npu_config.seed_everything()
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+ accelerator = Accelerator(
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
+ mixed_precision=args.mixed_precision,
+ log_with=args.report_to,
+ project_config=accelerator_project_config,
+ )
+
+ if args.report_to == "wandb":
+ if not is_wandb_available():
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+ # Make one log on every process with the configuration for debugging.
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+ transformers.utils.logging.set_verbosity_warning()
+ diffusers.utils.logging.set_verbosity_info()
+ else:
+ transformers.utils.logging.set_verbosity_error()
+ diffusers.utils.logging.set_verbosity_error()
+
+ # If passed along, set the training seed now.
+ if args.seed is not None:
+ set_seed(args.seed)
+
+ # Handle the repository creation
+ if accelerator.is_main_process:
+ if args.output_dir is not None:
+ os.makedirs(args.output_dir, exist_ok=True)
+
+ # For mixed precision training we cast all non-trainable weigths to half-precision
+ # as these weights are only used for inference, keeping weights in full precision is not required.
+ weight_dtype = torch.float32
+ if accelerator.mixed_precision == "fp16":
+ weight_dtype = torch.float16
+ elif accelerator.mixed_precision == "bf16":
+ weight_dtype = torch.bfloat16
+
+ # Create model:
+ kwargs = {}
+ ae = getae_wrapper(args.ae)(args.ae_path, cache_dir=args.cache_dir, **kwargs).eval()
+ if args.enable_tiling:
+ ae.vae.enable_tiling()
+ ae.vae.tile_overlap_factor = args.tile_overlap_factor
+
+ kwargs = {'load_in_8bit': args.enable_8bit_t5, 'torch_dtype': weight_dtype, 'low_cpu_mem_usage': True}
+ text_enc = get_text_warpper(args.text_encoder_name)(args, **kwargs).eval()
+
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ ae.vae_scale_factor = (ae_stride_t, ae_stride_h, ae_stride_w)
+ assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
+ args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
+ args.ae_stride = args.ae_stride_h
+ patch_size = args.model[-3:]
+ patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
+ args.patch_size = patch_size_h
+ args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
+ assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
+ # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
+ assert args.max_height % ae_stride_h == 0, f"Height must be divisible by ae_stride_h, but found Height ({args.max_height}), ae_stride_h ({ae_stride_h})."
+    assert args.max_width % ae_stride_h == 0, f"Width must be divisible by ae_stride_h, but found Width ({args.max_width}), ae_stride_h ({ae_stride_h})."
+
+ args.stride_t = ae_stride_t * patch_size_t
+ args.stride = ae_stride_h * patch_size_h
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ ae.latent_size = latent_size
+
+ if args.num_frames % 2 == 1:
+ args.latent_size_t = latent_size_t = (args.num_frames - 1) // ae_stride_t + 1
+ else:
+ latent_size_t = args.num_frames // ae_stride_t
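+    # A causal VAE encodes the leading frame on its own, so an odd num_frames of the form
+    # k * ae_stride_t + 1 yields (num_frames - 1) // ae_stride_t + 1 temporal latents; even counts
+    # are assumed to divide evenly by the temporal stride.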
+ model = Diffusion_models[args.model](
+ in_channels=ae_channel_config[args.ae],
+ out_channels=ae_channel_config[args.ae],
+ # caption_channels=4096,
+ # cross_attention_dim=1152,
+ attention_bias=True,
+ sample_size=latent_size,
+ sample_size_t=latent_size_t,
+ num_vector_embeds=None,
+ activation_fn="gelu-approximate",
+ num_embeds_ada_norm=1000,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ double_self_attention=False,
+ upcast_attention=False,
+ # norm_type="ada_norm_single",
+ norm_elementwise_affine=False,
+ norm_eps=1e-6,
+ attention_type='default',
+ attention_mode=args.attention_mode,
+ interpolation_scale_h=args.interpolation_scale_h,
+ interpolation_scale_w=args.interpolation_scale_w,
+ interpolation_scale_t=args.interpolation_scale_t,
+ downsampler=args.downsampler,
+ # compress_kv_factor=args.compress_kv_factor,
+ use_rope=args.use_rope,
+ # model_max_length=args.model_max_length,
+ use_stable_fp32=args.enable_stable_fp32,
+ )
+ model.gradient_checkpointing = args.gradient_checkpointing
+
+
+ # # use pretrained model?
+ if args.pretrained:
+ model_state_dict = model.state_dict()
+ if 'safetensors' in args.pretrained: # pixart series
+ from safetensors.torch import load_file as safe_load
+ # import ipdb;ipdb.set_trace()
+ pretrained_checkpoint = safe_load(args.pretrained, device="cpu")
+ pretrained_keys = set(list(pretrained_checkpoint.keys()))
+ model_keys = set(list(model_state_dict.keys()))
+ common_keys = list(pretrained_keys & model_keys)
+ checkpoint = {k: pretrained_checkpoint[k] for k in common_keys if model_state_dict[k].numel() == pretrained_checkpoint[k].numel()}
+ # if checkpoint['pos_embed.proj.weight'].shape != model.pos_embed.proj.weight.shape and checkpoint['pos_embed.proj.weight'].ndim == 4:
+ # logger.info(f"Resize pos_embed, {checkpoint['pos_embed.proj.weight'].shape} -> {model.pos_embed.proj.weight.shape}")
+ # repeat = model.pos_embed.proj.weight.shape[2]
+ # checkpoint['pos_embed.proj.weight'] = checkpoint['pos_embed.proj.weight'].unsqueeze(2).repeat(1, 1, repeat, 1, 1) / float(repeat)
+ # del checkpoint['proj_out.weight'], checkpoint['proj_out.bias']
+ else: # latest stage training weight
+ checkpoint = torch.load(args.pretrained, map_location='cpu')
+ if 'model' in checkpoint:
+ checkpoint = checkpoint['model']
+ missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
+ logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
+ logger.info(f'Successfully load {len(model_state_dict) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')
+
+ # Freeze vae and text encoders.
+ ae.vae.requires_grad_(False)
+ text_enc.requires_grad_(False)
+ model.requires_grad_(False)
+ # # Set model as trainable.
+ # model.train()
+
+ noise_scheduler = DDPMScheduler()
+ # Move unet, vae and text_encoder to device and cast to weight_dtype
+ # The VAE is in float32 to avoid NaN losses.
+ ae.vae.to(accelerator.device, dtype=torch.float32)
+ # ae.vae.to(accelerator.device, dtype=weight_dtype)
+ text_enc.to(accelerator.device, dtype=weight_dtype)
+
+
+
+
+ # now we will add new LoRA weights to the attention layers
+ # Set correct lora layers
+ if args.enable_lora:
+ lora_config = LoraConfig(
+ r=args.rank,
+ lora_alpha=args.rank,
+ init_lora_weights="gaussian",
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ )
+ model = get_peft_model(model, lora_config)
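+        # LoRA replaces each targeted projection W with W + (lora_alpha / r) * B @ A, where only
+        # the low-rank A and B matrices train; with lora_alpha == r the update is applied at
+        # scale 1. target_modules names the diffusers attention projections (q/k/v and the first
+        # linear of to_out).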
+ # Create EMA for the unet.
+ if args.use_ema:
+ ema_model = deepcopy(model)
+ if args.enable_lora: # ema the whole lora_model
+ ema_model = EMAModel_LoRA(lora_config, parameters=ema_model.parameters(), update_after_step=args.ema_start_step,
+ model_cls=Diffusion_models_class[args.model], model_config=ema_model.config)
+ else:
+ ema_model = EMAModel(ema_model.parameters(), update_after_step=args.ema_start_step,
+ model_cls=Diffusion_models_class[args.model], model_config=ema_model.config)
+
+
+ # `accelerate` 0.16.0 will have better support for customized saving
+ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+ def save_model_hook(models, weights, output_dir):
+ if accelerator.is_main_process:
+ if args.use_ema:
+ if args.enable_lora: # only save lora weight
+ ema_model.save_pretrained(os.path.join(output_dir, "model_ema_lora"))
+ else:
+ ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
+
+ for i, model in enumerate(models):
+ if args.enable_lora: # only save lora weight
+ model.save_pretrained(os.path.join(output_dir, "model_lora"))
+ else:
+ model.save_pretrained(os.path.join(output_dir, "model"))
+ if weights: # Don't pop if empty
+ # make sure to pop weight so that corresponding model is not saved again
+ weights.pop()
+
+ def load_model_hook(models, input_dir):
+ if args.use_ema:
+ if args.enable_lora:
+ load_model = EMAModel_LoRA.from_pretrained(os.path.join(input_dir, "model_ema_lora"), Diffusion_models_class[args.model],
+ lora_config, os.path.splitext(args.pretrained))
+ ema_model.load_state_dict(load_model.state_dict())
+ ema_model.to(accelerator.device)
+ del load_model
+
+ else:
+ load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), Diffusion_models_class[args.model])
+ ema_model.load_state_dict(load_model.state_dict())
+ ema_model.to(accelerator.device)
+ del load_model
+
+ for i in range(len(models)):
+ # pop models so that they are not loaded again
+ model = models.pop()
+ if args.enable_lora:
+ model = PeftModel.from_pretrained(model, os.path.join(input_dir, "model_lora"))
+ else:
+ # load diffusers style into model
+ load_model = Diffusion_models_class[args.model].from_pretrained(input_dir, subfolder="model")
+ model.register_to_config(**load_model.config)
+
+ model.load_state_dict(load_model.state_dict())
+ del load_model
+
+ accelerator.register_save_state_pre_hook(save_model_hook)
+ accelerator.register_load_state_pre_hook(load_model_hook)
+
+ # Enable TF32 for faster training on Ampere GPUs,
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+ if args.allow_tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ if args.scale_lr:
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+ )
+
+ params_to_optimize = model.parameters()
+ # Optimizer creation
+ if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+ logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include [adamW, prodigy]. "
+            "Defaulting to adamW."
+ )
+ args.optimizer = "adamw"
+
+ if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+ logger.warning(
+ f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+ f"set to {args.optimizer.lower()}"
+ )
+
+ if args.optimizer.lower() == "adamw":
+ if args.use_8bit_adam:
+ try:
+ import bitsandbytes as bnb
+ except ImportError:
+ raise ImportError(
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+ )
+
+ optimizer_class = bnb.optim.AdamW8bit
+ else:
+ optimizer_class = torch.optim.AdamW
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ )
+
+ if args.optimizer.lower() == "prodigy":
+ try:
+ import prodigyopt
+ except ImportError:
+ raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+ optimizer_class = prodigyopt.Prodigy
+
+ if args.learning_rate <= 0.1:
+ logger.warning(
+ "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+ )
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ beta3=args.prodigy_beta3,
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ decouple=args.prodigy_decouple,
+ use_bias_correction=args.prodigy_use_bias_correction,
+ safeguard_warmup=args.prodigy_safeguard_warmup,
+ )
+
+ # Setup data:
+ train_dataset = getdataset(args)
+ train_dataloader = torch.utils.data.DataLoader(
+ train_dataset,
+ shuffle=True,
+ # pin_memory=True,
+ collate_fn=Collate(args),
+ batch_size=args.train_batch_size,
+ num_workers=args.dataloader_num_workers,
+ prefetch_factor=4
+ )
+
+ # Scheduler and math around the number of training steps.
+ overrode_max_train_steps = False
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if args.max_train_steps is None:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ overrode_max_train_steps = True
+
+ lr_scheduler = get_scheduler(
+ args.lr_scheduler,
+ optimizer=optimizer,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+ )
+
+ # Prepare everything with our `accelerator`.
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+ model, optimizer, train_dataloader, lr_scheduler
+ )
+ if args.use_ema:
+ ema_model.to(accelerator.device)
+
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if overrode_max_train_steps:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ # Afterwards we recalculate our number of training epochs
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+ # We need to initialize the trackers we use, and also store our configuration.
+ # The trackers initializes automatically on the main process.
+ # NOTE wandb
+ if accelerator.is_main_process:
+ accelerator.init_trackers(os.path.basename(args.output_dir), config=vars(args))
+
+ # Train!
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+ total_batch_size = total_batch_size // args.sp_size * args.train_sp_batch_size
+ logger.info("***** Running training *****")
+ logger.info(f" Model = {model}")
+ logger.info(f" Num examples = {len(train_dataset)}")
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
+    logger.info(f"  Total trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B")
+ global_step = 0
+ first_epoch = 0
+
+ # Potentially load in the weights and states from a previous save
+ if args.resume_from_checkpoint:
+ if args.resume_from_checkpoint != "latest":
+ path = os.path.basename(args.resume_from_checkpoint)
+ else:
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.output_dir)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ if path is None:
+ accelerator.print(
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+ )
+ args.resume_from_checkpoint = None
+ initial_global_step = 0
+ else:
+ accelerator.print(f"Resuming from checkpoint {path}")
+ accelerator.load_state(os.path.join(args.output_dir, path))
+ global_step = int(path.split("-")[1])
+
+ initial_global_step = global_step
+ first_epoch = global_step // num_update_steps_per_epoch
+
+ if npu_config is not None:
+ train_dataset.n_used_elements = global_step * args.train_batch_size
+
+ else:
+ initial_global_step = 0
+
+ progress_bar = tqdm(
+ range(0, args.max_train_steps),
+ initial=initial_global_step,
+ desc="Steps",
+ # Only show the progress bar once on each machine.
+ disable=not accelerator.is_local_main_process,
+ )
+ progress_info = ProgressInfo(global_step, train_loss=0.0)
+
+ def sync_gradients_info(loss):
+ # Checks if the accelerator has performed an optimization step behind the scenes
+ if args.use_ema:
+ ema_model.step(model.parameters())
+ progress_bar.update(1)
+ progress_info.global_step += 1
+ end_time = time.time()
+ one_step_duration = end_time - start_time
+ accelerator.log({"train_loss": progress_info.train_loss}, step=progress_info.global_step)
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(f"Step: [{progress_info.global_step}], local_loss={loss.detach().item()}, "
+ f"train_loss={progress_info.train_loss}, time_cost={one_step_duration}",
+ rank=0)
+ progress_info.train_loss = 0.0
+
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+ if progress_info.global_step % args.checkpointing_steps == 0:
+ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+ if accelerator.is_main_process and args.checkpoints_total_limit is not None:
+ checkpoints = os.listdir(args.output_dir)
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+ if len(checkpoints) >= args.checkpoints_total_limit:
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+ removing_checkpoints = checkpoints[0:num_to_remove]
+
+ logger.info(
+ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+ )
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+ for removing_checkpoint in removing_checkpoints:
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+ shutil.rmtree(removing_checkpoint)
+
+ save_path = os.path.join(args.output_dir, f"checkpoint-{progress_info.global_step}")
+ accelerator.save_state(save_path)
+ logger.info(f"Saved state to {save_path}")
+
+ logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+ progress_bar.set_postfix(**logs)
+
+ def run(model_input, model_kwargs, prof):
+ global start_time
+ start_time = time.time()
+
+ noise = torch.randn_like(model_input)
+ if args.noise_offset:
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+ noise += args.noise_offset * torch.randn((model_input.shape[0], model_input.shape[1], 1, 1, 1),
+ device=model_input.device)
+
+ bsz = model_input.shape[0]
+ # Sample a random timestep for each image without bias.
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device)
+
+ # Add noise to the model input according to the noise magnitude at each timestep
+ # (this is the forward diffusion process)
+
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+
+ model_pred = model(
+ noisy_model_input,
+ timesteps,
+ **model_kwargs
+ )[0]
+ # Get the target for loss depending on the prediction type
+ if args.prediction_type is not None:
+ # set prediction_type of scheduler if defined
+ noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+ if noise_scheduler.config.prediction_type == "epsilon":
+ target = noise
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+ elif noise_scheduler.config.prediction_type == "sample":
+ # We set the target to latents here, but the model_pred will return the noise sample prediction.
+ target = model_input
+ # We will have to subtract the noise residual from the prediction to get the target sample.
+ model_pred = model_pred - noise
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+ if args.snr_gamma is None:
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+ else:
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+ # This is discussed in Section 4.2 of the same paper.
+ snr = compute_snr(noise_scheduler, timesteps)
+ mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+ dim=1
+ )[0]
+ if noise_scheduler.config.prediction_type == "epsilon":
+ mse_loss_weights = mse_loss_weights / snr
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ mse_loss_weights = mse_loss_weights / (snr + 1)
+
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+ loss = loss.mean()
+
+ # Gather the losses across all processes for logging (if we use distributed training).
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+ progress_info.train_loss += avg_loss.detach().item() / args.gradient_accumulation_steps
+
+ # Backpropagate
+ accelerator.backward(loss)
+ if accelerator.sync_gradients:
+ params_to_clip = model.parameters()
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ if accelerator.sync_gradients:
+ sync_gradients_info(loss)
+
+ if accelerator.is_main_process:
+ for tracker in accelerator.trackers:
+ if tracker.name == "wandb":
+ if progress_info.global_step % args.checkpointing_steps != 0:
+ if hasattr(model, 'module') and hasattr(model.module.pos_embed, 'temp_embed_gate'):
+ tracker.log(
+ {'temp_embed_gate (tanh)': float(model.module.pos_embed.temp_embed_gate.tanh().item())})
+ elif hasattr(model, 'pos_embed') and hasattr(model.pos_embed, 'temp_embed_gate'):
+ tracker.log(
+ {'temp_embed_gate (tanh)': float(model.pos_embed.temp_embed_gate.tanh().item())})
+
+ if progress_info.global_step % args.checkpointing_steps == 0:
+
+ if args.enable_tracker:
+ log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ weight_dtype, progress_info.global_step)
+
+ if args.use_ema:
+ # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+ ema_model.store(model.parameters())
+ ema_model.copy_to(model.parameters())
+ if npu_config is None:
+ log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ weight_dtype, progress_info.global_step, ema=True)
+ # Switch back to the original UNet parameters.
+ ema_model.restore(model.parameters())
+
+ if prof is not None:
+ prof.step()
+
+
+ return loss
+
+ def train_one_step(step_, data_item_, prof_=None):
+ train_loss = 0.0
+ x, attn_mask, input_ids, cond_mask = data_item_
+ # Sample noise that we'll add to the latents
+
+ if not args.multi_scale:
+ assert torch.all(attn_mask)
+ assert not torch.any(torch.isnan(x)), 'torch.any(torch.isnan(x))'
+ x = x.to(accelerator.device, dtype=ae.vae.dtype) # B C T+num_images H W, 16 + 4
+
+ attn_mask = attn_mask.to(accelerator.device) # B T+num_images H W
+ input_ids = input_ids.to(accelerator.device) # B 1+num_images L
+ cond_mask = cond_mask.to(accelerator.device) # B 1+num_images L
+ # print('x.shape, attn_mask.shape, input_ids.shape, cond_mask.shape', x.shape, attn_mask.shape, input_ids.shape, cond_mask.shape)
+
+ with torch.no_grad():
+ # import ipdb;ipdb.set_trace()
+ # use for loop to avoid OOM, because T5 is too huge...
+ B, N, L = input_ids.shape # B 1+num_images L
+ # cond_ = torch.stack([text_enc(input_ids[i], cond_mask[i]) for i in range(B)]) # B 1+num_images L D
+
+ # use batch inference
+ input_ids_ = input_ids.reshape(-1, L)
+ cond_mask_ = cond_mask.reshape(-1, L)
+ cond = text_enc(input_ids_, cond_mask_) # B 1+num_images L D
+ cond = cond.reshape(B, N, L, -1)
+
+ # Map input images to latent space + normalize latents
+ if args.use_image_num == 0:
+ x = ae.encode(x) # B C T H W
+ else:
+ videos, images = x[:, :, :-args.use_image_num], x[:, :, -args.use_image_num:]
+ videos = ae.encode(videos) # B C T H W
+ images = rearrange(images, 'b c t h w -> (b t) c 1 h w')
+ images = ae.encode(images)
+ images = rearrange(images, '(b t) c 1 h w -> b c t h w', t=args.use_image_num)
+ x = torch.cat([videos, images], dim=2) # b c 17+4, h, w
+
+ with accelerator.accumulate(model):
+ assert not torch.any(torch.isnan(x)), 'after vae'
+ x = x.to(weight_dtype)
+ model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
+ encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
+ run(x, model_kwargs, prof_)
+
+
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ return False
+
+ def train_all_epoch(prof_=None):
+ for epoch in range(first_epoch, args.num_train_epochs):
+ progress_info.train_loss = 0.0
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ for step, data_item in enumerate(train_dataloader):
+ if train_one_step(step, data_item, prof_):
+ break
+
+ if step >= 2 and torch_npu is not None and npu_config is not None:
+ npu_config.free_mm()
+
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = f"/home/image_data/npu_profiling_t2v/{os.getenv('PROJECT_NAME', 'local')}"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=npu_config.profiling_step, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ train_all_epoch(prof)
+ else:
+ train_all_epoch()
+ accelerator.wait_for_everyone()
+ accelerator.end_training()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+
+ # dataset & dataloader
+ parser.add_argument("--dataset", type=str, required=True)
+ parser.add_argument("--data", type=str, required='')
+ parser.add_argument("--sample_rate", type=int, default=1)
+ parser.add_argument("--num_frames", type=int, default=65)
+ parser.add_argument("--max_height", type=int, default=320)
+ parser.add_argument("--max_width", type=int, default=240)
+ parser.add_argument("--use_img_from_vid", action="store_true")
+ parser.add_argument("--use_image_num", type=int, default=0)
+ parser.add_argument("--model_max_length", type=int, default=512)
+ parser.add_argument("--multi_scale", action="store_true")
+ parser.add_argument('--cfg', type=float, default=0.1)
+ parser.add_argument("--dataloader_num_workers", type=int, default=10, help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.")
+ parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader.")
+
+ # text encoder & vae & diffusion model
+ parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="Latte-XL/122")
+ parser.add_argument('--enable_8bit_t5', action='store_true')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument("--compress_kv", action="store_true")
+ parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="xformers")
+ parser.add_argument('--use_rope', action='store_true')
+ parser.add_argument('--compress_kv_factor', type=int, default=1)
+ parser.add_argument('--interpolation_scale_h', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_w', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_t', type=float, default=1.0)
+ parser.add_argument("--downsampler", type=str, default=None)
+ parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument('--enable_stable_fp32', action='store_true')
+ parser.add_argument("--enable_lora", action="store_true")
+ parser.add_argument('--rank', type=int, default=64)
+ parser.add_argument("--gradient_checkpointing", action="store_true", help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.")
+
+ # diffusion setting
+ parser.add_argument("--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556.")
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+ parser.add_argument("--ema_decay", type=float, default=0.999)
+ parser.add_argument("--ema_start_step", type=int, default=0)
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+ parser.add_argument("--prediction_type", type=str, default=None, help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.")
+
+ # validation & logs
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
+ parser.add_argument('--guidance_scale', type=float, default=5.0)
+ parser.add_argument("--enable_tracker", action="store_true")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+ parser.add_argument("--output_dir", type=str, default=None, help="The output directory where the model predictions and checkpoints will be written.")
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None, help=("Max number of checkpoints to store."))
+ parser.add_argument("--checkpointing_steps", type=int, default=500,
+ help=(
+ "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+ " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+ " training using `--resume_from_checkpoint`."
+ ),
+ )
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None,
+ help=(
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+ ),
+ )
+ parser.add_argument("--logging_dir", type=str, default="logs",
+ help=(
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+ ),
+ )
+ parser.add_argument("--report_to", type=str, default="tensorboard",
+ help=(
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+ ),
+ )
+ # optimizer & scheduler
+ parser.add_argument("--num_train_epochs", type=int, default=100)
+ parser.add_argument("--max_train_steps", type=int, default=None, help="Total number of training steps to perform. If provided, overrides num_train_epochs.")
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
+ parser.add_argument("--optimizer", type=str, default="adamW", help='The optimizer type to use. Choose between ["AdamW", "prodigy"]')
+ parser.add_argument("--learning_rate", type=float, default=1e-4, help="Initial learning rate (after the potential warmup period) to use.")
+ parser.add_argument("--scale_lr", action="store_true", default=False, help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.")
+ parser.add_argument("--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler.")
+ parser.add_argument("--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW")
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-02, help="Weight decay to use for unet params")
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder")
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer and Prodigy optimizers.")
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True, help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True, help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ parser.add_argument("--prodigy_beta3", type=float, default=None,
+ help="coefficients for computing the Prodidy stepsize using running averages. If set to None, "
+ "uses the value of square root of beta2. Ignored if optimizer is adamW",
+ )
+ parser.add_argument("--lr_scheduler", type=str, default="constant",
+ help=(
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
+ parser.add_argument("--allow_tf32", action="store_true",
+ help=(
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+ ),
+ )
+ parser.add_argument("--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"],
+ help=(
+                            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). bf16 requires PyTorch >="
+                            " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
+                            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+ ),
+ )
+
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+<<<<<<< HEAD:opensora/train/train_inpaint.py
+ parser.add_argument("--sp_size", type=int, default=1, help="For sequence parallel")
+ parser.add_argument("--train_sp_batch_size", type=int, default=1, help="Batch size for sequence parallel training")
+
+ parser.add_argument("--model_type", type=str, default='inpaint_only', choices=['inpaint_only', 'vip_only', 'vip_inpaint'])
+ parser.add_argument("--train_vip", action="store_true")
+ parser.add_argument("--need_validation", action="store_true")
+ # inpaint
+ parser.add_argument("--i2v_ratio", type=float, default=0.5) # for inpainting mode
+ parser.add_argument("--transition_ratio", type=float, default=0.4) # for inpainting mode
+ parser.add_argument("--v2v_ratio", type=float, default=0.1) # for inpainting mode
+ parser.add_argument("--clear_video_ratio", type=float, default=0.0)
+ parser.add_argument("--default_text_ratio", type=float, default=0.1)
+ parser.add_argument("--validation_dir", type=str, default=None, help="Path to the validation dataset.")
+ parser.add_argument("--image_encoder_name", type=str, default='laion/CLIP-ViT-H-14-laion2B-s32B-b79K')
+ parser.add_argument("--image_encoder_path", type=str, default=None)
+ parser.add_argument("--use_clip_mask", action="store_true")
+ parser.add_argument("--clip_loss_lambda", type=float, default=0.9)
+ parser.add_argument("--pretrained_transformer_model_path", type=str, default=None)
+ parser.add_argument("--pretrained_vip_adapter_path", type=str, default=None)
+ parser.add_argument("--vip_num_attention_heads", type=int, default=8)
+ parser.add_argument("--use_vae_preprocessed_mask", action="store_true")
+
+
+ args = parser.parse_args()
+ main(args)
\ No newline at end of file
diff --git a/opensora/train/train_t2v_diffusers_lora.py b/opensora/train/train_t2v_diffusers_lora.py
new file mode 100644
index 000000000..be8e08183
--- /dev/null
+++ b/opensora/train/train_t2v_diffusers_lora.py
@@ -0,0 +1,1023 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+A minimal training script for DiT using PyTorch DDP.
+"""
+import argparse
+import logging
+import math
+import os
+import shutil
+from pathlib import Path
+from typing import Optional
+import gc
+import numpy as np
+from einops import rearrange
+from tqdm import tqdm
+
+from opensora.adaptor.modules import replace_with_fp32_forwards
+
+try:
+ import torch_npu
+ from opensora.npu_config import npu_config
+ from opensora.acceleration.parallel_states import initialize_sequence_parallel_state, \
+ destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+ from opensora.acceleration.communications import prepare_parallel_data, broadcast
+except ImportError:
+    torch_npu = None
+    npu_config = None
+    from opensora.utils.parallel_states import initialize_sequence_parallel_state, \
+        destroy_sequence_parallel_group, get_sequence_parallel_state, set_sequence_parallel_state
+    from opensora.utils.communications import prepare_parallel_data, broadcast
+import time
+from dataclasses import field, dataclass
+from torch.utils.data import DataLoader
+from copy import deepcopy
+import accelerate
+import torch
+from torch.nn import functional as F
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
+from packaging import version
+from tqdm.auto import tqdm
+
+import diffusers
+from diffusers import DDPMScheduler, PNDMScheduler, DPMSolverMultistepScheduler
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import compute_snr
+from diffusers.utils import check_min_version, is_wandb_available
+
+from opensora.models.causalvideovae import ae_stride_config, ae_channel_config
+from opensora.models.causalvideovae import ae_norm, ae_denorm
+from opensora.models import CausalVAEModelWrapper
+from opensora.models.text_encoder import get_text_enc, get_text_warpper
+from opensora.dataset import getdataset
+from opensora.models.diffusion import Diffusion_models, Diffusion_models_class
+from opensora.utils.dataset_utils import Collate, LengthGroupedSampler
+from opensora.utils.ema_utils import EMAModel
+from opensora.sample.pipeline_opensora import OpenSoraPipeline
+from peft import LoraConfig, PeftModel, get_peft_model
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.24.0")
+logger = get_logger(__name__)
+
+
+@torch.inference_mode()
+def log_validation(args, model, vae, text_encoder, tokenizer, accelerator, weight_dtype, global_step, ema=False):
+ positive_prompt = "(masterpiece), (best quality), (ultra-detailed), {}. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous"
+ negative_prompt = """nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry,
+ """
+ validation_prompt = [
+ "a cat wearing sunglasses and working as a lifeguard at pool.",
+ "A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene."
+ ]
+ if 'mt5' in args.text_encoder_name:
+ validation_prompt_cn = [
+ "一只戴着墨镜在泳池当救生员的猫咪。",
+ "这是一个宁静的水下场景,一只海龟游过珊瑚礁。海龟带着绿褐色的龟壳,优雅地游向画面右侧,成为视频的焦点。背景中的珊瑚礁生机盎然,为海龟的旅程提供了生动多彩的背景。几条小鱼在海龟周围穿梭,为画面增添了动感和活力。"
+ ]
+ validation_prompt += validation_prompt_cn
+ logger.info(f"Running validation....\n")
+ model = accelerator.unwrap_model(model)
+ scheduler = DPMSolverMultistepScheduler()
+ opensora_pipeline = OpenSoraPipeline(vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ transformer=model).to(device=accelerator.device)
+ videos = []
+ for prompt in validation_prompt:
+        logger.info('Processing prompt: {}'.format(prompt))
+ video = opensora_pipeline(
+ positive_prompt.format(prompt),
+ negative_prompt=negative_prompt,
+ num_frames=args.num_frames,
+ height=args.max_height,
+ width=args.max_width,
+ num_inference_steps=args.num_sampling_steps,
+ guidance_scale=args.guidance_scale,
+ enable_temporal_attentions=True,
+ num_images_per_prompt=1,
+ mask_feature=True,
+ max_sequence_length=args.model_max_length,
+ ).images
+ videos.append(video[0])
+ gc.collect()
+ torch.cuda.empty_cache()
+ videos = torch.stack(videos).numpy()
+ videos = rearrange(videos, 'b t h w c -> b t c h w')
+ for tracker in accelerator.trackers:
+ if tracker.name == "tensorboard":
+ if videos.shape[1] == 1:
+ assert args.num_frames == 1
+ images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+ np_images = np.stack([np.asarray(img) for img in images])
+ tracker.writer.add_images(f"{'ema_' if ema else ''}validation", np_images, global_step, dataformats="NHWC")
+ else:
+ np_videos = np.stack([np.asarray(vid) for vid in videos])
+ tracker.writer.add_video(f"{'ema_' if ema else ''}validation", np_videos, global_step, fps=24)
+ if tracker.name == "wandb":
+ import wandb
+ if videos.shape[1] == 1:
+ images = rearrange(videos, 'b 1 c h w -> (b 1) h w c')
+ logs = {
+ f"{'ema_' if ema else ''}validation": [
+ wandb.Image(image, caption=f"{i}: {prompt}")
+ for i, (image, prompt) in enumerate(zip(images, validation_prompt))
+ ]
+ }
+ else:
+ logs = {
+ f"{'ema_' if ema else ''}validation": [
+ wandb.Video(video, caption=f"{i}: {prompt}", fps=24)
+ for i, (video, prompt) in enumerate(zip(videos, validation_prompt))
+ ]
+ }
+ tracker.log(logs, step=global_step)
+
+ del opensora_pipeline
+ gc.collect()
+ torch.cuda.empty_cache()
+
+
+class ProgressInfo:
+ def __init__(self, global_step, train_loss=0.0):
+ self.global_step = global_step
+ self.train_loss = train_loss
+
+
+#################################################################################
+# Training Loop #
+#################################################################################
+
+def main(args):
+ logging_dir = Path(args.output_dir, args.logging_dir)
+
+ # use LayerNorm, GeLu, SiLu always as fp32 mode
+ if args.enable_stable_fp32:
+ replace_with_fp32_forwards()
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(args)
+ npu_config.seed_everything(args.seed)
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+ accelerator = Accelerator(
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
+ mixed_precision=args.mixed_precision,
+ log_with=args.report_to,
+ project_config=accelerator_project_config,
+ )
+
+ if args.num_frames != 1 and args.use_image_num == 0:
+ initialize_sequence_parallel_state(args.sp_size)
+
+ if args.report_to == "wandb":
+ if not is_wandb_available():
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+ # Make one log on every process with the configuration for debugging.
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+ transformers.utils.logging.set_verbosity_warning()
+ diffusers.utils.logging.set_verbosity_info()
+ else:
+ transformers.utils.logging.set_verbosity_error()
+ diffusers.utils.logging.set_verbosity_error()
+
+ # If passed along, set the training seed now.
+ if args.seed is not None:
+ set_seed(args.seed)
+
+ # Handle the repository creation
+ if accelerator.is_main_process:
+ if args.output_dir is not None:
+ os.makedirs(args.output_dir, exist_ok=True)
+
+ # For mixed precision training we cast all non-trainable weigths to half-precision
+ # as these weights are only used for inference, keeping weights in full precision is not required.
+ weight_dtype = torch.float32
+ if accelerator.mixed_precision == "fp16":
+ weight_dtype = torch.float16
+ elif accelerator.mixed_precision == "bf16":
+ weight_dtype = torch.bfloat16
+
+ # Create model:
+ kwargs = {}
+ ae = CausalVAEModelWrapper(args.ae_path, cache_dir=args.cache_dir, **kwargs).eval()
+ if args.enable_tiling:
+ ae.vae.enable_tiling()
+ ae.vae.tile_overlap_factor = args.tile_overlap_factor
+
+ kwargs = {'load_in_8bit': args.enable_8bit_t5, 'torch_dtype': weight_dtype, 'low_cpu_mem_usage': True}
+ text_enc = get_text_warpper(args.text_encoder_name)(args, **kwargs).eval()
+
+ ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
+ ae.vae_scale_factor = (ae_stride_t, ae_stride_h, ae_stride_w)
+ assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
+ args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
+ args.ae_stride = args.ae_stride_h
+ patch_size = args.model[-3:]
+ patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
+ args.patch_size = patch_size_h
+ args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
+ assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
+ # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
+ assert args.max_height % ae_stride_h == 0, f"Height must be divisible by ae_stride_h, but found Height ({args.max_height}), ae_stride_h ({ae_stride_h})."
+ assert args.max_width % ae_stride_h == 0, f"Width size must be divisible by ae_stride_h, but found Width ({args.max_width}), ae_stride_h ({ae_stride_h})."
+
+ args.stride_t = ae_stride_t * patch_size_t
+ args.stride = ae_stride_h * patch_size_h
+ latent_size = (args.max_height // ae_stride_h, args.max_width // ae_stride_w)
+ ae.latent_size = latent_size
+
+    if args.num_frames % 2 == 1:
+        args.latent_size_t = latent_size_t = (args.num_frames - 1) // ae_stride_t + 1
+    else:
+        args.latent_size_t = latent_size_t = args.num_frames // ae_stride_t
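+    # Worked example (assuming the default --num_frames 65 and a causal VAE with
+    # ae_stride_t == 4): 65 is odd, so latent_size_t = (65 - 1) // 4 + 1 = 17, i.e.
+    # the first frame maps to one latent and every further 4 frames add one more.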
+ model = Diffusion_models[args.model](
+ in_channels=ae_channel_config[args.ae],
+ out_channels=ae_channel_config[args.ae],
+ # caption_channels=4096,
+ # cross_attention_dim=1152,
+ attention_bias=True,
+ sample_size=latent_size,
+ sample_size_t=latent_size_t,
+ num_vector_embeds=None,
+ activation_fn="gelu-approximate",
+ num_embeds_ada_norm=1000,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ double_self_attention=False,
+ upcast_attention=False,
+ # norm_type="ada_norm_single",
+ norm_elementwise_affine=False,
+ norm_eps=1e-6,
+ attention_type='default',
+ attention_mode=args.attention_mode,
+ interpolation_scale_h=args.interpolation_scale_h,
+ interpolation_scale_w=args.interpolation_scale_w,
+ interpolation_scale_t=args.interpolation_scale_t,
+ downsampler=args.downsampler,
+ # compress_kv_factor=args.compress_kv_factor,
+ use_rope=args.use_rope,
+ # model_max_length=args.model_max_length,
+ use_stable_fp32=args.enable_stable_fp32,
+ sparse1d=args.sparse1d,
+ sparse2d=args.sparse2d,
+ sparse_n=args.sparse_n,
+ )
+ model.gradient_checkpointing = args.gradient_checkpointing
+
+    # Load pretrained weights into the model if a base checkpoint is provided.
+ if args.model_base:
+ model_state_dict = model.state_dict()
+ if '.safetensors' in args.model_base or any(['diffusion_pytorch_model.safetensors' in i for i in os.listdir(args.model_base)]):
+ from safetensors.torch import load_file as safe_load
+ if '.safetensors' in args.model_base:
+ model_base = args.model_base
+ else:
+ model_base = os.path.join(args.model_base, 'diffusion_pytorch_model.safetensors')
+ pretrained_checkpoint = safe_load(model_base, device="cpu")
+ pretrained_keys = set(list(pretrained_checkpoint.keys()))
+ model_keys = set(list(model_state_dict.keys()))
+ common_keys = list(pretrained_keys & model_keys)
+ checkpoint = {k: pretrained_checkpoint[k] for k in common_keys if model_state_dict[k].numel() == pretrained_checkpoint[k].numel()}
+ else: # latest stage training weight
+ checkpoint = torch.load(args.model_base, map_location='cpu')
+ if 'model' in checkpoint:
+ checkpoint = checkpoint['model']
+ missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
+ logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
+        logger.info(f'Successfully loaded {len(model_state_dict) - len(missing_keys)}/{len(model_state_dict)} keys from {args.model_base}!')
+
+ # Freeze vae and text encoders.
+ ae.vae.requires_grad_(False)
+ text_enc.requires_grad_(False)
+ # Set model as trainable.
+ model.train()
+
+ noise_scheduler = DDPMScheduler()
+ # Move unet, vae and text_encoder to device and cast to weight_dtype
+ # The VAE is in float32 to avoid NaN losses.
+ ae.vae.to(accelerator.device, dtype=torch.float32)
+ # ae.vae.to(accelerator.device, dtype=weight_dtype)
+ text_enc.to(accelerator.device, dtype=weight_dtype)
+
+
+ # now we will add new LoRA weights to the attention layers
+ # Set correct lora layers
+ if args.enable_lora:
+ lora_config = LoraConfig(
+ r=args.rank,
+ lora_alpha=args.rank,
+ init_lora_weights="gaussian",
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ )
+ model = get_peft_model(model, lora_config)
+        model.print_trainable_parameters()  # prints the trainable / total parameter summary itself
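+        # A sketch of what PEFT does here: each targeted projection W (to_q, to_k,
+        # to_v, to_out.0) is reparameterized as W x + (lora_alpha / r) * B(A x), where
+        # A is an r x d_in down-projection and B a d_out x r up-projection, and only
+        # A and B receive gradients. With lora_alpha == rank (as above) the scale is 1.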
+ else:
+ lora_config = None
+
+ # Create EMA for the unet.
+ if args.use_ema:
+ ema_model = deepcopy(model)
+ ema_model = EMAModel(ema_model.parameters(), decay=args.ema_decay, update_after_step=args.ema_start_step,
+ model_cls=Diffusion_models_class[args.model], model_config=ema_model.config, lora_config=lora_config)
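+        # A minimal sketch of the update EMAModel.step applies after each optimizer
+        # step (assuming the usual exponential-moving-average rule):
+        #     shadow = decay * shadow + (1 - decay) * param
+        # so a decay of 0.9999 makes the shadow weights track a ~10k-step average.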
+
+ # `accelerate` 0.16.0 will have better support for customized saving
+ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+ def save_model_hook(models, weights, output_dir):
+ if accelerator.is_main_process:
+ if args.use_ema:
+ ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
+
+ for i, model in enumerate(models):
+ model.save_pretrained(os.path.join(output_dir, "model"))
+ if args.enable_lora:
+ unwrap_model = accelerator.unwrap_model(model)
+ unwrap_model_bak = deepcopy(unwrap_model)
+
+ model_merge = unwrap_model_bak.merge_and_unload()
+ model_merge.save_pretrained(os.path.join(output_dir, "model"))
+
+ del model_merge
+ del unwrap_model_bak
+
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ if weights: # Don't pop if empty
+ # make sure to pop weight so that corresponding model is not saved again
+ weights.pop()
+
+ def load_model_hook(models, input_dir):
+ if args.use_ema:
+ load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), Diffusion_models_class[args.model],
+ lora_config, args.model_base)
+ ema_model.load_state_dict(load_model.state_dict())
+ ema_model.to(accelerator.device)
+ del load_model
+
+ for i in range(len(models)):
+ # pop models so that they are not loaded again
+ model = models.pop()
+ if args.enable_lora:
+ # base_model = Diffusion_models_class[args.model].from_pretrained(args.model_base)
+ model = PeftModel.from_pretrained(model, os.path.join(input_dir, "model"))
+ else:
+ # load diffusers style into model
+ load_model = Diffusion_models_class[args.model].from_pretrained(input_dir, subfolder="model")
+ model.register_to_config(**load_model.config)
+
+ model.load_state_dict(load_model.state_dict())
+ del load_model
+
+ accelerator.register_save_state_pre_hook(save_model_hook)
+ accelerator.register_load_state_pre_hook(load_model_hook)
+
+ # Enable TF32 for faster training on Ampere GPUs,
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+ if args.allow_tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ if args.scale_lr:
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+ )
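+        # Worked example (assuming 8 processes and the defaults --learning_rate 1e-4,
+        # --gradient_accumulation_steps 1, --train_batch_size 16):
+        # 1e-4 * 1 * 16 * 8 = 1.28e-2, i.e. the lr grows linearly with the effective
+        # global batch size.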
+
+ params_to_optimize = model.parameters()
+ # Optimizer creation
+ if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include [adamW, prodigy]."
+            " Defaulting to adamW."
+        )
+ args.optimizer = "adamw"
+
+ if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+ logger.warning(
+ f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+ f"set to {args.optimizer.lower()}"
+ )
+
+ if args.optimizer.lower() == "adamw":
+ if args.use_8bit_adam:
+ try:
+ import bitsandbytes as bnb
+ except ImportError:
+ raise ImportError(
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+ )
+
+ optimizer_class = bnb.optim.AdamW8bit
+ else:
+ optimizer_class = torch.optim.AdamW
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ )
+
+ if args.optimizer.lower() == "prodigy":
+ try:
+ import prodigyopt
+ except ImportError:
+ raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+ optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using Prodigy, it is generally better to set the learning rate around 1.0."
+            )
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ beta3=args.prodigy_beta3,
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ decouple=args.prodigy_decouple,
+ use_bias_correction=args.prodigy_use_bias_correction,
+ safeguard_warmup=args.prodigy_safeguard_warmup,
+ )
+ logger.info(f"optimizer: {optimizer}")
+
+ # Setup data:
+ train_dataset = getdataset(args)
+ sampler = LengthGroupedSampler(
+ args.train_batch_size,
+ world_size=accelerator.num_processes,
+ lengths=train_dataset.lengths,
+ group_data=args.group_data,
+ ) if args.group_data and args.train_batch_size != 1 else None
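+    # A sketch of the intent (assuming LengthGroupedSampler groups samples of similar
+    # length into the same batch, as in the HF Trainer sampler of the same name):
+    # grouping keeps per-batch padding small; it is skipped for batch size 1, where
+    # there is nothing to pad against.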
+ train_dataloader = DataLoader(
+ train_dataset,
+ shuffle=sampler is None,
+ # pin_memory=True,
+ collate_fn=Collate(args),
+ batch_size=args.train_batch_size,
+ num_workers=args.dataloader_num_workers,
+        sampler=sampler,
+ drop_last=True,
+ # prefetch_factor=4
+ )
+    logger.info('Train dataloader built.')
+
+ # Scheduler and math around the number of training steps.
+ overrode_max_train_steps = False
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if args.max_train_steps is None:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ overrode_max_train_steps = True
+
+ lr_scheduler = get_scheduler(
+ args.lr_scheduler,
+ optimizer=optimizer,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+ )
+
+ # Prepare everything with our `accelerator`.
+ if args.adapt_vae:
+ model.requires_grad_(False)
+ for name, param in model.named_parameters():
+ if 'pos_embed' in name or 'proj_out' in name:
+ param.requires_grad = True
+    logger.info('Calling accelerator.prepare...')
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+ model, optimizer, train_dataloader, lr_scheduler
+ )
+    logger.info('accelerator.prepare finished.')
+ if args.use_ema:
+ ema_model.to(accelerator.device)
+
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if overrode_max_train_steps:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ # Afterwards we recalculate our number of training epochs
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+ # We need to initialize the trackers we use, and also store our configuration.
+ # The trackers initializes automatically on the main process.
+ if accelerator.is_main_process:
+ accelerator.init_trackers(os.path.basename(args.output_dir), config=vars(args))
+
+ # Train!
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+ total_batch_size = total_batch_size // args.sp_size * args.train_sp_batch_size
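+    # Worked example (a sketch, assuming 8 processes, --train_batch_size 16,
+    # --gradient_accumulation_steps 1, --sp_size 8, --train_sp_batch_size 1):
+    # 16 * 8 * 1 = 128 local sequences, but the 8 ranks of a sequence-parallel group
+    # share each sequence, so the effective batch is 128 // 8 * 1 = 16.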
+ logger.info("***** Running training *****")
+ logger.info(f" Model = {model}")
+ logger.info(f" Num examples = {len(train_dataset)}")
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
+ logger.info(f" Total training parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9} B")
+ global_step = 0
+ first_epoch = 0
+
+ # Potentially load in the weights and states from a previous save
+ if args.resume_from_checkpoint:
+ if args.resume_from_checkpoint != "latest":
+ path = os.path.basename(args.resume_from_checkpoint)
+ else:
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.output_dir)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ if path is None:
+ accelerator.print(
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+ )
+ args.resume_from_checkpoint = None
+ initial_global_step = 0
+ else:
+ accelerator.print(f"Resuming from checkpoint {path}")
+ accelerator.load_state(os.path.join(args.output_dir, path))
+ global_step = int(path.split("-")[1])
+
+ initial_global_step = global_step
+ first_epoch = global_step // num_update_steps_per_epoch
+
+ if npu_config is not None:
+ train_dataset.n_used_elements = global_step * args.train_batch_size
+
+ else:
+ initial_global_step = 0
+
+ progress_bar = tqdm(
+ range(0, args.max_train_steps),
+ initial=initial_global_step,
+ desc="Steps",
+ # Only show the progress bar once on each machine.
+ disable=not accelerator.is_local_main_process,
+ )
+ progress_info = ProgressInfo(global_step, train_loss=0.0)
+
+ def sync_gradients_info(loss):
+ # Checks if the accelerator has performed an optimization step behind the scenes
+ if args.use_ema:
+ ema_model.step(model.parameters())
+ progress_bar.update(1)
+ progress_info.global_step += 1
+ end_time = time.time()
+ one_step_duration = end_time - start_time
+ accelerator.log({"train_loss": progress_info.train_loss}, step=progress_info.global_step)
+ if torch_npu is not None and npu_config is not None:
+ npu_config.print_msg(f"Step: [{progress_info.global_step}], local_loss={loss.detach().item()}, "
+ f"train_loss={progress_info.train_loss}, time_cost={one_step_duration}",
+ rank=0)
+ progress_info.train_loss = 0.0
+
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+ if progress_info.global_step % args.checkpointing_steps == 0:
+ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+ if accelerator.is_main_process and args.checkpoints_total_limit is not None:
+ checkpoints = os.listdir(args.output_dir)
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+ if len(checkpoints) >= args.checkpoints_total_limit:
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+ removing_checkpoints = checkpoints[0:num_to_remove]
+
+ logger.info(
+ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+ )
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+ for removing_checkpoint in removing_checkpoints:
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+ shutil.rmtree(removing_checkpoint)
+
+ save_path = os.path.join(args.output_dir, f"checkpoint-{progress_info.global_step}")
+ accelerator.save_state(save_path)
+ logger.info(f"Saved state to {save_path}")
+
+ logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+ progress_bar.set_postfix(**logs)
+
+ def run(model_input, model_kwargs, prof):
+ global start_time
+ start_time = time.time()
+
+ noise = torch.randn_like(model_input)
+ if args.noise_offset:
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+ noise += args.noise_offset * torch.randn((model_input.shape[0], model_input.shape[1], 1, 1, 1),
+ device=model_input.device)
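+            # A sketch of the effect: the offset is a single scalar per (batch, channel)
+            # broadcast over t/h/w, so it shifts the mean of each latent channel rather
+            # than adding per-pixel noise, which (per the blog post above) helps the
+            # model reach very dark and very bright outputs.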
+
+ bsz = model_input.shape[0]
+ current_step_frame = model_input.shape[2]
+ # Sample a random timestep for each image without bias.
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device)
+        if current_step_frame != 1 and get_sequence_parallel_state():  # images do not need SP
+ broadcast(timesteps)
+
+ # Add noise to the model input according to the noise magnitude at each timestep
+ # (this is the forward diffusion process)
+
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+ model_pred = model(
+ noisy_model_input,
+ timesteps,
+ **model_kwargs
+ )[0]
+ # Get the target for loss depending on the prediction type
+ if args.prediction_type is not None:
+ # set prediction_type of scheduler if defined
+ noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
+ if noise_scheduler.config.prediction_type == "epsilon":
+ target = noise
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+ elif noise_scheduler.config.prediction_type == "sample":
+ # We set the target to latents here, but the model_pred will return the noise sample prediction.
+ target = model_input
+ # We will have to subtract the noise residual from the prediction to get the target sample.
+ model_pred = model_pred - noise
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+        mask = model_kwargs.get('attention_mask', None)
+        if mask is not None and torch.all(mask.bool()):
+            mask = None
+ if get_sequence_parallel_state():
+ assert mask is None
+ b, c, _, _, _ = model_pred.shape
+ if mask is not None:
+ mask = mask.unsqueeze(1).repeat(1, c, 1, 1, 1).float() # b t h w -> b c t h w
+ mask = mask.reshape(b, -1)
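+            # i.e. the validity mask is expanded across channels and flattened to
+            # b x (c*t*h*w), matching the flattened element-wise MSE below so that
+            # padded patches can be excluded from the mean.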
+ if args.snr_gamma is None:
+ # model_pred: b c t h w, attention_mask: b t h w
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.reshape(b, -1)
+ if mask is not None:
+ loss = (loss * mask).sum() / mask.sum() # mean loss on unpad patches
+ else:
+ loss = loss.mean()
+ else:
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+ # This is discussed in Section 4.2 of the same paper.
+ snr = compute_snr(noise_scheduler, timesteps)
+ mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(
+ dim=1
+ )[0]
+ if noise_scheduler.config.prediction_type == "epsilon":
+ mse_loss_weights = mse_loss_weights / snr
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ mse_loss_weights = mse_loss_weights / (snr + 1)
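+            # In formula form (following the Min-SNR paper linked above): for epsilon
+            # prediction w_t = min(SNR_t, gamma) / SNR_t, and for v-prediction
+            # w_t = min(SNR_t, gamma) / (SNR_t + 1), capping the loss weight of easy
+            # low-noise timesteps at gamma.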
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.reshape(b, -1)
+ mse_loss_weights = mse_loss_weights.reshape(b, 1)
+ if mask is not None:
+ loss = (loss * mask * mse_loss_weights).sum() / mask.sum() # mean loss on unpad patches
+ else:
+ loss = (loss * mse_loss_weights).mean()
+
+ # Gather the losses across all processes for logging (if we use distributed training).
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+ progress_info.train_loss += avg_loss.detach().item() / args.gradient_accumulation_steps
+
+ # Backpropagate
+ accelerator.backward(loss)
+ if accelerator.sync_gradients:
+ params_to_clip = model.parameters()
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ if accelerator.sync_gradients:
+ sync_gradients_info(loss)
+
+ if accelerator.is_main_process:
+
+ if progress_info.global_step % args.checkpointing_steps == 0:
+
+ if args.enable_tracker:
+ log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ weight_dtype, progress_info.global_step)
+
+ if args.use_ema and npu_config is None:
+ # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
+ ema_model.store(model.parameters())
+ ema_model.copy_to(model.parameters())
+ log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer, accelerator,
+ weight_dtype, progress_info.global_step, ema=True)
+ # Switch back to the original UNet parameters.
+ ema_model.restore(model.parameters())
+
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ if prof is not None:
+ prof.step()
+
+
+ return loss
+
+ def train_one_step(step_, data_item_, prof_=None):
+ train_loss = 0.0
+ x, attn_mask, input_ids, cond_mask = data_item_
+        assert not torch.any(torch.isnan(x)), 'NaN detected in input tensor x'
+ x = x.to(accelerator.device, dtype=ae.vae.dtype) # B C T+num_images H W, 16 + 4
+
+ attn_mask = attn_mask.to(accelerator.device) # B T+num_images H W
+ input_ids = input_ids.to(accelerator.device) # B 1+num_images L
+ cond_mask = cond_mask.to(accelerator.device) # B 1+num_images L
+
+ with torch.no_grad():
+ B, N, L = input_ids.shape # B 1+num_images L
+            # Batched text-encoder inference:
+ input_ids_ = input_ids.reshape(-1, L)
+ cond_mask_ = cond_mask.reshape(-1, L)
+ cond = text_enc(input_ids_, cond_mask_) # B 1+num_images L D
+ cond = cond.reshape(B, N, L, -1)
+ # Map input images to latent space + normalize latents
+ x = ae.encode(x) # B C T H W
+
+ current_step_frame = x.shape[2]
+ current_step_sp_state = get_sequence_parallel_state()
+ if args.sp_size != 1: # enable sp
+                if current_step_frame == 1:  # but images do not need SP
+ set_sequence_parallel_state(False)
+ else:
+ set_sequence_parallel_state(True)
+ if get_sequence_parallel_state():
+ x, cond, attn_mask, cond_mask, use_image_num = prepare_parallel_data(x, cond, attn_mask, cond_mask,
+ args.use_image_num)
+                for sp_iter in range(args.train_batch_size * args.sp_size // args.train_sp_batch_size):
+                    with accelerator.accumulate(model):
+                        st_idx = sp_iter * args.train_sp_batch_size
+                        ed_idx = (sp_iter + 1) * args.train_sp_batch_size
+ model_kwargs = dict(encoder_hidden_states=cond[st_idx: ed_idx],
+ attention_mask=attn_mask[st_idx: ed_idx],
+ encoder_attention_mask=cond_mask[st_idx: ed_idx], use_image_num=use_image_num)
+ run(x[st_idx: ed_idx], model_kwargs, prof_)
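+                # Worked example (a sketch, assuming prepare_parallel_data re-chunks the
+                # local batch so that every sp rank holds a shard of each sequence; with
+                # --train_batch_size 2, --sp_size 4 and --train_sp_batch_size 1 the loop
+                # runs 2 * 4 // 1 = 8 micro-steps, each feeding one shard through run()).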
+
+ else:
+ with accelerator.accumulate(model):
+                    assert not torch.any(torch.isnan(x)), 'NaN detected after VAE encoding'
+ x = x.to(weight_dtype)
+ model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
+ encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
+ run(x, model_kwargs, prof_)
+
+            set_sequence_parallel_state(current_step_sp_state)  # restore SP state in case the next step uses sequence parallelism, which needs broadcast(timesteps)
+
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ return False
+
+ def train_all_epoch(prof_=None):
+ for epoch in range(first_epoch, args.num_train_epochs):
+ progress_info.train_loss = 0.0
+ if progress_info.global_step >= args.max_train_steps:
+ return True
+
+ for step, data_item in enumerate(train_dataloader):
+ if train_one_step(step, data_item, prof_):
+ break
+
+ if step >= 2 and torch_npu is not None and npu_config is not None:
+ npu_config.free_mm()
+
+ if npu_config is not None and npu_config.on_npu and npu_config.profiling:
+ experimental_config = torch_npu.profiler._ExperimentalConfig(
+ profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization
+ )
+ profile_output_path = f"/home/image_data/npu_profiling_t2v/{os.getenv('PROJECT_NAME', 'local')}"
+ os.makedirs(profile_output_path, exist_ok=True)
+
+ with torch_npu.profiler.profile(
+ activities=[torch_npu.profiler.ProfilerActivity.NPU, torch_npu.profiler.ProfilerActivity.CPU],
+ with_stack=True,
+ record_shapes=True,
+ profile_memory=True,
+ experimental_config=experimental_config,
+ schedule=torch_npu.profiler.schedule(wait=npu_config.profiling_step, warmup=0, active=1, repeat=1,
+ skip_first=0),
+ on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"{profile_output_path}/")
+ ) as prof:
+ train_all_epoch(prof)
+ else:
+ train_all_epoch()
+ accelerator.wait_for_everyone()
+ accelerator.end_training()
+ if get_sequence_parallel_state():
+ destroy_sequence_parallel_group()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # dataset & dataloader
+ parser.add_argument("--dataset", type=str, required=True)
+ parser.add_argument("--data", type=str, required='')
+ parser.add_argument("--sample_rate", type=int, default=1)
+ parser.add_argument("--train_fps", type=int, default=24)
+ parser.add_argument("--drop_short_ratio", type=float, default=1.0)
+ parser.add_argument("--speed_factor", type=float, default=1.0)
+ parser.add_argument("--num_frames", type=int, default=65)
+ parser.add_argument("--max_height", type=int, default=320)
+ parser.add_argument("--max_width", type=int, default=240)
+ parser.add_argument("--use_img_from_vid", action="store_true")
+ parser.add_argument("--use_image_num", type=int, default=0)
+ parser.add_argument("--model_max_length", type=int, default=512)
+ parser.add_argument('--cfg', type=float, default=0.1)
+ parser.add_argument("--dataloader_num_workers", type=int, default=10, help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.")
+ parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader.")
+ parser.add_argument("--group_data", action="store_true")
+ parser.add_argument("--hw_stride", type=int, default=32)
+ parser.add_argument("--skip_low_resolution", action="store_true")
+ parser.add_argument("--force_resolution", action="store_true")
+
+ # text encoder & vae & diffusion model
+ parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="Latte-XL/122")
+ parser.add_argument('--enable_8bit_t5', action='store_true')
+ parser.add_argument('--tile_overlap_factor', type=float, default=0.125)
+ parser.add_argument('--enable_tiling', action='store_true')
+ parser.add_argument("--compress_kv", action="store_true")
+ parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="xformers")
+ parser.add_argument('--use_rope', action='store_true')
+ parser.add_argument('--compress_kv_factor', type=int, default=1)
+ parser.add_argument('--interpolation_scale_h', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_w', type=float, default=1.0)
+ parser.add_argument('--interpolation_scale_t', type=float, default=1.0)
+ parser.add_argument("--downsampler", type=str, default=None)
+ parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
+ parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
+ parser.add_argument("--cache_dir", type=str, default='./cache_dir')
+ parser.add_argument("--model_base", type=str, default='')
+ parser.add_argument('--enable_stable_fp32', action='store_true')
+ parser.add_argument("--enable_lora", action="store_true")
+ parser.add_argument('--rank', type=int, default=64)
+ parser.add_argument('--sparse1d', action='store_true')
+ parser.add_argument('--sparse2d', action='store_true')
+ parser.add_argument('--sparse_n', type=int, default=2)
+ parser.add_argument('--adapt_vae', action='store_true')
+ parser.add_argument("--gradient_checkpointing", action="store_true", help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.")
+
+ # diffusion setting
+ parser.add_argument("--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556.")
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+ parser.add_argument("--ema_decay", type=float, default=0.9999)
+ parser.add_argument("--ema_start_step", type=int, default=0)
+ parser.add_argument("--noise_offset", type=float, default=0.02, help="The scale of noise offset.")
+ parser.add_argument("--prediction_type", type=str, default=None, help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.")
+
+ # validation & logs
+ parser.add_argument("--num_sampling_steps", type=int, default=20)
+ parser.add_argument('--guidance_scale', type=float, default=4.5)
+ parser.add_argument("--enable_tracker", action="store_true")
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+ parser.add_argument("--output_dir", type=str, default=None, help="The output directory where the model predictions and checkpoints will be written.")
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None, help=("Max number of checkpoints to store."))
+ parser.add_argument("--checkpointing_steps", type=int, default=500,
+ help=(
+ "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+ " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+ " training using `--resume_from_checkpoint`."
+ ),
+ )
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None,
+ help=(
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+ ),
+ )
+ parser.add_argument("--logging_dir", type=str, default="logs",
+ help=(
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+ ),
+ )
+ parser.add_argument("--report_to", type=str, default="tensorboard",
+ help=(
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+ ),
+ )
+ # optimizer & scheduler
+ parser.add_argument("--num_train_epochs", type=int, default=100)
+ parser.add_argument("--max_train_steps", type=int, default=None, help="Total number of training steps to perform. If provided, overrides num_train_epochs.")
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
+ parser.add_argument("--optimizer", type=str, default="adamW", help='The optimizer type to use. Choose between ["AdamW", "prodigy"]')
+ parser.add_argument("--learning_rate", type=float, default=1e-4, help="Initial learning rate (after the potential warmup period) to use.")
+ parser.add_argument("--scale_lr", action="store_true", default=False, help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.")
+ parser.add_argument("--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler.")
+ parser.add_argument("--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW")
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers.")
+ parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-02, help="Weight decay to use for unet params")
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder")
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer and Prodigy optimizers.")
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True, help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True, help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. Ignored if optimizer is adamW")
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ parser.add_argument("--prodigy_beta3", type=float, default=None,
+ help="coefficients for computing the Prodidy stepsize using running averages. If set to None, "
+ "uses the value of square root of beta2. Ignored if optimizer is adamW",
+ )
+ parser.add_argument("--lr_scheduler", type=str, default="constant",
+ help=(
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
+ parser.add_argument("--allow_tf32", action="store_true",
+ help=(
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+ ),
+ )
+ parser.add_argument("--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"],
+ help=(
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+ ),
+ )
+
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+ parser.add_argument("--sp_size", type=int, default=1, help="For sequence parallel")
+ parser.add_argument("--train_sp_batch_size", type=int, default=1, help="Batch size for sequence parallel training")
+
+ args = parser.parse_args()
+ main(args)
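
For reference, the `--optimizer`, `--use_8bit_adam`, and Prodigy flags added above are typically consumed along these lines (a minimal sketch, assuming `prodigyopt` and `bitsandbytes` are installed; this is an illustration, not the script's own code):

```python
import torch

def build_optimizer(args, params):
    # Hedged sketch: map the CLI flags above onto an optimizer class.
    if args.optimizer.lower() == "prodigy":
        import prodigyopt  # assumption: `pip install prodigyopt`
        return prodigyopt.Prodigy(
            params,
            lr=args.learning_rate,  # Prodigy usually expects lr around 1.0
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
            eps=args.adam_epsilon,
            decouple=args.prodigy_decouple,
            use_bias_correction=args.prodigy_use_bias_correction,
            safeguard_warmup=args.prodigy_safeguard_warmup,
        )
    if args.use_8bit_adam:
        import bitsandbytes as bnb  # assumption: `pip install bitsandbytes`
        return bnb.optim.AdamW8bit(
            params,
            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            weight_decay=args.adam_weight_decay,
            eps=args.adam_epsilon,
        )
    return torch.optim.AdamW(
        params,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )
```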
diff --git a/opensora/train/train_t2v_feature.py b/opensora/train/train_t2v_feature.py
deleted file mode 100644
index fbd2ea4c3..000000000
--- a/opensora/train/train_t2v_feature.py
+++ /dev/null
@@ -1,787 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-A minimal training script for DiT using PyTorch DDP.
-"""
-import argparse
-import logging
-import math
-import os
-import shutil
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-from einops import rearrange
-from tqdm import tqdm
-from dataclasses import field, dataclass
-from torch.utils.data import DataLoader
-from copy import deepcopy
-
-import accelerate
-import torch
-from torch.nn import functional as F
-import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo
-from packaging import version
-from tqdm.auto import tqdm
-from transformers import HfArgumentParser, TrainingArguments, AutoTokenizer
-
-import diffusers
-from diffusers import DDPMScheduler, PNDMScheduler
-from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel, compute_snr
-from diffusers.utils import check_min_version, is_wandb_available
-
-from opensora.dataset import getdataset, ae_denorm
-from opensora.models.ae import getae, getae_wrapper
-from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
-from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
-from opensora.models.diffusion.latte.modeling_latte import LatteT2V
-from opensora.models.text_encoder import get_text_enc
-from opensora.utils.dataset_utils import Collate
-from opensora.models.ae import ae_stride_config, ae_channel_config
-from opensora.models.diffusion import Diffusion_models
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
-check_min_version("0.24.0")
-logger = get_logger(__name__)
-
-
-def generate_timestep_weights(args, num_timesteps):
- weights = torch.ones(num_timesteps)
-
- # Determine the indices to bias
- num_to_bias = int(args.timestep_bias_portion * num_timesteps)
-
- if args.timestep_bias_strategy == "later":
- bias_indices = slice(-num_to_bias, None)
- elif args.timestep_bias_strategy == "earlier":
- bias_indices = slice(0, num_to_bias)
- elif args.timestep_bias_strategy == "range":
- # Out of the possible 1000 timesteps, we might want to focus on e.g. 200-500.
- range_begin = args.timestep_bias_begin
- range_end = args.timestep_bias_end
- if range_begin < 0:
- raise ValueError(
- "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
- )
- if range_end > num_timesteps:
- raise ValueError(
- "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
- )
- bias_indices = slice(range_begin, range_end)
- else: # 'none' or any other string
- return weights
- if args.timestep_bias_multiplier <= 0:
- raise ValueError(
- "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
- " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
- " A timestep bias multiplier less than or equal to 0 is not allowed."
- )
-
- # Apply the bias
- weights[bias_indices] *= args.timestep_bias_multiplier
-
- # Normalize
- weights /= weights.sum()
-
- return weights
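
A hedged usage sketch for the helper above (notably, the deleted script draws `t` uniformly with `torch.randint` and never actually calls it); `args` stands in for the parsed CLI namespace and the sizes are illustrative:

```python
import torch

num_train_timesteps, batch_size = 1000, 4  # assumed values for illustration
weights = generate_timestep_weights(args, num_train_timesteps)
# Biased draw of training timesteps, instead of a uniform torch.randint.
t = torch.multinomial(weights, batch_size, replacement=True)
```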
-
-
-#################################################################################
-# Training Loop #
-#################################################################################
-
-def main(args):
- logging_dir = Path(args.output_dir, args.logging_dir)
-
- accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
-
- accelerator = Accelerator(
- gradient_accumulation_steps=args.gradient_accumulation_steps,
- mixed_precision=args.mixed_precision,
- log_with=args.report_to,
- project_config=accelerator_project_config,
- )
-
- if args.report_to == "wandb":
- if not is_wandb_available():
- raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
- import wandb
-
- # Make one log on every process with the configuration for debugging.
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO,
- )
- logger.info(accelerator.state, main_process_only=False)
- if accelerator.is_local_main_process:
- transformers.utils.logging.set_verbosity_warning()
- diffusers.utils.logging.set_verbosity_info()
- else:
- transformers.utils.logging.set_verbosity_error()
- diffusers.utils.logging.set_verbosity_error()
-
- # If passed along, set the training seed now.
- if args.seed is not None:
- set_seed(args.seed)
-
- # Handle the repository creation
- if accelerator.is_main_process:
- if args.output_dir is not None:
- os.makedirs(args.output_dir, exist_ok=True)
-
- # if args.push_to_hub:
- # repo_id = create_repo(
- # repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
- # ).repo_id
-
- # Create model:
-
- diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule
- # ae = getae(args).eval()
- # text_enc = get_text_enc(args).eval()
-
- ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
- args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
- args.ae_stride = args.ae_stride_h
- patch_size = args.model[-3:]
- patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
- args.patch_size = patch_size_h
- args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
- assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
- assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
- # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
- assert args.max_image_size % ae_stride_h == 0, f"Image size must be divisible by ae_stride_h, but found max_image_size ({args.max_image_size}), ae_stride_h ({ae_stride_h})."
-
- latent_size = (args.max_image_size // ae_stride_h, args.max_image_size // ae_stride_w)
-
- if getae_wrapper(args.ae) == CausalVQVAEModelWrapper or getae_wrapper(args.ae) == CausalVAEModelWrapper:
- args.video_length = video_length = args.num_frames // ae_stride_t + 1
- else:
- args.video_length = video_length = args.num_frames // ae_stride_t
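
To make the causal-VAE branch above concrete, a worked example assuming the script's default `num_frames=16` and a temporal stride of `ae_stride_t=4` (the causal first frame contributes one extra latent frame):

```python
num_frames, ae_stride_t = 16, 4               # assumed values for illustration
video_length = num_frames // ae_stride_t + 1  # causal VAE branch above
assert video_length == 5
```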
- model = Diffusion_models[args.model](
- in_channels=ae_channel_config[args.ae],
- out_channels=ae_channel_config[args.ae] * 2,
- # caption_channels=4096,
- # cross_attention_dim=1152,
- attention_bias=True,
- sample_size=latent_size,
- num_vector_embeds=None,
- activation_fn="gelu-approximate",
- num_embeds_ada_norm=1000,
- use_linear_projection=False,
- only_cross_attention=False,
- double_self_attention=False,
- upcast_attention=False,
- # norm_type="ada_norm_single",
- norm_elementwise_affine=False,
- norm_eps=1e-6,
- attention_type='default',
- video_length=video_length,
- attention_mode=args.attention_mode,
- # compress_kv=args.compress_kv
- )
- model.gradient_checkpointing = args.gradient_checkpointing
-
- # # use pretrained model?
- if args.pretrained:
- if 'safetensors' in args.pretrained:
- from safetensors.torch import load_file as safe_load
- checkpoint = safe_load(args.pretrained, device="cpu")
- else:
- checkpoint = torch.load(args.pretrained, map_location='cpu')['model']
- model_state_dict = model.state_dict()
- missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
- logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
- logger.info(f'Successfully loaded {len(model.state_dict()) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')
- # load from pixart-alpha
- # pixelart_alpha = torch.load(args.pretrained, map_location='cpu')['state_dict']
- # checkpoint = {}
- # for k, v in pixelart_alpha.items():
- # if 'x_embedder' in k or 't_embedder' in k or 'y_embedder' in k:
- # checkpoint[k] = v
- # if k.startswith('blocks'):
- # k_spilt = k.split('.')
- # blk_id = str(int(k_spilt[1]) * 2)
- # k_spilt[1] = blk_id
- # new_k = '.'.join(k_spilt)
- # checkpoint[new_k] = v
- # missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
- # logger.info(f'Successfully load {len(model.state_dict()) - len(missing_keys)} keys from {args.pretrained}!')
-
- # Freeze vae and text encoders.
- # ae.requires_grad_(False)
- # text_enc.requires_grad_(False)
- # Set model as trainable.
- model.train()
-
- # For mixed precision training we cast all non-trainable weights to half-precision
- # as these weights are only used for inference, keeping weights in full precision is not required.
- weight_dtype = torch.float32
- if accelerator.mixed_precision == "fp16":
- weight_dtype = torch.float16
- elif accelerator.mixed_precision == "bf16":
- weight_dtype = torch.bfloat16
-
- # Move unet, vae and text_encoder to device and cast to weight_dtype
- # The VAE is in float32 to avoid NaN losses.
- # ae.to(accelerator.device, dtype=torch.float32)
- # text_enc.to(accelerator.device, dtype=weight_dtype)
-
- # Create EMA for the unet.
- if args.use_ema:
- ema_model = deepcopy(model)
- ema_model = EMAModel(ema_model.parameters(), model_cls=LatteT2V, model_config=ema_model.config)
-
- # `accelerate` 0.16.0 will have better support for customized saving
- if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
- # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
- def save_model_hook(models, weights, output_dir):
- if accelerator.is_main_process:
- if args.use_ema:
- ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
-
- for i, model in enumerate(models):
- model.save_pretrained(os.path.join(output_dir, "model"))
- if weights: # Don't pop if empty
- # make sure to pop weight so that corresponding model is not saved again
- weights.pop()
-
- def load_model_hook(models, input_dir):
- if args.use_ema:
- load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), LatteT2V)
- ema_model.load_state_dict(load_model.state_dict())
- ema_model.to(accelerator.device)
- del load_model
-
- for i in range(len(models)):
- # pop models so that they are not loaded again
- model = models.pop()
-
- # load diffusers style into model
- load_model = LatteT2V.from_pretrained(input_dir, subfolder="model")
- model.register_to_config(**load_model.config)
-
- model.load_state_dict(load_model.state_dict())
- del load_model
-
- accelerator.register_save_state_pre_hook(save_model_hook)
- accelerator.register_load_state_pre_hook(load_model_hook)
-
- # Enable TF32 for faster training on Ampere GPUs,
- # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
- if args.allow_tf32:
- torch.backends.cuda.matmul.allow_tf32 = True
-
- if args.scale_lr:
- args.learning_rate = (
- args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
- )
-
- # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
- if args.use_8bit_adam:
- try:
- import bitsandbytes as bnb
- except ImportError:
- raise ImportError(
- "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
- )
-
- optimizer_class = bnb.optim.AdamW8bit
- else:
- optimizer_class = torch.optim.AdamW
-
- # Optimizer creation
- params_to_optimize = model.parameters()
- optimizer = optimizer_class(
- params_to_optimize,
- lr=args.learning_rate,
- betas=(args.adam_beta1, args.adam_beta2),
- weight_decay=args.adam_weight_decay,
- eps=args.adam_epsilon,
- )
-
- # Setup data:
- train_dataset = getdataset(args)
- train_dataloader = torch.utils.data.DataLoader(
- train_dataset,
- shuffle=True,
- # collate_fn=Collate(args), # TODO: do not enable dynamic mask in this point
- batch_size=args.train_batch_size,
- num_workers=args.dataloader_num_workers,
- )
-
- # Scheduler and math around the number of training steps.
- overrode_max_train_steps = False
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
- if args.max_train_steps is None:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- overrode_max_train_steps = True
-
- lr_scheduler = get_scheduler(
- args.lr_scheduler,
- optimizer=optimizer,
- num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
- )
-
- # Prepare everything with our `accelerator`.
- model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
- model, optimizer, train_dataloader, lr_scheduler
- )
-
- # We need to recalculate our total training steps as the size of the training dataloader may have changed.
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
- if overrode_max_train_steps:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- # Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
- # We need to initialize the trackers we use, and also store our configuration.
- # The trackers initialize automatically on the main process.
- if accelerator.is_main_process:
- accelerator.init_trackers(args.output_dir, config=vars(args))
-
- # Train!
- total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
- logger.info("***** Running training *****")
- logger.info(f" Num examples = {len(train_dataset)}")
- logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
- logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
- logger.info(f" Total optimization steps = {args.max_train_steps}")
- global_step = 0
- first_epoch = 0
-
- # Potentially load in the weights and states from a previous save
- if args.resume_from_checkpoint:
- if args.resume_from_checkpoint != "latest":
- path = os.path.basename(args.resume_from_checkpoint)
- else:
- # Get the most recent checkpoint
- dirs = os.listdir(args.output_dir)
- dirs = [d for d in dirs if d.startswith("checkpoint")]
- dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
- path = dirs[-1] if len(dirs) > 0 else None
-
- if path is None:
- accelerator.print(
- f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
- )
- args.resume_from_checkpoint = None
- initial_global_step = 0
- else:
- accelerator.print(f"Resuming from checkpoint {path}")
- accelerator.load_state(os.path.join(args.output_dir, path))
- global_step = int(path.split("-")[1])
-
- initial_global_step = global_step
- first_epoch = global_step // num_update_steps_per_epoch
-
- else:
- initial_global_step = 0
-
- progress_bar = tqdm(
- range(0, args.max_train_steps),
- initial=initial_global_step,
- desc="Steps",
- # Only show the progress bar once on each machine.
- disable=not accelerator.is_local_main_process,
- )
-
- for epoch in range(first_epoch, args.num_train_epochs):
- train_loss = 0.0
- for step, (x, cond, cond_mask) in enumerate(train_dataloader):
- with accelerator.accumulate(model):
- # Sample noise that we'll add to the latents
- x = x.to(accelerator.device) # B C T H W
- # attn_mask = attn_mask.to(device) # B T H W
- # assert torch.all(attn_mask.bool()), 'do not enable dynamic input'
- attn_mask = None
- cond = cond.to(accelerator.device) # B L or B 1+num_images L
- cond_mask = cond_mask.to(accelerator.device) # B L or B 1+num_images L
- # print(args.use_image_num, x.shape, cond.shape, cond_mask.shape, cond_mask)
- model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
- encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
- t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=accelerator.device)
- loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
- loss = loss_dict["loss"].mean()
-
- # Gather the losses across all processes for logging (if we use distributed training).
- avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
- train_loss += avg_loss.item() / args.gradient_accumulation_steps
-
- # Backpropagate
- accelerator.backward(loss)
- if accelerator.sync_gradients:
- params_to_clip = model.parameters()
- accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
- optimizer.step()
- lr_scheduler.step()
- optimizer.zero_grad()
-
- # Checks if the accelerator has performed an optimization step behind the scenes
- if accelerator.sync_gradients:
- progress_bar.update(1)
- global_step += 1
- accelerator.log({"train_loss": train_loss}, step=global_step)
- train_loss = 0.0
-
- if args.use_deepspeed or accelerator.is_main_process:
- if global_step % args.checkpointing_steps == 0:
- # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
- if args.checkpoints_total_limit is not None:
- checkpoints = os.listdir(args.output_dir)
- checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
- checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
-
- # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
- if len(checkpoints) >= args.checkpoints_total_limit:
- num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
- removing_checkpoints = checkpoints[0:num_to_remove]
-
- logger.info(
- f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
- )
- logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
-
- for removing_checkpoint in removing_checkpoints:
- removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
- shutil.rmtree(removing_checkpoint)
-
- save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
- accelerator.save_state(save_path)
- logger.info(f"Saved state to {save_path}")
-
- logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
- progress_bar.set_postfix(**logs)
-
- if global_step >= args.max_train_steps:
- break
-
- if accelerator.is_main_process:
- validation_prompt = "The majestic beauty of a waterfall cascading down a cliff into a serene lake. The camera angle provides a bird's eye view of the waterfall."
- if global_step % args.checkpointing_steps == 0:
- logger.info(f"Running validation... \n"
- f"Generating {args.num_validation_videos} videos with prompt: {validation_prompt}")
- if args.use_ema:
- # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
- ema_model.store(model.parameters())
- ema_model.copy_to(model.parameters())
-
- if args.enable_tracker:
- with torch.no_grad():
- # create pipeline
- ae_ = getae_wrapper(args.ae)(args.ae_path).to(accelerator.device).eval()
- if args.enable_tiling:
- ae_.vae.enable_tiling()
- ae_.vae.tile_overlap_factor = args.tile_overlap_factor
- text_enc_ = get_text_enc(args).to(accelerator.device).eval()
- model_ = LatteT2V.from_pretrained(save_path, subfolder="model").to(accelerator.device).eval()
- diffusion_ = create_diffusion(str(250))
- tokenizer_ = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir='./cache_dir')
- videos = []
- for idx in range(args.num_validation_videos):
- with torch.autocast(device_type='cuda', dtype=weight_dtype):
- z = torch.randn(1, model_.in_channels, video_length,
- latent_size[0], latent_size[1], device=accelerator.device)
- text_tokens_and_mask = tokenizer_(
- validation_prompt,
- max_length=args.model_max_length,
- padding='max_length',
- truncation=True,
- return_attention_mask=True,
- add_special_tokens=True,
- return_tensors='pt'
- )
- input_ids = text_tokens_and_mask['input_ids'].to(accelerator.device)
- cond_mask = text_tokens_and_mask['attention_mask'].to(accelerator.device)
- cond = text_enc_(input_ids, cond_mask) # B L D
- # cond = text_enc(input_ids, cond_mask) # B L D
- model_kwargs = dict(encoder_hidden_states=cond, attention_mask=None, encoder_attention_mask=cond_mask)
- sample_fn = model_.forward
- # Sample images:
- samples = diffusion_.p_sample_loop(
- sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
- device=accelerator.device
- )
- samples = ae_.decode(samples)
- # Save and display images:
- video = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(
- dtype=torch.uint8).cpu().contiguous() # t c h w
- videos.append(video)
-
- videos = torch.stack(videos).numpy()
- for tracker in accelerator.trackers:
- if tracker.name == "tensorboard":
- np_videos = np.stack([np.asarray(vid) for vid in videos])
- tracker.writer.add_video("validation", np_videos, global_step, fps=10)
- if tracker.name == "wandb":
- tracker.log(
- {
- "validation": [
- wandb.Video(video, caption=f"{i}: {validation_prompt}", fps=10)
- for i, video in enumerate(videos)
- ]
- }
- )
-
- del ae_, text_enc_, model_, diffusion_, tokenizer_
- # del ae_, model_, diffusion_, tokenizer_
- torch.cuda.empty_cache()
-
- accelerator.wait_for_everyone()
- accelerator.end_training()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--dataset", type=str, required=True)
- parser.add_argument("--data_path", type=str, required=True)
- parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="DiT-XL/122")
- parser.add_argument("--num_classes", type=int, default=1000)
- parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
- parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
- parser.add_argument("--sample_rate", type=int, default=4)
- parser.add_argument("--num_frames", type=int, default=16)
- parser.add_argument("--max_image_size", type=int, default=128)
- parser.add_argument("--dynamic_frames", action="store_true")
- parser.add_argument("--compress_kv", action="store_true")
- parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="math")
- parser.add_argument("--pretrained", type=str, default=None)
-
- parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
- parser.add_argument('--enable_tiling', action='store_true')
-
- parser.add_argument("--video_folder", type=str, default='')
- parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
- parser.add_argument("--model_max_length", type=int, default=120)
-
- parser.add_argument("--enable_tracker", action="store_true")
- parser.add_argument("--use_image_num", type=int, default=0)
- parser.add_argument("--use_img_from_vid", action="store_true")
- parser.add_argument("--use_deepspeed", action="store_true")
- parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
- parser.add_argument(
- "--num_validation_videos",
- type=int,
- default=2,
- help="Number of images that should be generated during validation with `validation_prompt`.",
- )
- parser.add_argument(
- "--output_dir",
- type=str,
- default=None,
- help="The output directory where the model predictions and checkpoints will be written.",
- )
- parser.add_argument(
- "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
- )
- parser.add_argument("--num_train_epochs", type=int, default=100)
- parser.add_argument(
- "--max_train_steps",
- type=int,
- default=None,
- help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
- )
- parser.add_argument(
- "--checkpointing_steps",
- type=int,
- default=500,
- help=(
- "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
- " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
- " training using `--resume_from_checkpoint`."
- ),
- )
- parser.add_argument(
- "--checkpoints_total_limit",
- type=int,
- default=None,
- help=("Max number of checkpoints to store."),
- )
- parser.add_argument(
- "--resume_from_checkpoint",
- type=str,
- default=None,
- help=(
- "Whether training should be resumed from a previous checkpoint. Use a path saved by"
- ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
- ),
- )
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument(
- "--gradient_checkpointing",
- action="store_true",
- help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
- )
- parser.add_argument(
- "--learning_rate",
- type=float,
- default=1e-4,
- help="Initial learning rate (after the potential warmup period) to use.",
- )
- parser.add_argument(
- "--scale_lr",
- action="store_true",
- default=False,
- help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
- )
- parser.add_argument(
- "--lr_scheduler",
- type=str,
- default="constant",
- help=(
- 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'
- ),
- )
- parser.add_argument(
- "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
- )
- parser.add_argument(
- "--timestep_bias_strategy",
- type=str,
- default="none",
- choices=["earlier", "later", "range", "none"],
- help=(
- "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
- " Choices: ['earlier', 'later', 'range', 'none']."
- " The default is 'none', which means no bias is applied, and training proceeds normally."
- " The value of 'later' will increase the frequency of the model's final training timesteps."
- ),
- )
- parser.add_argument(
- "--timestep_bias_multiplier",
- type=float,
- default=1.0,
- help=(
- "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
- " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
- ),
- )
- parser.add_argument(
- "--timestep_bias_begin",
- type=int,
- default=0,
- help=(
- "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
- " Defaults to zero, which equates to having no specific bias."
- ),
- )
- parser.add_argument(
- "--timestep_bias_end",
- type=int,
- default=1000,
- help=(
- "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
- " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
- ),
- )
- parser.add_argument(
- "--timestep_bias_portion",
- type=float,
- default=0.25,
- help=(
- "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased."
- " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
- " whether the biased portions are in the earlier or later timesteps."
- ),
- )
- parser.add_argument(
- "--snr_gamma",
- type=float,
- default=None,
- help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
- "More details here: https://arxiv.org/abs/2303.09556.",
- )
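
The `--snr_gamma` flag above refers to min-SNR loss weighting (https://arxiv.org/abs/2303.09556); this file imports `compute_snr` but never applies it. A hedged sketch of the standard weighting, assuming a diffusers noise scheduler, sampled `timesteps`, and a per-sample `loss` tensor:

```python
import torch
from diffusers import DDPMScheduler
from diffusers.training_utils import compute_snr

noise_scheduler = DDPMScheduler()         # assumed scheduler for illustration
timesteps = torch.randint(0, 1000, (4,))  # assumed sampled timesteps
loss = torch.rand(4)                      # assumed per-sample MSE loss
snr = compute_snr(noise_scheduler, timesteps)
snr_gamma = 5.0                           # the paper's recommended value
# min(SNR, gamma) / SNR: the epsilon-prediction weighting from the paper.
mse_loss_weights = torch.minimum(snr, torch.full_like(snr, snr_gamma)) / snr
loss = (loss * mse_loss_weights).mean()
```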
- parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
- parser.add_argument(
- "--allow_tf32",
- action="store_true",
- help=(
- "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
- " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
- ),
- )
- parser.add_argument(
- "--dataloader_num_workers",
- type=int,
- default=10,
- help=(
- "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ),
- )
- parser.add_argument(
- "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
- )
- parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
- parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
- parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
- parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
- parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
- parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
- parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
- parser.add_argument(
- "--prediction_type",
- type=str,
- default=None,
- help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
- )
- parser.add_argument(
- "--hub_model_id",
- type=str,
- default=None,
- help="The name of the repository to keep in sync with the local `output_dir`.",
- )
- parser.add_argument(
- "--logging_dir",
- type=str,
- default="logs",
- help=(
- "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
- " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
- ),
- )
- parser.add_argument(
- "--report_to",
- type=str,
- default="tensorboard",
- help=(
- 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
- ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
- ),
- )
- parser.add_argument(
- "--mixed_precision",
- type=str,
- default=None,
- choices=["no", "fp16", "bf16"],
- help=(
- "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
- " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
- " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
- ),
- )
- parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
- parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
-
- args = parser.parse_args()
- main(args)
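
One more note on the validation block in the file above: it calls `ema_model.store(...)` and `ema_model.copy_to(...)` but never `restore(...)`, so training would continue from the EMA weights after validation. The usual round trip with diffusers' `EMAModel` looks roughly like this (a pattern sketch; `run_validation` is a hypothetical caller-supplied callable):

```python
def validate_with_ema(args, model, ema_model, run_validation):
    # Hedged sketch of the store / copy_to / restore pattern.
    if args.use_ema:
        ema_model.store(model.parameters())    # stash the live training weights
        ema_model.copy_to(model.parameters())  # validate with the EMA weights
    try:
        run_validation(model)                  # hypothetical validation call
    finally:
        if args.use_ema:
            ema_model.restore(model.parameters())  # put the live weights back
```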
diff --git a/opensora/train/train_t2v_t5_feature.py b/opensora/train/train_t2v_t5_feature.py
deleted file mode 100644
index a8f3543e0..000000000
--- a/opensora/train/train_t2v_t5_feature.py
+++ /dev/null
@@ -1,825 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-A minimal training script for DiT using PyTorch DDP.
-"""
-import argparse
-import logging
-import math
-import os
-import shutil
-import sys
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-from PIL import Image
-from einops import rearrange
-from tqdm import tqdm
-from dataclasses import field, dataclass
-from torch.utils.data import DataLoader
-from copy import deepcopy
-
-import accelerate
-import torch
-from torch.nn import functional as F
-import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo
-from packaging import version
-from tqdm.auto import tqdm
-from transformers import HfArgumentParser, TrainingArguments, AutoTokenizer
-
-import diffusers
-from diffusers import DDPMScheduler, PNDMScheduler
-from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel, compute_snr
-from diffusers.utils import check_min_version, is_wandb_available
-
-from examples.rec_imvi_vae import custom_to_video
-from opensora.dataset import getdataset, ae_denorm
-from opensora.models.ae import getae, getae_wrapper
-from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
-from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
-from opensora.models.diffusion.latte.modeling_latte import LatteT2V
-from opensora.models.text_encoder import get_text_enc
-from opensora.utils.dataset_utils import Collate
-from opensora.models.ae import ae_stride_config, ae_channel_config
-from opensora.models.diffusion import Diffusion_models
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
-check_min_version("0.24.0")
-logger = get_logger(__name__)
-
-
-def generate_timestep_weights(args, num_timesteps):
- weights = torch.ones(num_timesteps)
-
- # Determine the indices to bias
- num_to_bias = int(args.timestep_bias_portion * num_timesteps)
-
- if args.timestep_bias_strategy == "later":
- bias_indices = slice(-num_to_bias, None)
- elif args.timestep_bias_strategy == "earlier":
- bias_indices = slice(0, num_to_bias)
- elif args.timestep_bias_strategy == "range":
- # Out of the possible 1000 timesteps, we might want to focus on e.g. 200-500.
- range_begin = args.timestep_bias_begin
- range_end = args.timestep_bias_end
- if range_begin < 0:
- raise ValueError(
- "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
- )
- if range_end > num_timesteps:
- raise ValueError(
- "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
- )
- bias_indices = slice(range_begin, range_end)
- else: # 'none' or any other string
- return weights
- if args.timestep_bias_multiplier <= 0:
- raise ValueError(
- "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
- " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
- " A timestep bias multiplier less than or equal to 0 is not allowed."
- )
-
- # Apply the bias
- weights[bias_indices] *= args.timestep_bias_multiplier
-
- # Normalize
- weights /= weights.sum()
-
- return weights
-
-
-#################################################################################
-# Training Loop #
-#################################################################################
-
-def main(args):
- logging_dir = Path(args.output_dir, args.logging_dir)
-
- accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
-
- accelerator = Accelerator(
- gradient_accumulation_steps=args.gradient_accumulation_steps,
- mixed_precision=args.mixed_precision,
- log_with=args.report_to,
- project_config=accelerator_project_config,
- )
-
- if args.report_to == "wandb":
- if not is_wandb_available():
- raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
- import wandb
-
- # Make one log on every process with the configuration for debugging.
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO,
- )
- logger.info(accelerator.state, main_process_only=False)
- if accelerator.is_local_main_process:
- transformers.utils.logging.set_verbosity_warning()
- diffusers.utils.logging.set_verbosity_info()
- else:
- transformers.utils.logging.set_verbosity_error()
- diffusers.utils.logging.set_verbosity_error()
-
- # If passed along, set the training seed now.
- if args.seed is not None:
- set_seed(args.seed)
-
- # Handle the repository creation
- if accelerator.is_main_process:
- if args.output_dir is not None:
- os.makedirs(args.output_dir, exist_ok=True)
-
- # if args.push_to_hub:
- # repo_id = create_repo(
- # repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
- # ).repo_id
-
- # Create model:
-
- diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule
- ae = getae_wrapper(args.ae)(args.ae_path).eval()
- if args.enable_tiling:
- ae.vae.enable_tiling()
- ae.vae.tile_overlap_factor = args.tile_overlap_factor
- # text_enc = get_text_enc(args).eval()
-
- ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
- args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
- args.ae_stride = args.ae_stride_h
- patch_size = args.model[-3:]
- patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
- args.patch_size = patch_size_h
- args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
- assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
- assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
- # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
- assert args.max_image_size % ae_stride_h == 0, f"Image size must be divisible by ae_stride_h, but found max_image_size ({args.max_image_size}), ae_stride_h ({ae_stride_h})."
-
- latent_size = (args.max_image_size // ae_stride_h, args.max_image_size // ae_stride_w)
-
- if getae_wrapper(args.ae) == CausalVQVAEModelWrapper or getae_wrapper(args.ae) == CausalVAEModelWrapper:
- args.video_length = video_length = args.num_frames // ae_stride_t + 1
- else:
- args.video_length = video_length = args.num_frames // ae_stride_t
- model = Diffusion_models[args.model](
- in_channels=ae_channel_config[args.ae],
- out_channels=ae_channel_config[args.ae] * 2,
- # caption_channels=4096,
- # cross_attention_dim=1152,
- attention_bias=True,
- sample_size=latent_size,
- num_vector_embeds=None,
- activation_fn="gelu-approximate",
- num_embeds_ada_norm=1000,
- use_linear_projection=False,
- only_cross_attention=False,
- double_self_attention=False,
- upcast_attention=False,
- # norm_type="ada_norm_single",
- norm_elementwise_affine=False,
- norm_eps=1e-6,
- attention_type='default',
- video_length=video_length,
- attention_mode=args.attention_mode,
- # compress_kv=args.compress_kv
- )
- model.gradient_checkpointing = args.gradient_checkpointing
-
- # # use pretrained model?
- if args.pretrained:
- if 'safetensors' in args.pretrained:
- from safetensors.torch import load_file as safe_load
- checkpoint = safe_load(args.pretrained, device="cpu")
- else:
- checkpoint = torch.load(args.pretrained, map_location='cpu')['model']
- model_state_dict = model.state_dict()
- missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
- logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
- logger.info(f'Successfully loaded {len(model.state_dict()) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')
- # load from pixart-alpha
- # pixelart_alpha = torch.load(args.pretrained, map_location='cpu')['state_dict']
- # checkpoint = {}
- # for k, v in pixelart_alpha.items():
- # if 'x_embedder' in k or 't_embedder' in k or 'y_embedder' in k:
- # checkpoint[k] = v
- # if k.startswith('blocks'):
- # k_spilt = k.split('.')
- # blk_id = str(int(k_spilt[1]) * 2)
- # k_spilt[1] = blk_id
- # new_k = '.'.join(k_spilt)
- # checkpoint[new_k] = v
- # missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
- # logger.info(f'Successfully load {len(model.state_dict()) - len(missing_keys)} keys from {args.pretrained}!')
-
- # Freeze vae and text encoders.
- ae.requires_grad_(False)
- # text_enc.requires_grad_(False)
- # Set model as trainable.
- model.train()
-
- # For mixed precision training we cast all non-trainable weights to half-precision
- # as these weights are only used for inference, keeping weights in full precision is not required.
- weight_dtype = torch.float32
- if accelerator.mixed_precision == "fp16":
- weight_dtype = torch.float16
- elif accelerator.mixed_precision == "bf16":
- weight_dtype = torch.bfloat16
-
- # Move unet, vae and text_encoder to device and cast to weight_dtype
- # The VAE is in float32 to avoid NaN losses.
- # ae.to(accelerator.device, dtype=torch.float32)
- ae.to(accelerator.device, dtype=weight_dtype)
- model.to(accelerator.device, dtype=weight_dtype)
- # text_enc.to(accelerator.device, dtype=weight_dtype)
-
- # Create EMA for the unet.
- if args.use_ema:
- ema_model = deepcopy(model)
- ema_model = EMAModel(ema_model.parameters(), model_cls=LatteT2V, model_config=ema_model.config)
-
- # `accelerate` 0.16.0 will have better support for customized saving
- if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
- # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
- def save_model_hook(models, weights, output_dir):
- if accelerator.is_main_process:
- if args.use_ema:
- ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))
-
- for i, model in enumerate(models):
- model.save_pretrained(os.path.join(output_dir, "model"))
- if weights: # Don't pop if empty
- # make sure to pop weight so that corresponding model is not saved again
- weights.pop()
-
- def load_model_hook(models, input_dir):
- if args.use_ema:
- load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), LatteT2V)
- ema_model.load_state_dict(load_model.state_dict())
- ema_model.to(accelerator.device)
- del load_model
-
- for i in range(len(models)):
- # pop models so that they are not loaded again
- model = models.pop()
-
- # load diffusers style into model
- load_model = LatteT2V.from_pretrained(input_dir, subfolder="model")
- model.register_to_config(**load_model.config)
-
- model.load_state_dict(load_model.state_dict())
- del load_model
-
- accelerator.register_save_state_pre_hook(save_model_hook)
- accelerator.register_load_state_pre_hook(load_model_hook)
-
- # Enable TF32 for faster training on Ampere GPUs,
- # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
- if args.allow_tf32:
- torch.backends.cuda.matmul.allow_tf32 = True
-
- if args.scale_lr:
- args.learning_rate = (
- args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
- )
-
- # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
- if args.use_8bit_adam:
- try:
- import bitsandbytes as bnb
- except ImportError:
- raise ImportError(
- "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
- )
-
- optimizer_class = bnb.optim.AdamW8bit
- else:
- optimizer_class = torch.optim.AdamW
-
- # Optimizer creation
- params_to_optimize = model.parameters()
- optimizer = optimizer_class(
- params_to_optimize,
- lr=args.learning_rate,
- betas=(args.adam_beta1, args.adam_beta2),
- weight_decay=args.adam_weight_decay,
- eps=args.adam_epsilon,
- )
-
- # Setup data:
- train_dataset = getdataset(args)
- train_dataloader = torch.utils.data.DataLoader(
- train_dataset,
- shuffle=True,
- # collate_fn=Collate(args), # TODO: do not enable dynamic mask in this point
- batch_size=args.train_batch_size,
- num_workers=args.dataloader_num_workers,
- )
-
- # Scheduler and math around the number of training steps.
- overrode_max_train_steps = False
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
- if args.max_train_steps is None:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- overrode_max_train_steps = True
-
- lr_scheduler = get_scheduler(
- args.lr_scheduler,
- optimizer=optimizer,
- num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
- num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
- )
-
- # Prepare everything with our `accelerator`.
- model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
- model, optimizer, train_dataloader, lr_scheduler
- )
-
- # We need to recalculate our total training steps as the size of the training dataloader may have changed.
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
- if overrode_max_train_steps:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- # Afterwards we recalculate our number of training epochs
- args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
- # We need to initialize the trackers we use, and also store our configuration.
- # The trackers initialize automatically on the main process.
- if accelerator.is_main_process:
- accelerator.init_trackers(args.output_dir, config=vars(args))
-
- # Train!
- total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
- logger.info("***** Running training *****")
- logger.info(f" Num examples = {len(train_dataset)}")
- logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
- logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
- logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
- logger.info(f" Total optimization steps = {args.max_train_steps}")
- global_step = 0
- first_epoch = 0
-
- # Potentially load in the weights and states from a previous save
- if args.resume_from_checkpoint:
- if args.resume_from_checkpoint != "latest":
- path = os.path.basename(args.resume_from_checkpoint)
- else:
- # Get the most recent checkpoint
- dirs = os.listdir(args.output_dir)
- dirs = [d for d in dirs if d.startswith("checkpoint")]
- dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
- path = dirs[-1] if len(dirs) > 0 else None
-
- if path is None:
- accelerator.print(
- f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
- )
- args.resume_from_checkpoint = None
- initial_global_step = 0
- else:
- accelerator.print(f"Resuming from checkpoint {path}")
- accelerator.load_state(os.path.join(args.output_dir, path))
- global_step = int(path.split("-")[1])
-
- initial_global_step = global_step
- first_epoch = global_step // num_update_steps_per_epoch
-
- else:
- initial_global_step = 0
-
- progress_bar = tqdm(
- range(0, args.max_train_steps),
- initial=initial_global_step,
- desc="Steps",
- # Only show the progress bar once on each machine.
- disable=not accelerator.is_local_main_process,
- )
-
- for epoch in range(first_epoch, args.num_train_epochs):
- train_loss = 0.0
- for step, (x, cond, cond_mask) in enumerate(train_dataloader):
- with accelerator.accumulate(model):
- # Sample noise that we'll add to the latents
- x = x.to(accelerator.device) # B C T H W
- # print(x.dtype)
- # attn_mask = attn_mask.to(device) # B T H W
- # assert torch.all(attn_mask.bool()), 'do not enable dynamic input'
- attn_mask = None
- cond = cond.to(accelerator.device, dtype=weight_dtype) # B L or B 1+num_images L
- cond_mask = cond_mask.to(accelerator.device) # B L or B 1+num_images L
-
- with torch.no_grad():
- # Map input images to latent space + normalize latents
- if args.use_image_num == 0:
- x = ae.encode(x.to(dtype=weight_dtype)) # B C T H W
- else:
- videos, images = x[:, :, :-args.use_image_num], x[:, :, -args.use_image_num:]
- videos = ae.encode(videos.to(dtype=weight_dtype)) # B C T H W
-
- # videos = ae.decode(videos.to(dtype=weight_dtype))[0]
- # videos = videos.transpose(0, 1)
- # custom_to_video(videos.to(torch.float32), fps=24, output_file='tmp.mp4')
-
- images = rearrange(images, 'b c t h w -> (b t) c 1 h w')
- images = ae.encode(images.to(dtype=weight_dtype))
-
- # images = ae.decode(images.to(dtype=weight_dtype))
- # x = images[0, 0, :, :, :].to(torch.float32)
- # x = x.squeeze()
- # x = x.detach().cpu().numpy()
- # x = np.clip(x, -1, 1)
- # x = (x + 1) / 2
- # x = (255 * x).astype(np.uint8)
- # x = x.transpose(1, 2, 0)
- # image = Image.fromarray(x)
- # image.save('tmp.jpg')
- # sys.exit()
-
- images = rearrange(images, '(b t) c 1 h w -> b c t h w', t=args.use_image_num)
- x = torch.cat([videos, images], dim=2)
-
- # print(args.use_image_num, x.shape, cond.shape, cond_mask.shape, cond_mask)
- model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
- encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
- t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=accelerator.device)
- loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
- loss = loss_dict["loss"].mean()
-
- # Gather the losses across all processes for logging (if we use distributed training).
- avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
- train_loss += avg_loss.item() / args.gradient_accumulation_steps
-
- # Backpropagate
- accelerator.backward(loss)
- if accelerator.sync_gradients:
- params_to_clip = model.parameters()
- accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
- optimizer.step()
- lr_scheduler.step()
- optimizer.zero_grad()
-
- # Checks if the accelerator has performed an optimization step behind the scenes
- if accelerator.sync_gradients:
- progress_bar.update(1)
- global_step += 1
- accelerator.log({"train_loss": train_loss}, step=global_step)
- train_loss = 0.0
-
- if args.use_deepspeed or accelerator.is_main_process:
- if global_step % args.checkpointing_steps == 0:
- # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
- if args.checkpoints_total_limit is not None:
- checkpoints = os.listdir(args.output_dir)
- checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
- checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
-
- # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
- if len(checkpoints) >= args.checkpoints_total_limit:
- num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
- removing_checkpoints = checkpoints[0:num_to_remove]
-
- logger.info(
- f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
- )
- logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
-
- for removing_checkpoint in removing_checkpoints:
- removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
- shutil.rmtree(removing_checkpoint)
-
- save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
- accelerator.save_state(save_path)
- logger.info(f"Saved state to {save_path}")
-
- logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
- progress_bar.set_postfix(**logs)
-
- if global_step >= args.max_train_steps:
- break
-
- if accelerator.is_main_process:
- validation_prompt = "The majestic beauty of a waterfall cascading down a cliff into a serene lake. The camera angle provides a bird's eye view of the waterfall."
- if global_step % args.checkpointing_steps == 0:
- logger.info(f"Running validation... \n"
- f"Generating {args.num_validation_videos} videos with prompt: {validation_prompt}")
- if args.use_ema:
- # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
- ema_model.store(model.parameters())
- ema_model.copy_to(model.parameters())
- if args.enable_tracker:
- with torch.no_grad():
- # create pipeline
- ae_ = getae_wrapper(args.ae)(args.ae_path).to(accelerator.device).eval()
- if args.enable_tiling:
- ae_.vae.enable_tiling()
- ae_.vae.tile_overlap_factor = args.tile_overlap_factor
- text_enc_ = get_text_enc(args).to(accelerator.device).eval()
- model_ = LatteT2V.from_pretrained(save_path, subfolder="model").to(accelerator.device).eval()
- diffusion_ = create_diffusion(str(500))
- tokenizer_ = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir='./cache_dir')
- videos = []
- for idx in range(args.num_validation_videos):
- with torch.autocast(device_type='cuda', dtype=weight_dtype):
- z = torch.randn(1, model_.in_channels, video_length,
- latent_size[0], latent_size[1], device=accelerator.device)
- text_tokens_and_mask = tokenizer_(
- validation_prompt,
- max_length=args.model_max_length,
- padding='max_length',
- truncation=True,
- return_attention_mask=True,
- add_special_tokens=True,
- return_tensors='pt'
- )
- input_ids = text_tokens_and_mask['input_ids'].to(accelerator.device)
- cond_mask = text_tokens_and_mask['attention_mask'].to(accelerator.device)
- cond = text_enc_(input_ids, cond_mask) # B L D
- # cond = text_enc(input_ids, cond_mask) # B L D
- model_kwargs = dict(encoder_hidden_states=cond, attention_mask=None, encoder_attention_mask=cond_mask)
- sample_fn = model_.forward
- # Sample images:
- samples = diffusion_.p_sample_loop(
- sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
- device=accelerator.device
- )
- samples = ae_.decode(samples)
- # Save and display images:
- video = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().contiguous() # t c h w
- videos.append(video)
-
- videos = torch.stack(videos).numpy()
- for tracker in accelerator.trackers:
- if tracker.name == "tensorboard":
- np_videos = np.stack([np.asarray(vid) for vid in videos])
- tracker.writer.add_video("validation", np_videos, global_step, fps=24)
- if tracker.name == "wandb":
- tracker.log(
- {
- "validation": [
- wandb.Video(video, caption=f"{i}: {validation_prompt}", fps=24)
- for i, video in enumerate(videos)
- ]
- }
- )
-
- del ae_, text_enc_, model_, diffusion_, tokenizer_
- # del ae_, model_, diffusion_, tokenizer_
- torch.cuda.empty_cache()
-
- accelerator.wait_for_everyone()
- accelerator.end_training()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--dataset", type=str, required=True)
- parser.add_argument("--data_path", type=str, required=True)
- parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="DiT-XL/122")
- parser.add_argument("--num_classes", type=int, default=1000)
- parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
- parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
- parser.add_argument("--sample_rate", type=int, default=4)
- parser.add_argument("--num_frames", type=int, default=16)
- parser.add_argument("--max_image_size", type=int, default=128)
- parser.add_argument("--dynamic_frames", action="store_true")
- parser.add_argument("--compress_kv", action="store_true")
- parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="math")
- parser.add_argument("--pretrained", type=str, default=None)
-
- parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
- parser.add_argument('--enable_tiling', action='store_true')
-
- parser.add_argument("--video_folder", type=str, default='')
- parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
- parser.add_argument("--model_max_length", type=int, default=120)
-
- parser.add_argument("--use_image_num", type=int, default=0)
- parser.add_argument("--use_img_from_vid", action="store_true")
- parser.add_argument("--enable_tracker", action="store_true")
- parser.add_argument("--use_deepspeed", action="store_true")
- parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
- parser.add_argument(
- "--num_validation_videos",
- type=int,
- default=2,
- help="Number of images that should be generated during validation with `validation_prompt`.",
- )
- parser.add_argument(
- "--output_dir",
- type=str,
- default=None,
- help="The output directory where the model predictions and checkpoints will be written.",
- )
- parser.add_argument(
- "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
- )
- parser.add_argument("--num_train_epochs", type=int, default=100)
- parser.add_argument(
- "--max_train_steps",
- type=int,
- default=None,
- help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
- )
- parser.add_argument(
- "--checkpointing_steps",
- type=int,
- default=500,
- help=(
- "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
- " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
- " training using `--resume_from_checkpoint`."
- ),
- )
- parser.add_argument(
- "--checkpoints_total_limit",
- type=int,
- default=None,
- help=("Max number of checkpoints to store."),
- )
- parser.add_argument(
- "--resume_from_checkpoint",
- type=str,
- default=None,
- help=(
- "Whether training should be resumed from a previous checkpoint. Use a path saved by"
- ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
- ),
- )
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument(
- "--gradient_checkpointing",
- action="store_true",
- help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
- )
- parser.add_argument(
- "--learning_rate",
- type=float,
- default=1e-4,
- help="Initial learning rate (after the potential warmup period) to use.",
- )
- parser.add_argument(
- "--scale_lr",
- action="store_true",
- default=False,
- help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
- )
- parser.add_argument(
- "--lr_scheduler",
- type=str,
- default="constant",
- help=(
- 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
- ' "constant", "constant_with_warmup"]'
- ),
- )
- parser.add_argument(
- "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
- )
- parser.add_argument(
- "--timestep_bias_strategy",
- type=str,
- default="none",
- choices=["earlier", "later", "range", "none"],
- help=(
- "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
- " Choices: ['earlier', 'later', 'range', 'none']."
- " The default is 'none', which means no bias is applied, and training proceeds normally."
- " The value of 'later' will increase the frequency of the model's final training timesteps."
- ),
- )
- parser.add_argument(
- "--timestep_bias_multiplier",
- type=float,
- default=1.0,
- help=(
- "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
- " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
- ),
- )
- parser.add_argument(
- "--timestep_bias_begin",
- type=int,
- default=0,
- help=(
- "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
- " Defaults to zero, which equates to having no specific bias."
- ),
- )
- parser.add_argument(
- "--timestep_bias_end",
- type=int,
- default=1000,
- help=(
- "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
- " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
- ),
- )
- parser.add_argument(
- "--timestep_bias_portion",
- type=float,
- default=0.25,
- help=(
- "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased."
- " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
- " whether the biased portions are in the earlier or later timesteps."
- ),
- )
- parser.add_argument(
- "--snr_gamma",
- type=float,
- default=None,
- help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
- "More details here: https://arxiv.org/abs/2303.09556.",
- )
- parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
- parser.add_argument(
- "--allow_tf32",
- action="store_true",
- help=(
- "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
- " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
- ),
- )
- parser.add_argument(
- "--dataloader_num_workers",
- type=int,
- default=10,
- help=(
- "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
- ),
- )
- parser.add_argument(
- "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
- )
- parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
- parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
- parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
- parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
- parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
- parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
- parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
- parser.add_argument(
- "--prediction_type",
- type=str,
- default=None,
- help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
- )
- parser.add_argument(
- "--hub_model_id",
- type=str,
- default=None,
- help="The name of the repository to keep in sync with the local `output_dir`.",
- )
- parser.add_argument(
- "--logging_dir",
- type=str,
- default="logs",
- help=(
- "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
- " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
- ),
- )
- parser.add_argument(
- "--report_to",
- type=str,
- default="tensorboard",
- help=(
- 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
- ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
- ),
- )
- parser.add_argument(
- "--mixed_precision",
- type=str,
- default=None,
- choices=["no", "fp16", "bf16"],
- help=(
- "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
- " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
- " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
- ),
- )
- parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
- parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
-
- args = parser.parse_args()
- main(args)
\ No newline at end of file
diff --git a/opensora/train/train_videogpt.py b/opensora/train/train_videogpt.py
deleted file mode 100644
index e953af872..000000000
--- a/opensora/train/train_videogpt.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import sys
-
-sys.path.append(".")
-
-from opensora.models.ae.videobase.dataset_videobase import VideoDataset
-from opensora.models.ae.videobase import (
- VQVAEModel,
- VQVAEConfiguration,
- VQVAETrainer,
-)
-import argparse
-from typing import Optional
-from accelerate.utils import set_seed
-from transformers import HfArgumentParser, TrainingArguments
-from dataclasses import dataclass, field, asdict
-
-
-@dataclass
-class VQVAEArgument:
- embedding_dim: int = (field(default=256),)
- n_codes: int = (field(default=2048),)
- n_hiddens: int = (field(default=240),)
- n_res_layers: int = (field(default=4),)
- resolution: int = (field(default=128),)
- sequence_length: int = (field(default=16),)
- downsample: str = (field(default="4,4,4"),)
- no_pos_embd: bool = (True,)
- data_path: str = field(default=None, metadata={"help": "data path"})
-
-
-@dataclass
-class VQVAETrainingArgument(TrainingArguments):
- remove_unused_columns: Optional[bool] = field(
- default=False,
- metadata={
- "help": "Remove columns not required by the model when using an nlp.Dataset."
- },
- )
-
-
-def train(args, vqvae_args: VQVAEArgument, training_args: VQVAETrainingArgument):
- # Load Config
- config = VQVAEConfiguration(
- embedding_dim=vqvae_args.embedding_dim,
- n_codes=vqvae_args.n_codes,
- n_hiddens=vqvae_args.n_hiddens,
- n_res_layers=vqvae_args.n_res_layers,
- resolution=vqvae_args.resolution,
- sequence_length=vqvae_args.sequence_length,
- downsample=vqvae_args.downsample,
- no_pos_embd=vqvae_args.no_pos_embd,
- )
- # Load Model
- model = VQVAEModel(config)
- # Load Dataset
- dataset = VideoDataset(
- args.data_path,
- sequence_length=args.sequence_length,
- resolution=config.resolution,
- )
- # Load Trainer
- trainer = VQVAETrainer(model, training_args, train_dataset=dataset)
- trainer.train()
-
-
-if __name__ == "__main__":
- parser = HfArgumentParser((VQVAEArgument, VQVAETrainingArgument))
- vqvae_args, training_args = parser.parse_args_into_dataclasses()
- args = argparse.Namespace(**vars(vqvae_args), **vars(training_args))
- set_seed(args.seed)
-
- train(args, vqvae_args, training_args)
diff --git a/opensora/utils/communications.py b/opensora/utils/communications.py
new file mode 100644
index 000000000..c7a79e3a6
--- /dev/null
+++ b/opensora/utils/communications.py
@@ -0,0 +1,122 @@
+import torch
+import torch.distributed as dist
+from einops import rearrange
+from opensora.utils.parallel_states import nccl_info
+
+def broadcast(input_: torch.Tensor):
+ sp_size = nccl_info.world_size
+ src = nccl_info.rank // sp_size * sp_size
+ dist.broadcast(input_, src=src, group=nccl_info.group)
+
+def _all_to_all(
+ input_: torch.Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+):
+ group = nccl_info.group
+ sp_size = nccl_info.world_size
+ input_list = [t.contiguous() for t in torch.tensor_split(input_, sp_size, scatter_dim)]
+ output_list = [torch.empty_like(input_list[0]) for _ in range(sp_size)]
+ dist.all_to_all(output_list, input_list, group=group)
+ return torch.cat(output_list, dim=gather_dim).contiguous()
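+
+# Illustrative shape check (assumed layout, not part of the training path):
+# with sp_size = 4 and an input of shape (B, S, H, D), scatter_dim=2 and
+# gather_dim=1 split the H heads across the group and gather along the
+# sequence, giving (B, S * 4, H // 4, D); swapping the two dims inverts it.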
+
+def _single_all_to_all(
+ input_: torch.Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+ enable_HCCL=False,
+):
+
+ sp_size = nccl_info.world_size
+ inp_shape = list(input_.shape)
+ inp_shape[scatter_dim] = inp_shape[scatter_dim] // sp_size
+ if scatter_dim < 1:
+ input_t = input_.reshape(
+ [sp_size, inp_shape[scatter_dim]] + \
+ inp_shape[scatter_dim + 1:]
+ )
+ else:
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
+ input_t = input_.reshape(
+ [-1, sp_size, inp_shape[scatter_dim]] + \
+ inp_shape[scatter_dim + 1:]
+ ).transpose(0, 1).contiguous()
+
+ output = torch.empty_like(input_t)
+ dist.all_to_all_single(output, input_t, group=nccl_info.group)
+ # if scattering the seq-dim, transpose the heads back to the original dimension
+ if scatter_dim < 1:
+ output = output.transpose(0, 1).contiguous()
+
+ return output.reshape(
+ inp_shape[: gather_dim] + [inp_shape[gather_dim] * sp_size, ] + inp_shape[gather_dim + 1:])
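+
+# Net effect (sketch): shape[scatter_dim] is divided by sp_size while
+# shape[gather_dim] is multiplied by it, e.g. a sequence-sharded (S, B, H)
+# input with scatter_dim=1, gather_dim=0 becomes (S * sp_size, B // sp_size, H).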
+
+
+class _AllToAll(torch.autograd.Function):
+ """All-to-all communication.
+
+ Args:
+ input_: input matrix
+ process_group: communication group
+ scatter_dim: scatter dimension
+ gather_dim: gather dimension
+ """
+
+ @staticmethod
+ def forward(ctx, input_, scatter_dim, gather_dim, all_to_all_func):
+ ctx.scatter_dim = scatter_dim
+ ctx.gather_dim = gather_dim
+ ctx.all_to_all = all_to_all_func
+ output = ctx.all_to_all(input_, scatter_dim, gather_dim)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad_output = ctx.all_to_all(
+ grad_output,
+ ctx.gather_dim,
+ ctx.scatter_dim,
+ )
+ return (
+ grad_output,
+ None,
+ None,
+ None,
+ )
+
+def all_to_all_SBH(
+ input_: torch.Tensor,
+ scatter_dim: int = 1,
+ gather_dim: int = 0,
+):
+ return _AllToAll.apply(input_, scatter_dim, gather_dim, _single_all_to_all)
+
+def all_to_all_BSND(
+ input_: torch.Tensor,
+ scatter_dim: int = 2,
+ gather_dim: int = 1,
+):
+ return _AllToAll.apply(input_, scatter_dim, gather_dim, _all_to_all)
+
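+# Usage sketch (illustrative shapes): shard a (B, S/sp, N, D) attention input
+# over the sequence-parallel group before attention, then restore it after.
+#
+#   q = all_to_all_BSND(q)                               # (B, S/sp, N, D) -> (B, S, N/sp, D)
+#   # ... attention over the full sequence ...
+#   q = all_to_all_BSND(q, scatter_dim=1, gather_dim=2)  # back to (B, S/sp, N, D)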
+
+def prepare_parallel_data(hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask, use_image_num):
+ def all_to_all(hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask):
+ hidden_states = _single_all_to_all(hidden_states, scatter_dim=2, gather_dim=0, enable_HCCL=True)
+ encoder_hidden_states = _single_all_to_all(encoder_hidden_states, scatter_dim=1, gather_dim=0, enable_HCCL=True)
+ attention_mask = _single_all_to_all(attention_mask, scatter_dim=1, gather_dim=0, enable_HCCL=True)
+ encoder_attention_mask = _single_all_to_all(encoder_attention_mask, scatter_dim=1, gather_dim=0, enable_HCCL=True)
+ return hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask
+
+ sp_size = nccl_info.world_size
+ frame = hidden_states.shape[2]
+ assert frame % sp_size == 0, "frame should be a multiple of sp_size"
+
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b 1 (n x) h -> b n x h',
+ n=sp_size, x=encoder_hidden_states.shape[2]//sp_size).contiguous()
+ hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask = all_to_all(hidden_states,
+ encoder_hidden_states,
+ attention_mask.repeat(1, sp_size, 1, 1),
+ encoder_attention_mask.repeat(1, sp_size, 1))
+
+ return hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask, use_image_num
\ No newline at end of file
diff --git a/opensora/utils/dataset_utils.py b/opensora/utils/dataset_utils.py
index 8ab656446..2f9169b50 100644
--- a/opensora/utils/dataset_utils.py
+++ b/opensora/utils/dataset_utils.py
@@ -3,6 +3,16 @@
import decord
from torch.nn import functional as F
import torch
+from typing import Optional, List
+from torch.utils.data import Sampler
+from collections import Counter, defaultdict
+import random
+
+from opensora.utils.mask_utils import MaskProcessor
IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
@@ -43,99 +53,315 @@ def pad_to_multiple(number, ds_stride):
return number + padding
class Collate:
- def __init__(self, args):
- self.max_image_size = args.max_image_size
+    def __init__(self, args, YOLOmodel):
+ self.batch_size = args.train_batch_size
+ self.group_data = args.group_data
+ self.force_resolution = args.force_resolution
+
+ self.max_height = args.max_height
+ self.max_width = args.max_width
self.ae_stride = args.ae_stride
+
self.ae_stride_t = args.ae_stride_t
self.ae_stride_thw = (self.ae_stride_t, self.ae_stride, self.ae_stride)
- self.ae_stride_1hw = (1, self.ae_stride, self.ae_stride)
self.patch_size = args.patch_size
self.patch_size_t = args.patch_size_t
- self.patch_size_thw = (self.patch_size_t, self.patch_size, self.patch_size)
- self.patch_size_1hw = (1, self.patch_size, self.patch_size)
self.num_frames = args.num_frames
self.use_image_num = args.use_image_num
- self.max_thw = (self.num_frames, self.max_image_size, self.max_image_size)
- self.max_1hw = (1, self.max_image_size, self.max_image_size)
+ self.max_thw = (self.num_frames, self.max_height, self.max_width)
+
+        self.mask_processor = MaskProcessor(args, YOLOmodel)
+
def package(self, batch):
- batch_tubes_vid = [i['video_data']['video'] for i in batch] # b [c t h w]
- input_ids_vid = torch.stack([i['video_data']['input_ids'] for i in batch]) # b 1 l
- cond_mask_vid = torch.stack([i['video_data']['cond_mask'] for i in batch]) # b 1 l
- batch_tubes_img, input_ids_img, cond_mask_img = None, None, None
- if self.use_image_num != 0:
- batch_tubes_img = [j for i in batch for j in i['image_data']['image']] # b*num_img [c 1 h w]
- input_ids_img = torch.stack([i['image_data']['input_ids'] for i in batch]) # b image_num l
- cond_mask_img = torch.stack([i['image_data']['cond_mask'] for i in batch]) # b image_num l
- return batch_tubes_vid, input_ids_vid, cond_mask_vid, batch_tubes_img, input_ids_img, cond_mask_img
+ batch_tubes = [i['pixel_values'] for i in batch] # b [c t h w]
+ input_ids = [i['input_ids'] for i in batch] # b [1 l]
+ cond_mask = [i['cond_mask'] for i in batch] # b [1 l]
+ motion_score = [i['motion_score'] for i in batch] # List[float]
+ assert all([i is None for i in motion_score]) or all([i is not None for i in motion_score])
+ if all([i is None for i in motion_score]):
+ motion_score = None
+ return batch_tubes, input_ids, cond_mask, motion_score
def __call__(self, batch):
- batch_tubes_vid, input_ids_vid, cond_mask_vid, batch_tubes_img, input_ids_img, cond_mask_img = self.package(batch)
+ batch_tubes, input_ids, cond_mask, motion_score = self.package(batch)
+        # concat along channel: video (c) + masked video (c) + mask (1) -> b (2c+1) t h w
+ masked_batch_tubes = []
+
+        for pixel_values in batch_tubes:
+            masked_video, video, mask = self.mask_processor(pixel_values)
+            masked_batch_tube = torch.cat((video, masked_video, mask), dim=1)
+            masked_batch_tubes.append(masked_batch_tube)
ds_stride = self.ae_stride * self.patch_size
t_ds_stride = self.ae_stride_t * self.patch_size_t
- if self.use_image_num == 0:
- pad_batch_tubes, attention_mask = self.process(batch_tubes_vid, t_ds_stride, ds_stride,
- self.max_thw, self.ae_stride_thw, self.patch_size_thw, extra_1=True)
- # attention_mask: b t h w
- input_ids, cond_mask = input_ids_vid.squeeze(1), cond_mask_vid.squeeze(1) # b 1 l -> b l
- else:
- pad_batch_tubes_vid, attention_mask_vid = self.process(batch_tubes_vid, t_ds_stride, ds_stride,
- self.max_thw, self.ae_stride_thw, self.patch_size_thw, extra_1=True)
- # attention_mask_vid: b t h w
- pad_batch_tubes_img, attention_mask_img = self.process(batch_tubes_img, 1, ds_stride,
- self.max_1hw, self.ae_stride_1hw, self.patch_size_1hw, extra_1=False)
- pad_batch_tubes_img = rearrange(pad_batch_tubes_img, '(b i) c 1 h w -> b c i h w', i=self.use_image_num)
- attention_mask_img = rearrange(attention_mask_img, '(b i) 1 h w -> b i h w', i=self.use_image_num)
- pad_batch_tubes = torch.cat([pad_batch_tubes_vid, pad_batch_tubes_img], dim=2) # concat at temporal, video first
- # attention_mask_img: b num_img h w
- attention_mask = torch.cat([attention_mask_vid, attention_mask_img], dim=1) # b t+num_img h w
- input_ids = torch.cat([input_ids_vid, input_ids_img], dim=1) # b 1+num_img hw
- cond_mask = torch.cat([cond_mask_vid, cond_mask_img], dim=1) # b 1+num_img hw
- return pad_batch_tubes, attention_mask, input_ids, cond_mask
-
- def process(self, batch_tubes, t_ds_stride, ds_stride, max_thw, ae_stride_thw, patch_size_thw, extra_1):
+
+ pad_batch_tubes, attention_mask, input_ids, cond_mask, motion_score = self.process(masked_batch_tubes, input_ids,
+ cond_mask, motion_score,
+ t_ds_stride, ds_stride,
+ self.max_thw, self.ae_stride_thw)
+
+        assert not torch.any(torch.isnan(pad_batch_tubes)), 'NaN found in pad_batch_tubes'
+ return pad_batch_tubes, attention_mask, input_ids, cond_mask, motion_score
+
+ def process(self, batch_tubes, input_ids, cond_mask, motion_score, t_ds_stride, ds_stride, max_thw, ae_stride_thw):
# pad to max multiple of ds_stride
batch_input_size = [i.shape for i in batch_tubes] # [(c t h w), (c t h w)]
- max_t, max_h, max_w = max_thw
- pad_max_t, pad_max_h, pad_max_w = pad_to_multiple(max_t-1 if extra_1 else max_t, t_ds_stride), \
+ assert len(batch_input_size) == self.batch_size
+        if self.group_data or self.batch_size == 1:
+ len_each_batch = batch_input_size
+ idx_length_dict = dict([*zip(list(range(self.batch_size)), len_each_batch)])
+ count_dict = Counter(len_each_batch)
+ if len(count_dict) != 1:
+ sorted_by_value = sorted(count_dict.items(), key=lambda item: item[1])
+                pick_length = sorted_by_value[-1][0]  # the most frequent length
+ candidate_batch = [idx for idx, length in idx_length_dict.items() if length == pick_length]
+ random_select_batch = [random.choice(candidate_batch) for _ in range(len(len_each_batch) - len(candidate_batch))]
+ print(batch_input_size, idx_length_dict, count_dict, sorted_by_value, pick_length, candidate_batch, random_select_batch)
+ pick_idx = candidate_batch + random_select_batch
+
+ batch_tubes = [batch_tubes[i] for i in pick_idx]
+ batch_input_size = [i.shape for i in batch_tubes] # [(c t h w), (c t h w)]
+ input_ids = [input_ids[i] for i in pick_idx] # b [1, l]
+ cond_mask = [cond_mask[i] for i in pick_idx] # b [1, l]
+ if motion_score is not None:
+ motion_score = [motion_score[i] for i in pick_idx] # b [1, l]
+
+ for i in range(1, self.batch_size):
+ assert batch_input_size[0] == batch_input_size[i]
+ max_t = max([i[1] for i in batch_input_size])
+ max_h = max([i[2] for i in batch_input_size])
+ max_w = max([i[3] for i in batch_input_size])
+ else:
+ max_t, max_h, max_w = max_thw
+ pad_max_t, pad_max_h, pad_max_w = pad_to_multiple(max_t-1+self.ae_stride_t, t_ds_stride), \
pad_to_multiple(max_h, ds_stride), \
pad_to_multiple(max_w, ds_stride)
- pad_max_t = pad_max_t + 1 if extra_1 else pad_max_t
- each_pad_t_h_w = [[pad_max_t - i.shape[1],
- pad_max_h - i.shape[2],
- pad_max_w - i.shape[3]] for i in batch_tubes]
- pad_batch_tubes = [F.pad(im,
- (0, pad_w,
- 0, pad_h,
- 0, pad_t), value=0) for (pad_t, pad_h, pad_w), im in zip(each_pad_t_h_w, batch_tubes)]
+ pad_max_t = pad_max_t + 1 - self.ae_stride_t
+ each_pad_t_h_w = [
+ [
+ pad_max_t - i.shape[1],
+ pad_max_h - i.shape[2],
+ pad_max_w - i.shape[3]
+ ] for i in batch_tubes
+ ]
+ pad_batch_tubes = [
+ F.pad(im, (0, pad_w, 0, pad_h, 0, pad_t), value=0)
+ for (pad_t, pad_h, pad_w), im in zip(each_pad_t_h_w, batch_tubes)
+ ]
pad_batch_tubes = torch.stack(pad_batch_tubes, dim=0)
- # make attention_mask
- first_channel_first_frame, first_channel_other_frame = pad_batch_tubes[:, :1, :1], pad_batch_tubes[:, :1, 1:] # first channel to make attention_mask
- attention_mask_first_frame = F.max_pool3d(first_channel_first_frame, kernel_size=(1, *ae_stride_thw[1:]), stride=(1, *ae_stride_thw[1:]))
- if first_channel_other_frame.numel() != 0:
- attention_mask_other_frame = F.max_pool3d(first_channel_other_frame, kernel_size=ae_stride_thw, stride=ae_stride_thw)
- attention_mask = torch.cat([attention_mask_first_frame, attention_mask_other_frame], dim=2)
- else:
- attention_mask = attention_mask_first_frame
- attention_mask = attention_mask[:, 0].bool().float() # b t h w, do not channel
- # max_tube_size = [pad_max_t, pad_max_h, pad_max_w]
- # max_latent_size = [((max_tube_size[0]-1) // ae_stride_thw[0] + 1) if extra_1 else (max_tube_size[0] // ae_stride_thw[0]),
- # max_tube_size[1] // ae_stride_thw[1],
- # max_tube_size[2] // ae_stride_thw[2]]
- # max_patchify_latent_size = [((max_latent_size[0]-1) // patch_size_thw[0] + 1) if extra_1 else (max_latent_size[0] // patch_size_thw[0]),
- # max_latent_size[1] // patch_size_thw[1],
- # max_latent_size[2] // patch_size_thw[2]]
- # valid_patchify_latent_size = [[int(math.ceil((i[1]-1) / t_ds_stride)) + 1 if extra_1 else int(math.ceil(i[1] / t_ds_stride)),
- # int(math.ceil(i[2] / ds_stride)),
- # int(math.ceil(i[3] / ds_stride))] for i in batch_input_size]
- # attention_mask = [F.pad(torch.ones(i),
- # (0, max_patchify_latent_size[2] - i[2],
- # 0, max_patchify_latent_size[1] - i[1],
- # 0, max_patchify_latent_size[0] - i[0]), value=0) for i in valid_patchify_latent_size]
- # attention_mask = torch.stack(attention_mask) # b t h w
-
- return pad_batch_tubes, attention_mask
+
+ max_tube_size = [pad_max_t, pad_max_h, pad_max_w]
+ max_latent_size = [
+ ((max_tube_size[0]-1) // ae_stride_thw[0] + 1),
+ max_tube_size[1] // ae_stride_thw[1],
+ max_tube_size[2] // ae_stride_thw[2]
+ ]
+ valid_latent_size = [
+ [
+ int(math.ceil((i[1]-1) / ae_stride_thw[0])) + 1,
+ int(math.ceil(i[2] / ae_stride_thw[1])),
+ int(math.ceil(i[3] / ae_stride_thw[2]))
+ ] for i in batch_input_size]
+ attention_mask = [
+ F.pad(torch.ones(i, dtype=pad_batch_tubes.dtype), (0, max_latent_size[2] - i[2],
+ 0, max_latent_size[1] - i[1],
+ 0, max_latent_size[0] - i[0]), value=0) for i in valid_latent_size]
+ attention_mask = torch.stack(attention_mask) # b t h w
+ if self.batch_size == 1 or self.group_data:
+ if not torch.all(attention_mask.bool()):
+ print(batch_input_size, (max_t, max_h, max_w), (pad_max_t, pad_max_h, pad_max_w), each_pad_t_h_w, max_latent_size, valid_latent_size)
+ assert torch.all(attention_mask.bool())
+
+ input_ids = torch.stack(input_ids) # b 1 l
+ cond_mask = torch.stack(cond_mask) # b 1 l
+ motion_score = torch.tensor(motion_score) if motion_score is not None else motion_score # b
+
+ return pad_batch_tubes, attention_mask, input_ids, cond_mask, motion_score
+
+
+
+
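+# Shape sketch for Collate.process (illustrative numbers): with
+# ae_stride_thw = (4, 8, 8), a (c, 29, 480, 640) tube maps to a valid latent
+# grid of ((29 - 1) // 4 + 1, 480 // 8, 640 // 8) = (8, 60, 80); the
+# attention_mask marks that region with ones inside the padded latent canvas.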
+def group_data_fun(lengths, generator=None):
+ counter = Counter(lengths)
+ grouped_indices = defaultdict(list)
+ for idx, item in enumerate(lengths):
+ grouped_indices[counter[item]].append(idx)
+ grouped_indices = dict(grouped_indices)
+ sorted_indices = [grouped_indices[count] for count in sorted(grouped_indices, reverse=True)]
+
+ shuffle_sorted_indices = []
+ for indice in sorted_indices:
+ shuffle_idx = torch.randperm(len(indice), generator=generator).tolist()
+ shuffle_sorted_indices.extend([indice[idx] for idx in shuffle_idx])
+ return shuffle_sorted_indices
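+
+# Example (assumed behaviour): lengths = [8, 8, 8, 4, 4, 2] gives counts
+# {8: 3, 4: 2, 2: 1}, so indices come out group by group from the most common
+# length to the rarest, shuffled within each group, e.g. [2, 0, 1, 4, 3, 5].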
+
+def last_group_data_fun(shuffled_megabatches, lengths):
+ re_shuffled_megabatches = []
+ for i_megabatch, megabatch in enumerate(shuffled_megabatches):
+ re_megabatch = []
+ for i_batch, batch in enumerate(megabatch):
+ assert len(batch) != 0
+
+ len_each_batch = [lengths[i] for i in batch]
+ idx_length_dict = dict([*zip(batch, len_each_batch)])
+ count_dict = Counter(len_each_batch)
+ if len(count_dict) != 1:
+ sorted_by_value = sorted(count_dict.items(), key=lambda item: item[1])
+                pick_length = sorted_by_value[-1][0]  # the most frequent length
+                candidate_batch = [idx for idx, length in idx_length_dict.items() if length == pick_length]
+                random_select_batch = [random.choice(candidate_batch) for _ in range(len(len_each_batch) - len(candidate_batch))]
+                batch = candidate_batch + random_select_batch
+
+            for i in range(1, len(batch)):
+                assert lengths[batch[0]] == lengths[batch[i]]
+ re_megabatch.append(batch)
+ re_shuffled_megabatches.append(re_megabatch)
+
+ return re_shuffled_megabatches
+
+def split_to_even_chunks(indices, lengths, num_chunks, batch_size):
+ """
+ Split a list of indices into `chunks` chunks of roughly equal lengths.
+ """
+
+ if len(indices) % num_chunks != 0:
+ chunks = [indices[i::num_chunks] for i in range(num_chunks)]
+ else:
+ num_indices_per_chunk = len(indices) // num_chunks
+
+ chunks = [[] for _ in range(num_chunks)]
+ cur_chunk = 0
+ for index in indices:
+ chunks[cur_chunk].append(index)
+ if len(chunks[cur_chunk]) == num_indices_per_chunk:
+ cur_chunk += 1
+ pad_chunks = []
+ for idx, chunk in enumerate(chunks):
+ if batch_size != len(chunk):
+ assert batch_size > len(chunk)
+ if len(chunk) != 0:
+ chunk = chunk + [random.choice(chunk) for _ in range(batch_size - len(chunk))]
+ else:
+ chunk = random.choice(pad_chunks)
+ print(chunks[idx], '->', chunk)
+ pad_chunks.append(chunk)
+ return pad_chunks
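+
+# Example (illustrative): indices = [0, 1, 2, 3, 4, 5, 6] with num_chunks = 2
+# and batch_size = 4 gives strided chunks [0, 2, 4, 6] and [1, 3, 5]; the
+# short chunk is then padded with a random repeat of one of its own members.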
+
+def get_length_grouped_indices(lengths, batch_size, world_size, gradient_accumulation_size, initial_global_step, generator=None, group_data=False, seed=42):
+ # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
+ if generator is None:
+ generator = torch.Generator().manual_seed(seed) # every rank will generate a fixed order but random index
+
+ if group_data:
+ indices = group_data_fun(lengths, generator)
+ else:
+ indices = torch.randperm(len(lengths), generator=generator).tolist()
+
+ megabatch_size = world_size * batch_size
+ megabatches = [indices[i: i + megabatch_size] for i in range(0, len(lengths), megabatch_size)]
+ megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
+ megabatches = [split_to_even_chunks(megabatch, lengths, world_size, batch_size) for megabatch in megabatches]
+
+ indices_mega = torch.randperm(len(megabatches), generator=generator).tolist()
+
+ shuffled_megabatches = [megabatches[i] for i in indices_mega]
+ if group_data:
+ shuffled_megabatches = last_group_data_fun(shuffled_megabatches, lengths)
+
+ initial_global_step = initial_global_step * gradient_accumulation_size
+    print('total megabatches:', len(shuffled_megabatches))
+    print('megabatches already consumed:', len(shuffled_megabatches[:initial_global_step]))
+    shuffled_megabatches = shuffled_megabatches[initial_global_step:]
+    print('remaining megabatches:', len(shuffled_megabatches))
+
+ return [i for megabatch in shuffled_megabatches for batch in megabatch for i in batch]
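+
+# Pipeline sketch: shuffle (or length-group) all indices, cut them into
+# megabatches of world_size * batch_size, sort each megabatch by length and
+# split it into per-rank chunks, shuffle the megabatch order, then drop the
+# first initial_global_step * gradient_accumulation_size megabatches so a
+# resumed run does not revisit already-seen data.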
+
+
+class LengthGroupedSampler(Sampler):
+ r"""
+ Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
+ keeping a bit of randomness.
+ """
+
+ def __init__(
+ self,
+ batch_size: int,
+ world_size: int,
+ gradient_accumulation_size: int,
+ initial_global_step: int,
+ lengths: Optional[List[int]] = None,
+ group_data=False,
+ generator=None,
+ ):
+ if lengths is None:
+ raise ValueError("Lengths must be provided.")
+
+ self.batch_size = batch_size
+ self.world_size = world_size
+ self.initial_global_step = initial_global_step
+ self.gradient_accumulation_size = gradient_accumulation_size
+ self.lengths = lengths
+ self.group_data = group_data
+ self.generator = generator
+
+ def __len__(self):
+ return len(self.lengths) - self.initial_global_step * self.batch_size * self.world_size * self.gradient_accumulation_size
+
+ def __iter__(self):
+ indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size,
+ self.gradient_accumulation_size, self.initial_global_step,
+ group_data=self.group_data, generator=self.generator)
+ return iter(indices)
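+
+
+# Usage sketch (names are illustrative): pair the sampler with a DataLoader so
+# every rank draws length-matched batches.
+#
+#   sampler = LengthGroupedSampler(batch_size=4, world_size=8,
+#                                  gradient_accumulation_size=1,
+#                                  initial_global_step=0,
+#                                  lengths=dataset.lengths, group_data=True)
+#   loader = torch.utils.data.DataLoader(dataset, batch_size=4, sampler=sampler,
+#                                        collate_fn=Collate(args, yolo_model))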
diff --git a/opensora/utils/ema.py b/opensora/utils/ema.py
new file mode 100644
index 000000000..a906efee9
--- /dev/null
+++ b/opensora/utils/ema.py
@@ -0,0 +1,328 @@
+import contextlib
+import copy
+import random
+from typing import Any, Dict, Iterable, List, Optional, Union
+
+from diffusers.utils import (
+ deprecate,
+ is_torchvision_available,
+ is_transformers_available,
+)
+
+if is_transformers_available():
+ import transformers
+
+if is_torchvision_available():
+ from torchvision import transforms
+
+import numpy as np
+import torch
+
+
+# Adapted from diffusers-style ema https://github.com/huggingface/diffusers/blob/main/src/diffusers/training_utils.py#L263
+class EMAModel:
+ """
+ Exponential Moving Average of models weights
+ """
+
+ def __init__(
+ self,
+ parameters: Iterable[torch.nn.Parameter],
+ decay: float = 0.9999,
+ min_decay: float = 0.0,
+ update_after_step: int = 0,
+ use_ema_warmup: bool = False,
+ inv_gamma: Union[float, int] = 1.0,
+ power: Union[float, int] = 2 / 3,
+ model_cls: Optional[Any] = None,
+ model_config: Dict[str, Any] = None,
+ **kwargs,
+ ):
+ """
+ Args:
+ parameters (Iterable[torch.nn.Parameter]): The parameters to track.
+ decay (float): The decay factor for the exponential moving average.
+ min_decay (float): The minimum decay factor for the exponential moving average.
+ update_after_step (int): The number of steps to wait before starting to update the EMA weights.
+ use_ema_warmup (bool): Whether to use EMA warmup.
+ inv_gamma (float):
+ Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True.
+ power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True.
+ device (Optional[Union[str, torch.device]]): The device to store the EMA weights on. If None, the EMA
+ weights will be stored on CPU.
+
+ @crowsonkb's notes on EMA Warmup:
+ If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
+ to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
+ gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
+ at 215.4k steps).
+ """
+
+ if isinstance(parameters, torch.nn.Module):
+ deprecation_message = (
+ "Passing a `torch.nn.Module` to `ExponentialMovingAverage` is deprecated. "
+ "Please pass the parameters of the module instead."
+ )
+ deprecate(
+ "passing a `torch.nn.Module` to `ExponentialMovingAverage`",
+ "1.0.0",
+ deprecation_message,
+ standard_warn=False,
+ )
+ parameters = parameters.parameters()
+
+ # set use_ema_warmup to True if a torch.nn.Module is passed for backwards compatibility
+ use_ema_warmup = True
+
+ if kwargs.get("max_value", None) is not None:
+ deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead."
+ deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False)
+ decay = kwargs["max_value"]
+
+ if kwargs.get("min_value", None) is not None:
+ deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead."
+ deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False)
+ min_decay = kwargs["min_value"]
+
+ parameters = list(parameters)
+ self.shadow_params = [p.clone().detach() for p in parameters]
+
+ if kwargs.get("device", None) is not None:
+ deprecation_message = "The `device` argument is deprecated. Please use `to` instead."
+ deprecate("device", "1.0.0", deprecation_message, standard_warn=False)
+ self.to(device=kwargs["device"])
+
+ self.temp_stored_params = None
+
+ self.decay = decay
+ self.min_decay = min_decay
+ self.update_after_step = update_after_step
+ self.use_ema_warmup = use_ema_warmup
+ self.inv_gamma = inv_gamma
+ self.power = power
+ self.optimization_step = 0
+ self.cur_decay_value = None # set in `step()`
+
+ self.model_cls = model_cls
+ self.model_config = model_config
+
+ @classmethod
+ def extract_ema_kwargs(cls, kwargs):
+ """
+ Extracts the EMA kwargs from the kwargs of a class method.
+ """
+ ema_kwargs = {}
+ for key in [
+ "decay",
+ "min_decay",
+ "optimization_step",
+ "update_after_step",
+ "use_ema_warmup",
+ "inv_gamma",
+ "power",
+ ]:
+ if kwargs.get(key, None) is not None:
+ ema_kwargs[key] = kwargs.pop(key)
+ return ema_kwargs
+
+ @classmethod
+ def from_pretrained(cls, path, model_cls) -> "EMAModel":
+ config = model_cls.load_config(path)
+ ema_kwargs = cls.extract_ema_kwargs(config)
+ model = model_cls.from_pretrained(path)
+
+ ema_model = cls(model.parameters(), model_cls=model_cls, model_config=config)
+
+ ema_model.load_state_dict(ema_kwargs)
+ return ema_model
+
+ def save_pretrained(self, path):
+ if self.model_cls is None:
+ raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.")
+
+ if self.model_config is None:
+ raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.")
+
+ model = self.model_cls.from_config(self.model_config)
+ state_dict = self.state_dict()
+ state_dict.pop("shadow_params", None)
+
+ model.register_to_config(**state_dict)
+ self.copy_to(model.parameters())
+ model.save_pretrained(path)
+
+ def get_decay(self, optimization_step: int) -> float:
+ """
+ Compute the decay factor for the exponential moving average.
+ """
+ step = max(0, optimization_step - self.update_after_step - 1)
+
+ if step <= 0:
+ return 0.0
+
+ if self.use_ema_warmup:
+ cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power
+ else:
+ cur_decay_value = (1 + step) / (10 + step)
+
+ cur_decay_value = min(cur_decay_value, self.decay)
+ # make sure decay is not smaller than min_decay
+ cur_decay_value = max(cur_decay_value, self.min_decay)
+ return cur_decay_value
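+
+    # Numeric sketch: with use_ema_warmup, inv_gamma=1 and power=2/3 the decay
+    # is 1 - (1 + step) ** (-2/3), crossing ~0.999 near step 31.6k and ~0.9999
+    # near 1M steps, matching the warmup notes in __init__.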
+
+ @torch.no_grad()
+ def step(self, parameters: Iterable[torch.nn.Parameter]):
+ if isinstance(parameters, torch.nn.Module):
+ deprecation_message = (
+ "Passing a `torch.nn.Module` to `ExponentialMovingAverage.step` is deprecated. "
+ "Please pass the parameters of the module instead."
+ )
+ deprecate(
+ "passing a `torch.nn.Module` to `ExponentialMovingAverage.step`",
+ "1.0.0",
+ deprecation_message,
+ standard_warn=False,
+ )
+ parameters = parameters.parameters()
+
+ parameters = list(parameters)
+
+ self.optimization_step += 1
+
+ # Compute the decay factor for the exponential moving average.
+ decay = self.get_decay(self.optimization_step)
+ self.cur_decay_value = decay
+ one_minus_decay = 1 - decay
+
+ context_manager = contextlib.nullcontext
+ if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
+ import deepspeed
+
+ for s_param, param in zip(self.shadow_params, parameters):
+ if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
+ context_manager = deepspeed.zero.GatheredParameters(param, modifier_rank=None)
+
+ with context_manager():
+ if param.requires_grad:
+ s_param.sub_(one_minus_decay * (s_param - param))
+ else:
+ s_param.copy_(param)
+
+ def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+ """
+ Copy current averaged parameters into given collection of parameters.
+
+ Args:
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+ updated with the stored moving averages. If `None`, the parameters with which this
+ `ExponentialMovingAverage` was initialized will be used.
+ """
+ parameters = list(parameters)
+ for s_param, param in zip(self.shadow_params, parameters):
+ param.data.copy_(s_param.to(param.device).data)
+
+
+ def to(self, device=None, dtype=None) -> None:
+ r"""Move internal buffers of the ExponentialMovingAverage to `device`.
+
+ Args:
+ device: like `device` argument to `torch.Tensor.to`
+ """
+ # .to() on the tensors handles None correctly
+ self.shadow_params = [
+ p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
+ for p in self.shadow_params
+ ]
+
+ def state_dict(self) -> dict:
+ r"""
+ Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during
+ checkpointing to save the ema state dict.
+ """
+ # Following PyTorch conventions, references to tensors are returned:
+ # "returns a reference to the state and not its copy!" -
+ # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict
+ return {
+ "decay": self.decay,
+ "min_decay": self.min_decay,
+ "optimization_step": self.optimization_step,
+ "update_after_step": self.update_after_step,
+ "use_ema_warmup": self.use_ema_warmup,
+ "inv_gamma": self.inv_gamma,
+ "power": self.power,
+ "shadow_params": self.shadow_params,
+ }
+
+ def store(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+ r"""
+        Save the current parameters for restoring later.
+
+        Args:
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+ temporarily stored.
+ """
+ self.temp_stored_params = [param.detach().cpu().clone() for param in parameters]
+
+ def restore(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+ r"""
+        Restore the parameters stored with the `store` method. Useful to validate the model with EMA
+        parameters without affecting the original optimization process. Store the parameters before the
+        `copy_to()` method; after validation (or model saving), use this to restore the former parameters.
+
+        Args:
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+ updated with the stored parameters. If `None`, the parameters with which this
+ `ExponentialMovingAverage` was initialized will be used.
+ """
+ if self.temp_stored_params is None:
+ raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`")
+ for c_param, param in zip(self.temp_stored_params, parameters):
+ param.data.copy_(c_param.data)
+
+ # Better memory-wise.
+ self.temp_stored_params = None
+
+ def load_state_dict(self, state_dict: dict) -> None:
+ r"""
+        Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing
+        to restore the ema state dict.
+
+        Args:
+ state_dict (dict): EMA state. Should be an object returned
+ from a call to :meth:`state_dict`.
+ """
+ # deepcopy, to be consistent with module API
+ state_dict = copy.deepcopy(state_dict)
+
+ self.decay = state_dict.get("decay", self.decay)
+ if self.decay < 0.0 or self.decay > 1.0:
+ raise ValueError("Decay must be between 0 and 1")
+
+ self.min_decay = state_dict.get("min_decay", self.min_decay)
+ if not isinstance(self.min_decay, float):
+ raise ValueError("Invalid min_decay")
+
+ self.optimization_step = state_dict.get("optimization_step", self.optimization_step)
+ if not isinstance(self.optimization_step, int):
+ raise ValueError("Invalid optimization_step")
+
+ self.update_after_step = state_dict.get("update_after_step", self.update_after_step)
+ if not isinstance(self.update_after_step, int):
+ raise ValueError("Invalid update_after_step")
+
+ self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup)
+ if not isinstance(self.use_ema_warmup, bool):
+ raise ValueError("Invalid use_ema_warmup")
+
+ self.inv_gamma = state_dict.get("inv_gamma", self.inv_gamma)
+ if not isinstance(self.inv_gamma, (float, int)):
+ raise ValueError("Invalid inv_gamma")
+
+ self.power = state_dict.get("power", self.power)
+ if not isinstance(self.power, (float, int)):
+ raise ValueError("Invalid power")
+
+ shadow_params = state_dict.get("shadow_params", None)
+ if shadow_params is not None:
+ self.shadow_params = shadow_params
+ if not isinstance(self.shadow_params, list):
+ raise ValueError("shadow_params must be a list")
+ if not all(isinstance(p, torch.Tensor) for p in self.shadow_params):
+ raise ValueError("shadow_params must all be Tensors")
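+
+
+# Typical loop (sketch, names illustrative): keep shadow weights via step(),
+# then swap them in for validation and restore afterwards.
+#
+#   ema = EMAModel(model.parameters(), decay=0.9999)
+#   for batch in loader:
+#       ...  # optimizer step
+#       ema.step(model.parameters())
+#   ema.store(model.parameters()); ema.copy_to(model.parameters())
+#   ...  # run validation with EMA weights
+#   ema.restore(model.parameters())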
diff --git a/opensora/utils/ema_utils.py b/opensora/utils/ema_utils.py
new file mode 100644
index 000000000..7e8d2649c
--- /dev/null
+++ b/opensora/utils/ema_utils.py
@@ -0,0 +1,60 @@
+
+from peft import get_peft_model, PeftModel
+import os
+from copy import deepcopy
+import torch
+import json
+from diffusers.training_utils import EMAModel as diffuser_EMAModel
+
+
+
+class EMAModel(diffuser_EMAModel):
+ def __init__(self, parameters, **kwargs):
+ self.lora_config = kwargs.pop('lora_config', None)
+ super().__init__(parameters, **kwargs)
+
+ @classmethod
+ def from_pretrained(cls, path, model_cls, lora_config, model_base) -> "EMAModel":
+ # 1. load model
+ if lora_config is not None:
+ # 1.1 load origin model
+ model_base = model_cls.from_pretrained(model_base) # model_base
+ config = model_base.config
+ # 1.2 convert to lora model automatically and load lora weight
+ model = PeftModel.from_pretrained(model_base, path) # lora_origin_model
+ else:
+ model = model_cls.from_pretrained(path)
+ config = model.config
+        # 2. wrap the whole model's parameters with EMA
+ ema_model = cls(model.parameters(), model_cls=model_cls, model_config=config, lora_config=lora_config)
+        # 3. load the ema config (e.g. decay)
+ with open(os.path.join(path, 'ema_config.json'), 'r') as f:
+ state_dict = json.load(f)
+ ema_model.load_state_dict(state_dict)
+ return ema_model
+
+ def save_pretrained(self, path):
+ if self.model_cls is None:
+ raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.")
+
+ if self.model_config is None:
+ raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.")
+ # 1. init a base model randomly
+ model = self.model_cls.from_config(self.model_config)
+ # 1.1 convert lora_model
+ if self.lora_config is not None:
+ model = get_peft_model(model, self.lora_config)
+ # 2. ema_model copy to model
+ self.copy_to(model.parameters())
+ # 3. save weight
+ if self.lora_config is not None:
+ model.save_pretrained(path) # only lora weight
+ merge_model = model.merge_and_unload()
+ merge_model.save_pretrained(path) # merge_model weight
+ else:
+            model.save_pretrained(path)  # full model weight
+ # 4. save ema_config, e.g decay...
+ state_dict = self.state_dict() # lora_model weight
+ state_dict.pop("shadow_params", None)
+ with open(os.path.join(path, 'ema_config.json'), 'w') as f:
+ json.dump(state_dict, f, indent=2)
\ No newline at end of file
diff --git a/opensora/utils/freeinit_utils.py b/opensora/utils/freeinit_utils.py
new file mode 100644
index 000000000..a55cc2940
--- /dev/null
+++ b/opensora/utils/freeinit_utils.py
@@ -0,0 +1,140 @@
+import torch
+import torch.fft as fft
+import math
+
+
+def freq_mix_3d(x, noise, LPF):
+ """
+ Noise reinitialization.
+
+ Args:
+ x: diffused latent
+ noise: randomly sampled noise
+ LPF: low pass filter
+ """
+ # FFT
+ x_freq = fft.fftn(x, dim=(-3, -2, -1))
+ x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
+ noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
+ noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
+
+ # frequency mix
+ HPF = 1 - LPF
+ x_freq_low = x_freq * LPF
+ noise_freq_high = noise_freq * HPF
+ x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain
+
+ # IFFT
+ x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
+ x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
+
+ return x_mixed
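+
+# Usage sketch (illustrative): mix the low-frequency content of a diffused
+# latent with fresh high-frequency noise, as in FreeInit.
+#
+#   LPF = get_freq_filter(latents.shape, latents.device, "butterworth",
+#                         n=4, d_s=0.25, d_t=0.25)
+#   latents = freq_mix_3d(latents.float(), torch.randn_like(latents).float(), LPF)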
+
+
+def get_freq_filter(shape, device, filter_type, n, d_s, d_t):
+ """
+ Form the frequency filter for noise reinitialization.
+
+ Args:
+ shape: shape of latent (B, C, T, H, W)
+ filter_type: type of the freq filter
+ n: (only for butterworth) order of the filter, larger n ~ ideal, smaller n ~ gaussian
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
+ """
+ if filter_type == "gaussian":
+ return gaussian_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device)
+ elif filter_type == "ideal":
+ return ideal_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device)
+ elif filter_type == "box":
+ return box_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device)
+ elif filter_type == "butterworth":
+ return butterworth_low_pass_filter(shape=shape, n=n, d_s=d_s, d_t=d_t).to(device)
+ else:
+ raise NotImplementedError
+
+def gaussian_low_pass_filter(shape, d_s=0.25, d_t=0.25):
+ """
+ Compute the gaussian low pass filter mask.
+
+ Args:
+ shape: shape of the filter (volume)
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
+ """
+ T, H, W = shape[-3], shape[-2], shape[-1]
+ mask = torch.zeros(shape)
+ if d_s==0 or d_t==0:
+ return mask
+ for t in range(T):
+ for h in range(H):
+ for w in range(W):
+ d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2)
+ mask[..., t,h,w] = math.exp(-1/(2*d_s**2) * d_square)
+ return mask
+
+
+def butterworth_low_pass_filter(shape, n=4, d_s=0.25, d_t=0.25):
+ """
+ Compute the butterworth low pass filter mask.
+
+ Args:
+ shape: shape of the filter (volume)
+ n: order of the filter, larger n ~ ideal, smaller n ~ gaussian
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
+ """
+ T, H, W = shape[-3], shape[-2], shape[-1]
+ mask = torch.zeros(shape)
+ if d_s==0 or d_t==0:
+ return mask
+ for t in range(T):
+ for h in range(H):
+ for w in range(W):
+ d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2)
+ mask[..., t,h,w] = 1 / (1 + (d_square / d_s**2)**n)
+ return mask
+
+
+def ideal_low_pass_filter(shape, d_s=0.25, d_t=0.25):
+ """
+ Compute the ideal low pass filter mask.
+
+ Args:
+ shape: shape of the filter (volume)
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
+ """
+ T, H, W = shape[-3], shape[-2], shape[-1]
+ mask = torch.zeros(shape)
+ if d_s==0 or d_t==0:
+ return mask
+ for t in range(T):
+ for h in range(H):
+ for w in range(W):
+ d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2)
+                mask[..., t,h,w] = 1 if d_square <= d_s**2 else 0
+ return mask
+
+
+def box_low_pass_filter(shape, d_s=0.25, d_t=0.25):
+ """
+ Compute the ideal low pass filter mask (approximated version).
+
+ Args:
+ shape: shape of the filter (volume)
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
+ """
+ T, H, W = shape[-3], shape[-2], shape[-1]
+ mask = torch.zeros(shape)
+ if d_s==0 or d_t==0:
+ return mask
+
+ threshold_s = round(int(H // 2) * d_s)
+ threshold_t = round(T // 2 * d_t)
+
+    cframe, crow, ccol = T // 2, H // 2, W // 2
+ mask[..., cframe - threshold_t:cframe + threshold_t, crow - threshold_s:crow + threshold_s, ccol - threshold_s:ccol + threshold_s] = 1.0
+
+ return mask
\ No newline at end of file
diff --git a/opensora/utils/gpu_mem_track.py b/opensora/utils/gpu_mem_track.py
new file mode 100644
index 000000000..7b7fb04de
--- /dev/null
+++ b/opensora/utils/gpu_mem_track.py
@@ -0,0 +1,113 @@
+import gc
+import datetime
+import inspect
+
+import torch
+import numpy as np
+
+dtype_memory_size_dict = {
+ torch.float64: 64/8,
+ torch.double: 64/8,
+ torch.float32: 32/8,
+ torch.float: 32/8,
+ torch.float16: 16/8,
+ torch.half: 16/8,
+ torch.int64: 64/8,
+ torch.long: 64/8,
+ torch.int32: 32/8,
+ torch.int: 32/8,
+ torch.int16: 16/8,
+    torch.short: 16/8,
+ torch.uint8: 8/8,
+ torch.int8: 8/8,
+}
+# compatibility of torch1.0
+if getattr(torch, "bfloat16", None) is not None:
+ dtype_memory_size_dict[torch.bfloat16] = 16/8
+if getattr(torch, "bool", None) is not None:
+    dtype_memory_size_dict[torch.bool] = 8/8  # PyTorch uses 1 byte per bool, see https://github.com/pytorch/pytorch/issues/41571
+
+def get_mem_space(x):
+ try:
+ ret = dtype_memory_size_dict[x]
+    except KeyError:
+        print(f"dtype {x} is not supported!")
+        ret = 0
+    return ret
+
+class MemTracker(object):
+ """
+ Class used to track pytorch memory usage
+ Arguments:
+ detail(bool, default True): whether the function shows the detail gpu memory usage
+ path(str): where to save log file
+ verbose(bool, default False): whether show the trivial exception
+ device(int): GPU number, default is 0
+ """
+ def __init__(self, detail=True, path='', verbose=False, device=0):
+ self.print_detail = detail
+ self.last_tensor_sizes = set()
+ self.gpu_profile_fn = path + f'{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_track.txt'
+ self.verbose = verbose
+ self.begin = True
+ self.device = device
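+
+    # Usage sketch: instantiate once, then call track() around suspect code;
+    # per-call deltas are appended to the log file.
+    #
+    #   tracker = MemTracker(path='./logs/')
+    #   tracker.track(); out = model(x); tracker.track()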
+
+ def get_tensors(self):
+ for obj in gc.get_objects():
+ try:
+ if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
+ tensor = obj
+ else:
+ continue
+ if tensor.is_cuda:
+ yield tensor
+ except Exception as e:
+ if self.verbose:
+                    print('A trivial exception occurred: {}'.format(e))
+
+ def get_tensor_usage(self):
+ sizes = [np.prod(np.array(tensor.size())) * get_mem_space(tensor.dtype) for tensor in self.get_tensors()]
+ return np.sum(sizes) / 1024**2
+
+ def get_allocate_usage(self):
+ return torch.cuda.memory_allocated() / 1024**2
+
+ def clear_cache(self):
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def print_all_gpu_tensor(self, file=None):
+ for x in self.get_tensors():
+ print(x.size(), x.dtype, np.prod(np.array(x.size()))*get_mem_space(x.dtype)/1024**2, file=file)
+
+ def track(self):
+ """
+ Track the GPU memory usage
+ """
+ frameinfo = inspect.stack()[1]
+ where_str = frameinfo.filename + ' line ' + str(frameinfo.lineno) + ': ' + frameinfo.function
+
+ with open(self.gpu_profile_fn, 'a+') as f:
+
+ if self.begin:
+ f.write(f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |"
+ f" Total Tensor Used Memory:{self.get_tensor_usage():<7.1f}Mb"
+ f" Total Allocated Memory:{self.get_allocate_usage():<7.1f}Mb\n\n")
+ self.begin = False
+
+ if self.print_detail is True:
+ ts_list = [(tensor.size(), tensor.dtype) for tensor in self.get_tensors()]
+ new_tensor_sizes = {(type(x),
+ tuple(x.size()),
+ ts_list.count((x.size(), x.dtype)),
+ np.prod(np.array(x.size()))*get_mem_space(x.dtype)/1024**2,
+ x.dtype) for x in self.get_tensors()}
+ for t, s, n, m, data_type in new_tensor_sizes - self.last_tensor_sizes:
+ f.write(f'+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} | {data_type}\n')
+ for t, s, n, m, data_type in self.last_tensor_sizes - new_tensor_sizes:
+ f.write(f'- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} | {data_type}\n')
+
+ self.last_tensor_sizes = new_tensor_sizes
+
+ f.write(f"\nAt {where_str:<50}"
+ f" Total Tensor Used Memory:{self.get_tensor_usage():<7.1f}Mb"
+ f" Total Allocated Memory:{self.get_allocate_usage():<7.1f}Mb\n\n")
\ No newline at end of file
diff --git a/opensora/utils/lora_utils.py b/opensora/utils/lora_utils.py
new file mode 100644
index 000000000..e4159e9fd
--- /dev/null
+++ b/opensora/utils/lora_utils.py
@@ -0,0 +1,44 @@
+
+import logging
+
+def maybe_zero_3(param, ignore_status=False, name=None):
+ from deepspeed import zero
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+ if hasattr(param, "ds_id"):
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
+ if not ignore_status:
+ logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
+ with zero.GatheredParameters([param]):
+ param = param.data.detach().cpu().clone()
+ else:
+ param = param.detach().cpu().clone()
+ return param
+
+# Borrowed from peft.utils.get_peft_model_state_dict
+def get_peft_state_maybe_zero_3(named_params, bias):
+ if bias == "none":
+ to_return = {k: t for k, t in named_params if "lora_" in k}
+ elif bias == "all":
+ to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+ elif bias == "lora_only":
+ to_return = {}
+ maybe_lora_bias = {}
+ lora_bias_names = set()
+ for k, t in named_params:
+ if "lora_" in k:
+ to_return[k] = t
+ bias_name = k.split("lora_")[0] + "bias"
+ lora_bias_names.add(bias_name)
+ elif "bias" in k:
+ maybe_lora_bias[k] = t
+        for k, t in maybe_lora_bias.items():
+            if k in lora_bias_names:
+                to_return[k] = t
+ else:
+ raise NotImplementedError
+ to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
+ return to_return
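+
+# Usage sketch (illustrative; `model`, `global_rank` and `output_dir` are hypothetical
+# names): extract only the LoRA weights, gathering any ZeRO-3 partitioned params first.
+#
+#   state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), bias='none')
+#   if global_rank == 0:
+#       torch.save(state_dict, os.path.join(output_dir, 'lora_weights.bin'))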
diff --git a/opensora/utils/mask_utils.py b/opensora/utils/mask_utils.py
new file mode 100644
index 000000000..e61467ed1
--- /dev/null
+++ b/opensora/utils/mask_utils.py
@@ -0,0 +1,548 @@
+import random
+import torch
+from opensora.dataset.transform import ToTensorVideo
+from opensora.models.causalvideovae import ae_norm
+from torchvision import transforms
+import os
+from PIL import Image
+import numpy as np
+import cv2
+import av
+from enum import Enum, auto
+from ultralytics import YOLO
+
+
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+except ImportError:  # fall back to CUDA when the Ascend NPU stack is unavailable
+    torch_npu = None
+
+
+os.environ['YOLO_VERBOSE'] = 'False'
+
+class MaskType(Enum):
+ Semantic_mask = auto()
+ bbox_mask = auto()
+ background_mask = auto()
+ fixed_mask = auto()
+ Semantic_expansion_mask = auto()
+ fixed_bg_mask = auto()
+ t2iv_mask = auto()
+ i2v_mask = auto()
+ transition_mask = auto()
+ v2v_mask = auto()
+ clear_mask = auto()
+ random_mask = auto()
+
+
+class single_info:
+    def __init__(self, id, label, shape) -> None:
+        self.id = id
+        self.label = label
+        self.shape = shape
+        self.frame_indexes = []
+        self.infos = []
+
+    def update(self, frame_index, box, conf, mask):
+        self.frame_indexes.append(frame_index)
+        info = dict(
+            box=box,
+            conf=conf,
+            mask=mask,
+        )
+        self.infos.append(info)
+
+    def return_dict(self):
+        return dict(
+            id=self.id,
+            label=self.label,
+            frame_size=self.shape,
+            frame_index_list=self.frame_indexes,
+            infos_list=self.infos,
+        )
+
+def save_videos_from_pil(pil_images, path, fps=24):
+    """
+    pil_images: list of frames (ndarray or PIL.Image); ndarrays are converted via ndarray_to_pil.
+    """
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+
+    image = ndarray_to_pil(pil_images[0])
+    width, height = image.size
+
+ codec = "libx264"
+ container = av.open(path, "w")
+ stream = container.add_stream(codec, rate=fps)
+
+ stream.width = width
+ stream.height = height
+
+ for pil_image in pil_images:
+        pil_image = ndarray_to_pil(pil_image)
+ av_frame = av.VideoFrame.from_image(pil_image)
+ container.mux(stream.encode(av_frame))
+ container.mux(stream.encode())
+ container.close()
+
+def read_frames(video_tensor) -> list:
+    """
+    Read a video tensor and return a list of ndarray frames of shape (H, W, C).
+    """
+ T = video_tensor.shape[0]
+ frames = []
+ for t in range(T):
+ frame_tensor = video_tensor[t]
+ frame_tensor = frame_tensor.cpu().numpy()
+ frame_tensor = np.transpose(frame_tensor, (1, 2, 0))
+ frames.append(frame_tensor)
+ return frames
+
+
+def get_masked_image(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
+ mask = mask.astype(bool)
+ if len(mask.shape) == 2:
+ mask = np.expand_dims(mask, axis=2)
+ masked_img = image * (1-mask)
+ return masked_img # shape: [H,W,C]; range: [0, 255]
+
+def get_bbox_image(image: np.ndarray, bbox, obj_id):
+    # Zero out the region inside the (xyxy) bounding box
+    bbox_image = image.copy()
+    bbox_image[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 0
+    return bbox_image
+
+
+
+def select_bg_from_video(bg_masks, video):
+ new_container = []
+    for index, frame in enumerate(video):
+        mask = bg_masks[index]
+ masked_frame = get_masked_image(frame, mask)
+ new_container.append(masked_frame)
+ return new_container
+
+def get_random_box(image_tensor):
+
+    H, W, C = image_tensor.shape
+
+    box_min_size = int(min(H, W) / 2)
+    box_max_size = int(max(H, W) / 2)
+
+    # Randomly choose the box width and height
+    box_width = random.randint(box_min_size, min(box_max_size, W))
+    box_height = random.randint(box_min_size, min(box_max_size, H))
+
+    # Randomly choose the top-left corner of the box
+    x_start = random.randint(0, W - box_width)
+    y_start = random.randint(0, H - box_height)
+
+    box = (x_start, y_start, x_start + box_width, y_start + box_height)
+
+    return box
+
+def combine_masks_and_get_background(masks):
+    """
+    Merge all object masks and invert the result to get the background mask.
+    """
+ combined_mask = np.any(masks, axis=0)
+ background_mask = np.logical_not(combined_mask)
+ return background_mask
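+
+# Example (illustrative): with one binary (H, W) mask per detected object stacked
+# along axis 0, the background is True wherever no object mask is set.
+#
+#   masks = np.stack([mask_a, mask_b])             # (2, H, W); hypothetical masks
+#   bg = combine_masks_and_get_background(masks)   # (H, W) boolean array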
+
+def parser_results_for_ids(results, frame_size=None):
+ id_record = []
+ single_info_ins = {}
+ background_masks = []
+ for frame_index, result in enumerate(results):
+ result = result[0]
+ if frame_index == 0 and frame_size is None:
+ frame_size = result.boxes.orig_shape
+ id = result.boxes.id
+
+        # If no object was detected in this frame, treat the whole frame as background
+ if id is None:
+ background_masks.append(np.ones((frame_size)) * 255)
+ continue
+
+ id = id.tolist()
+        cls = result.boxes.cls.tolist()  # label for each id
+        conf = result.boxes.conf.tolist()  # prediction confidence for each id
+        box_n = result.boxes.xyxy.tolist()  # bounding box for each id
+        mask = result.masks.data.cpu().detach().numpy()  # mask for each id
+ background_masks.append(combine_masks_and_get_background(mask))
+
+ for i, iden in enumerate(id):
+ if iden not in id_record:
+ id_record.append(iden)
+ single_info_ins[iden] = single_info(iden, cls[i], frame_size)
+ single_info_ins[iden].update(frame_index, box_n[i], conf[i],mask[i])
+ return_list = []
+ for _, value in single_info_ins.items():
+ return_list.append(value.return_dict())
+ return return_list, background_masks
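+
+# Illustrative use (variable names are hypothetical): the first return value is a
+# list of per-object dicts keyed id/label/frame_size/frame_index_list/infos_list;
+# the second is one background mask per frame.
+#
+#   objects, bg_masks = parser_results_for_ids(results)
+#   first_box = objects[0]['infos_list'][0]['box']   # xyxy box of the first object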
+
+
+def get_mask(video_tensor, mask_type, yolo_model):
+
+    video = read_frames(video_tensor=video_tensor)
+
+    T, C, H, W = video_tensor.shape
+
+    tracker = yolo_model
+
+    results = []
+
+    for t in range(T):
+        frame_tensor = video_tensor[t]  # current frame, (C, H, W)
+        frame_tensor = frame_tensor.data.cpu().numpy()  # to numpy
+        frame_tensor = np.transpose(frame_tensor, (1, 2, 0))  # (H, W, C)
+
+        # Run tracking inference on this frame
+        result = tracker.track(frame_tensor, save=False, retina_masks=True, agnostic_nms=True, half=True, verbose=False, nms=False)
+
+        # Collect the per-frame result
+        results.append(result)
+
+
+ parser_res, background_masks = parser_results_for_ids(results)
+
+ select_index = -1
+ object_info = []
+ frame_indexes = []
+ infos = []
+
+
+    # Randomly select one tracked object
+    if len(parser_res) != 0:
+ select_index = random.randint(0, len(parser_res)-1)
+ object_info = parser_res[select_index]
+ frame_indexes = object_info['frame_index_list']
+ infos = object_info['infos_list']
+ else:
+ mask_type = MaskType.fixed_mask
+
+ if mask_type == MaskType.Semantic_mask or mask_type == MaskType.Semantic_expansion_mask:
+ Semantic_masks = []
+ mask_container = []
+ info_index = 0
+ for index, frame in enumerate(video):
+ if index in frame_indexes:
+
+ mask = infos[info_index]['mask']
+ info_index = info_index + 1
+
+                if mask_type == MaskType.Semantic_expansion_mask:
+                    kernel = np.ones((5, 5), np.uint8)
+                    # Dilate the mask to expand it
+                    mask = cv2.dilate(mask, kernel, iterations=1)
+
+                # Count foreground pixels in the mask
+                foreground_pixels = np.sum(mask)
+
+                # Total number of pixels in the image
+                total_pixels = mask.size  # equivalently image.shape[0] * image.shape[1]
+
+                # Foreground-to-total ratio
+                ratio = foreground_pixels / total_pixels
+
+ if ratio < 0.2:
+ if random.random() < 0.5:
+ mask_type = MaskType.fixed_mask
+ break
+
+ masked_frame = get_masked_image(frame, mask)
+ mask_container.append(masked_frame)
+ Semantic_masks.append(mask)
+ else:
+ mask_container.append(np.zeros_like(frame))
+ Semantic_masks.append(np.zeros_like(frame)[:,:,0])
+ if mask_type == MaskType.Semantic_mask or mask_type == MaskType.Semantic_expansion_mask:
+ return mask_container, Semantic_masks
+
+ if mask_type == MaskType.bbox_mask:
+ boxes_masks = []
+ box_container = []
+
+ info_index = 0
+
+ for index, frame in enumerate(video):
+ if index in frame_indexes:
+ bbox = infos[info_index]['box']
+ info_index = info_index + 1
+
+ boxed_frame = get_bbox_image(frame, bbox, object_info['id'])
+ box_container.append(boxed_frame)
+ boxmask = np.zeros_like(frame)[:,:,0]
+ boxmask[int(bbox[1]): int(bbox[3]), int(bbox[0]): int(bbox[2])] = 1
+ boxes_masks.append(boxmask)
+ else:
+ box_container.append(frame)
+ boxes_masks.append(np.zeros_like(frame)[:,:,0])
+
+ return box_container, boxes_masks
+
+ if mask_type == MaskType.background_mask:
+ bg_container = select_bg_from_video(background_masks, video)
+ return bg_container, background_masks
+
+ if mask_type == MaskType.fixed_mask or mask_type == MaskType.fixed_bg_mask:
+ fixed_mask_container = []
+ fixed_masks = []
+
+ box = get_random_box(video[0])
+ for index , frame in enumerate(video):
+ if mask_type == MaskType.fixed_mask:
+ boxed_frame = frame.copy()
+ boxed_frame[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 0
+ fixed_mask_container.append(boxed_frame)
+
+ fixed_mask = np.zeros_like(frame)[:,:,0]
+ fixed_mask[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 1
+ fixed_masks.append(fixed_mask)
+ if mask_type == MaskType.fixed_bg_mask:
+ boxed_frame = frame.copy()
+
+ fixed_mask = np.zeros_like(frame)[:,:,0]
+ fixed_mask[int(box[1]): int(box[3]), int(box[0]): int(box[2])] = 1
+ fixed_mask = 1 - fixed_mask
+ fixed_masks.append(fixed_mask)
+
+ boxed_bg_frame = get_masked_image(boxed_frame, fixed_mask)
+ fixed_mask_container.append(boxed_bg_frame)
+
+ return fixed_mask_container, fixed_masks
+
+
+
+def video_to_tensor(video_path):
+    # Open the video file
+    cap = cv2.VideoCapture(video_path)
+
+    frames = []
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Convert BGR to RGB
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        # Convert to a tensor and append to the frame list
+        frame_tensor = torch.from_numpy(frame).permute(2, 0, 1)  # (H, W, C) -> (C, H, W)
+        frames.append(frame_tensor)
+
+    cap.release()
+
+    # Stack all frames into a single 4D tensor
+    video_tensor = torch.stack(frames)  # (T, C, H, W)
+
+    return video_tensor
+
+
+def ndarray_to_pil(image: np.ndarray) -> Image.Image:
+ if np.max(image) <= 1.1:
+ image = image * 255
+ image = image.astype(np.uint8)
+ return Image.fromarray(image)
+
+def get_random_type():
+    # Candidate mask types
+    mask_type = [MaskType.Semantic_mask, MaskType.bbox_mask, MaskType.background_mask, MaskType.fixed_mask, MaskType.Semantic_expansion_mask, MaskType.fixed_bg_mask]
+
+    # Probability weights (should sum to 1, or be any positive proportions)
+    weights = [0.3, 0.2, 0.1, 0.1, 0.2, 0.1]  # e.g. Semantic_mask is drawn with probability 0.3
+
+    # Randomly pick one of the six mask types according to the weights
+    chosen_number = random.choices(mask_type, weights=weights)[0]
+
+ return chosen_number
+
+def get_mask_tensor(video_tensor, mask_type, yolomodel):
+
+    masked_video_container, masks_container = get_mask(video_tensor, mask_type, yolomodel)
+
+    masked_frames = [torch.from_numpy(frame.transpose(2, 0, 1)) for frame in masked_video_container]
+    masked_video = torch.stack(masked_frames)
+
+    masks = [torch.from_numpy(mask.reshape(1, mask.shape[0], mask.shape[1])) for mask in masks_container]
+    mask = torch.stack(masks)
+
+    return masked_video, mask
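+
+# Usage sketch (illustrative; the YOLO checkpoint name is an assumption -- any
+# ultralytics segmentation + tracking checkpoint works the same way):
+#
+#   model = YOLO('yolov8x-seg.pt')
+#   video = video_to_tensor('clip.mp4')    # (T, C, H, W) uint8 RGB frames
+#   masked_video, mask = get_mask_tensor(video, MaskType.Semantic_mask, model)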
+
+
+
+class MaskProcessor:
+    def __init__(self, args, YOLOmodel):
+ # ratio
+ # transform
+ self.num_frames = args.num_frames
+ if self.num_frames != 1:
+ # inpaint
+ self.t2v_ratio = args.t2v_ratio
+ self.i2v_ratio = args.i2v_ratio
+ self.transition_ratio = args.transition_ratio
+ self.v2v_ratio = args.v2v_ratio
+ self.clear_video_ratio = args.clear_video_ratio
+ self.Semantic_ratio = args.Semantic_ratio
+ self.bbox_ratio = args.bbox_ratio
+ self.background_ratio = args.background_ratio
+ self.fixed_ratio = args.fixed_ratio
+ self.Semantic_expansion_ratio = args.Semantic_expansion_ratio
+ self.fixed_bg_ratio = args.fixed_bg_ratio
+            assert self.t2v_ratio + self.i2v_ratio + self.transition_ratio + self.v2v_ratio + self.clear_video_ratio + self.Semantic_ratio + self.bbox_ratio + self.background_ratio + self.fixed_ratio + self.fixed_bg_ratio + self.Semantic_expansion_ratio < 1, 'The sum of all mask-type ratios should be less than 1.'
+
+ self.min_clear_ratio = 0.0 if args.min_clear_ratio is None else args.min_clear_ratio
+ assert self.min_clear_ratio >= 0 and self.min_clear_ratio <= 1, 'min_clear_ratio should be in the range of [0, 1].'
+
+
+ self.transform = transforms.Compose([
+ ToTensorVideo(),
+ ae_norm[args.ae]
+ ])
+
+ self.init_mask_func()
+ self.init_ratio()
+
+ self.default_text_ratio = args.default_text_ratio
+
+ self.yolomodel = YOLOmodel
+
+ def init_mask_func(self):
+ # mask: ones_like (t 1 h w)
+ def t2iv(mask):
+ mask[:] = 1
+ return mask
+
+ def i2v(mask):
+ mask[0] = 0
+ return mask
+
+ def transition(mask):
+ mask[0] = 0
+ mask[-1] = 0
+ return mask
+
+ def v2v(mask):
+ end_idx = random.randint(int(mask.shape[0] * self.min_clear_ratio), mask.shape[0])
+ mask[:end_idx] = 0
+ return mask
+
+ def clear(mask):
+ mask[:] = 0
+ return mask
+
+ def random_mask(mask):
+ num_to_select = random.randint(int(mask.shape[0] * self.min_clear_ratio), mask.shape[0])
+ selected_indices = random.sample(range(mask.shape[0]), num_to_select)
+ mask[selected_indices] = 0
+ return mask
+
+ def Semantic_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.Semantic_mask,self.yolomodel)
+
+ def bbox_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.bbox_mask,self.yolomodel)
+
+ def background_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.background_mask,self.yolomodel)
+
+ def fixed_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.fixed_mask,self.yolomodel)
+
+ def Semantic_expansion_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.Semantic_expansion_mask,self.yolomodel)
+
+ def fixed_bg_mask(video_tensor):
+ return get_mask_tensor(video_tensor,MaskType.fixed_bg_mask,self.yolomodel)
+
+
+
+ self.mask_functions = {
+ MaskType.t2iv_mask: t2iv,
+ MaskType.i2v_mask: i2v,
+ MaskType.transition_mask: transition,
+ MaskType.v2v_mask: v2v,
+ MaskType.clear_mask: clear,
+ MaskType.random_mask: random_mask,
+ MaskType.Semantic_mask:Semantic_mask,
+ MaskType.bbox_mask:bbox_mask,
+ MaskType.background_mask:background_mask,
+ MaskType.fixed_mask:fixed_mask,
+ MaskType.Semantic_expansion_mask:Semantic_expansion_mask,
+ MaskType.fixed_bg_mask:fixed_bg_mask
+ }
+
+ def init_ratio(self):
+
+ self.mask_func_weights_video = {
+ MaskType.t2iv_mask: self.t2v_ratio,
+ MaskType.i2v_mask: self.i2v_ratio,
+ MaskType.transition_mask: self.transition_ratio,
+ MaskType.v2v_mask: self.v2v_ratio,
+ MaskType.clear_mask: self.clear_video_ratio,
+ MaskType.Semantic_mask:self.Semantic_ratio,
+ MaskType.bbox_mask:self.bbox_ratio,
+ MaskType.background_mask:self.background_ratio,
+ MaskType.fixed_mask:self.fixed_ratio,
+ MaskType.Semantic_expansion_mask:self.Semantic_expansion_ratio,
+ MaskType.fixed_bg_mask:self.fixed_bg_ratio,
+ MaskType.random_mask: 1 - self.t2v_ratio - self.i2v_ratio - self.transition_ratio - self.v2v_ratio - self.clear_video_ratio - self.Semantic_ratio - self.bbox_ratio - self.background_ratio - self.fixed_ratio - self.Semantic_expansion_ratio - self.fixed_bg_ratio
+
+ }
+
+ self.mask_func_weights_image = {
+ 't2iv': 0.9,
+ 'clear': 0.1
+ }
+
+ # t c h w
+ def __call__(self,pixel_values):
+ # pixel_values shape (T, C, H, W)
+ # 1 means masked, 0 means not masked
+ t, c, h, w = pixel_values.shape
+ mask = torch.ones([t, 1, h, w], device=pixel_values.device, dtype=pixel_values.dtype)
+
+ mask_func_name = random.choices(list(self.mask_func_weights_video.keys()), list(self.mask_func_weights_video.values()))[0]
+ frame_mask_list = [MaskType.t2iv_mask,MaskType.i2v_mask,MaskType.transition_mask,MaskType.v2v_mask,MaskType.clear_mask,MaskType.random_mask]
+ pos_mask_list = [MaskType.Semantic_mask,MaskType.bbox_mask,MaskType.background_mask,MaskType.fixed_mask,MaskType.Semantic_expansion_mask,MaskType.fixed_bg_mask]
+
+
+ if mask_func_name in frame_mask_list:
+ mask = self.mask_functions[mask_func_name](mask)
+ masked_pixel_values = pixel_values * (mask < 0.5)
+
+ if mask_func_name in pos_mask_list:
+            masked_pixel_values, mask = self.mask_functions[mask_func_name](pixel_values)
+
+ pixel_values = self.transform(pixel_values)
+ masked_pixel_values = self.transform(masked_pixel_values.to(torch.uint8))
+
+ return masked_pixel_values, pixel_values, mask
+
+
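+# Usage sketch (illustrative; `args` is the training-script namespace carrying the
+# ratio fields read in __init__, and the YOLO checkpoint name is an assumption):
+#
+#   processor = MaskProcessor(args, YOLO('yolov8x-seg.pt'))
+#   masked, pixels, mask = processor(video_uint8)   # video_uint8: (T, C, H, W)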
diff --git a/opensora/utils/parallel_states.py b/opensora/utils/parallel_states.py
new file mode 100644
index 000000000..def564f71
--- /dev/null
+++ b/opensora/utils/parallel_states.py
@@ -0,0 +1,44 @@
+import torch
+import torch.distributed as dist
+import os
+
+class COMM_INFO:
+ def __init__(self):
+ self.group = None
+ self.world_size = 0
+ self.rank = -1
+
+nccl_info = COMM_INFO()
+_SEQUENCE_PARALLEL_STATE = False
+def initialize_sequence_parallel_state(sequence_parallel_size):
+ global _SEQUENCE_PARALLEL_STATE
+ if sequence_parallel_size > 1:
+ _SEQUENCE_PARALLEL_STATE = True
+ initialize_sequence_parallel_group(sequence_parallel_size)
+
+def set_sequence_parallel_state(state):
+ global _SEQUENCE_PARALLEL_STATE
+ _SEQUENCE_PARALLEL_STATE = state
+
+def get_sequence_parallel_state():
+ return _SEQUENCE_PARALLEL_STATE
+
+def initialize_sequence_parallel_group(sequence_parallel_size):
+ """Initialize the sequence parallel group."""
+ rank = int(os.getenv('RANK', '0'))
+ world_size = int(os.getenv("WORLD_SIZE", '1'))
+ assert world_size % sequence_parallel_size == 0, "world_size must be divisible by sequence_parallel_size"
+    # On Ascend NPU this group is backed by HCCL; the nccl_info name is kept for both backends.
+ nccl_info.world_size = sequence_parallel_size
+ nccl_info.rank = rank
+ num_sequence_parallel_groups: int = world_size // sequence_parallel_size
+ for i in range(num_sequence_parallel_groups):
+ ranks = range(i * sequence_parallel_size, (i + 1) * sequence_parallel_size)
+ group = dist.new_group(ranks)
+ if rank in ranks:
+ nccl_info.group = group
+
+
+def destroy_sequence_parallel_group():
+ """Destroy the sequence parallel group."""
+ dist.destroy_process_group()
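+
+# Usage sketch (illustrative): call once after torch.distributed has been initialized,
+# typically at the top of the training entry point.
+#
+#   dist.init_process_group(backend='nccl')   # or 'hccl' on Ascend NPU
+#   initialize_sequence_parallel_state(sequence_parallel_size=2)
+#   if get_sequence_parallel_state():
+#       pass  # shard the sequence dimension across nccl_info.group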
diff --git a/opensora/utils/utils.py b/opensora/utils/utils.py
index ba94b6aa1..601b79d7c 100644
--- a/opensora/utils/utils.py
+++ b/opensora/utils/utils.py
@@ -15,6 +15,7 @@
from torch import inf
from PIL import Image
from typing import Union, Iterable
+import collections
from collections import OrderedDict
from torch.utils.tensorboard import SummaryWriter
@@ -32,6 +33,11 @@
_tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]]
+def to_2tuple(x):
+    # Return x unchanged if it is already an iterable (e.g. a tuple or list); otherwise duplicate it.
+    if isinstance(x, collections.abc.Iterable):
+        return x
+    return (x, x)
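+# Example: to_2tuple(224) -> (224, 224); to_2tuple((224, 128)) is returned unchanged.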
+
def find_model(model_name):
"""
Finds a pre-trained Latte model, downloading it if necessary. Alternatively, loads a model from a local path.
@@ -297,8 +303,9 @@ def save_video_grid(video, nrow=None):
video_grid = torch.zeros((t, (padding + h) * nrow + padding,
(padding + w) * ncol + padding, c), dtype=torch.uint8)
- print(video_grid.shape)
+ print('save_video_grid video_grid.shape', video_grid.shape)
for i in range(b):
+ print('i', i)
r = i // ncol
c = i % ncol
start_r = (padding + h) * r
@@ -336,10 +343,9 @@ def collect_env():
bad_punct_regex = re.compile(r'['+'#®•©™&@·º½¾¿¡§~'+'\)'+'\('+'\]'+'\['+'\}'+'\{'+'\|'+'\\'+'\/'+'\*' + r']{1,}') # noqa
-def text_preprocessing(text):
+def text_preprocessing(text, support_Chinese=True):
# The exact text cleaning as was in the training stage:
- text = clean_caption(text)
- text = clean_caption(text)
+ text = clean_caption(text, support_Chinese=support_Chinese)
return text
def basic_clean(text):
@@ -347,7 +353,7 @@ def basic_clean(text):
text = html.unescape(html.unescape(text))
return text.strip()
-def clean_caption(caption):
+def clean_caption(caption, support_Chinese=True):
caption = str(caption)
caption = ul.unquote_plus(caption)
caption = caption.strip().lower()
@@ -378,7 +384,8 @@ def clean_caption(caption):
caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
- caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
+ if not support_Chinese:
+ caption = re.sub(r'[\u4e00-\u9fff]+', '', caption) # Chinese
#######################################################
# все виды тире / all types of dash --> "-"
@@ -455,5 +462,10 @@ def clean_caption(caption):
return caption.strip()
-
+if __name__ == '__main__':
+
+ a = "امرأة مسنة بشعر أبيض ووجه مليء بالتجاعيد تجلس داخل سيارة قديمة الطراز، تنظر من خلال النافذة الجانبية بتعبير تأملي أو حزين قليلاً."
+ print(a)
+ print(text_preprocessing(a))
diff --git a/pyproject.toml b/pyproject.toml
index 5a3624fd0..431e0c32e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "opensora"
-version = "1.0.0"
+version = "1.2.0"
description = "Reproduce OpenAI's Sora."
readme = "README.md"
requires-python = ">=3.8"
@@ -14,14 +14,14 @@ classifiers = [
]
dependencies = [
"torch==2.1.0", "torchvision==0.16.0",
- "transformers==4.39.1", "accelerate==0.28.0",
+ "transformers==4.40.1", "accelerate==0.29.3", "tokenizers==0.19.1", "diffusers==0.28.0",
"albumentations==1.4.0", "av==11.0.0", "decord==0.6.0", "einops==0.7.0", "fastapi==0.110.0",
"gdown==5.1.0", "h5py==3.10.0", "idna==3.6", 'imageio==2.34.0', "matplotlib==3.7.5", "numpy==1.24.4",
"omegaconf==2.1.1", "opencv-python==4.9.0.80", "opencv-python-headless==4.9.0.80", "pandas==2.0.3", "pillow==10.2.0",
"pydub==0.25.1", "pytorch-lightning==2.2.1", "pytorchvideo==0.1.5", "PyYAML==6.0.1", "regex==2023.12.25",
"requests==2.31.0", "scikit-learn==1.3.2", "scipy==1.10.1", "six==1.16.0", "test-tube==0.7.5",
"timm==0.9.16", "torchdiffeq==0.2.3", "torchmetrics==1.3.2", "tqdm==4.66.2", "urllib3==2.2.1", "uvicorn==0.27.1",
- "diffusers==0.27.2", "scikit-video==1.1.11", "imageio-ffmpeg==0.4.9", "sentencepiece==0.1.99", "beautifulsoup4==4.12.3",
+ "scikit-video==1.1.11", "imageio-ffmpeg==0.4.9", "sentencepiece==0.1.99", "beautifulsoup4==4.12.3",
"ftfy==6.1.3", "moviepy==1.0.3", "wandb==0.16.3", "tensorboard==2.14.0", "pydantic==2.6.4", "gradio==4.0.0", "xformers==0.0.22.post7"
]
diff --git a/scripts/accelerate_configs/deepspeed_zero2_config.yaml b/scripts/accelerate_configs/deepspeed_zero2_config.yaml
index 43ec1e80c..b15390113 100644
--- a/scripts/accelerate_configs/deepspeed_zero2_config.yaml
+++ b/scripts/accelerate_configs/deepspeed_zero2_config.yaml
@@ -5,9 +5,9 @@ deepspeed_config:
fsdp_config: {}
machine_rank: 0
main_process_ip: null
-main_process_port: 29501
+main_process_port: 29503
main_training_function: main
num_machines: 1
num_processes: 8
gpu_ids: 0,1,2,3,4,5,6,7
-use_cpu: false
\ No newline at end of file
+use_cpu: false
diff --git a/scripts/accelerate_configs/hostfile b/scripts/accelerate_configs/hostfile
index a22693099..901cae636 100644
--- a/scripts/accelerate_configs/hostfile
+++ b/scripts/accelerate_configs/hostfile
@@ -1,2 +1,4 @@
-gpu55 slots=8 # your server name and GPU in total
-gpu117 slots=8
+node032 slots=8
+node034 slots=8
+node035 slots=8
+node037 slots=8
diff --git a/scripts/accelerate_configs/hostfile1 b/scripts/accelerate_configs/hostfile1
new file mode 100644
index 000000000..2940421cf
--- /dev/null
+++ b/scripts/accelerate_configs/hostfile1
@@ -0,0 +1,2 @@
+node014 slots=8
+node026 slots=8
diff --git a/scripts/accelerate_configs/hostfile2 b/scripts/accelerate_configs/hostfile2
new file mode 100644
index 000000000..6af909735
--- /dev/null
+++ b/scripts/accelerate_configs/hostfile2
@@ -0,0 +1,2 @@
+node109 slots=8
+node114 slots=8
diff --git a/scripts/accelerate_configs/hostfile40 b/scripts/accelerate_configs/hostfile40
new file mode 100644
index 000000000..d098b9db3
--- /dev/null
+++ b/scripts/accelerate_configs/hostfile40
@@ -0,0 +1,40 @@
+node30 slots=8
+node004 slots=8
+node005 slots=8
+node006 slots=8
+node007 slots=8
+node008 slots=8
+node010 slots=8
+node011 slots=8
+node012 slots=8
+node013 slots=8
+node019 slots=8
+node026 slots=8
+node027 slots=8
+node028 slots=8
+node029 slots=8
+node031 slots=8
+node032 slots=8
+node033 slots=8
+node034 slots=8
+node035 slots=8
+node037 slots=8
+node056 slots=8
+node059 slots=8
+node060 slots=8
+node061 slots=8
+node062 slots=8
+node064 slots=8
+node065 slots=8
+node072 slots=8
+node074 slots=8
+node076 slots=8
+node077 slots=8
+node068 slots=8
+node063 slots=8
+node075 slots=8
+node003 slots=8
+node058 slots=8
+node109 slots=8
+node066 slots=8
+node114 slots=8
\ No newline at end of file
diff --git a/scripts/accelerate_configs/multi_node_example.yaml b/scripts/accelerate_configs/multi_node_example.yaml
index 4e3cac79c..a8a65f012 100644
--- a/scripts/accelerate_configs/multi_node_example.yaml
+++ b/scripts/accelerate_configs/multi_node_example.yaml
@@ -2,14 +2,14 @@ compute_environment: LOCAL_MACHINE
distributed_type: DEEPSPEED
deepspeed_config:
deepspeed_config_file: scripts/accelerate_configs/zero2.json
- deepspeed_hostfile: /remote-home1/yeyang/Open-Sora-Plan/scripts/accelerate_configs/hostfile
+ deepspeed_hostfile: scripts/accelerate_configs/hostfile
fsdp_config: {}
machine_rank: 0
-main_process_ip: 10.10.10.55
-main_process_port: 29501
+main_process_ip: 100.64.24.32
+main_process_port: 29522
main_training_function: main
-num_machines: 2
-num_processes: 16
+num_machines: 32
+num_processes: 256
rdzv_backend: static
same_network: true
tpu_env: []
diff --git a/scripts/accelerate_configs/multi_node_example1.yaml b/scripts/accelerate_configs/multi_node_example1.yaml
new file mode 100644
index 000000000..2288c4f30
--- /dev/null
+++ b/scripts/accelerate_configs/multi_node_example1.yaml
@@ -0,0 +1,18 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+ deepspeed_config_file: scripts/accelerate_configs/zero2.json
+ deepspeed_hostfile: scripts/accelerate_configs/hostfile1
+fsdp_config: {}
+machine_rank: 0
+main_process_ip: 100.64.24.14
+main_process_port: 29522
+main_training_function: main
+num_machines: 2
+num_processes: 16
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/scripts/accelerate_configs/multi_node_example2.yaml b/scripts/accelerate_configs/multi_node_example2.yaml
new file mode 100644
index 000000000..4dd53c8ea
--- /dev/null
+++ b/scripts/accelerate_configs/multi_node_example2.yaml
@@ -0,0 +1,18 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+ deepspeed_config_file: scripts/accelerate_configs/zero2.json
+ deepspeed_hostfile: scripts/accelerate_configs/hostfile2
+fsdp_config: {}
+machine_rank: 0
+main_process_ip: 100.64.24.109
+main_process_port: 29502
+main_training_function: main
+num_machines: 4
+num_processes: 32
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/scripts/accelerate_configs/multi_node_example_by_ddp.yaml b/scripts/accelerate_configs/multi_node_example_by_ddp.yaml
new file mode 100644
index 000000000..fa16c29a6
--- /dev/null
+++ b/scripts/accelerate_configs/multi_node_example_by_ddp.yaml
@@ -0,0 +1,13 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+fsdp_config: {}
+main_process_port: 29501
+main_training_function: main
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
\ No newline at end of file
diff --git a/scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml b/scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml
new file mode 100644
index 000000000..9c26fd8c5
--- /dev/null
+++ b/scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml
@@ -0,0 +1,16 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+ deepspeed_config_file: scripts/accelerate_configs/zero2_npu.json
+ deepspeed_multinode_launcher: standard
+fsdp_config: {}
+main_process_port: 29501
+main_training_function: main
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
\ No newline at end of file
diff --git a/scripts/accelerate_configs/zero2.json b/scripts/accelerate_configs/zero2.json
index bc26a0a5f..8dcdd154d 100644
--- a/scripts/accelerate_configs/zero2.json
+++ b/scripts/accelerate_configs/zero2.json
@@ -1,6 +1,6 @@
{
"fp16": {
- "enabled": "auto",
+ "enabled": false,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
@@ -10,6 +10,8 @@
"bf16": {
"enabled": "auto"
},
+ "communication_data_type": "fp32",
+ "gradient_clipping": 1.0,
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
diff --git a/scripts/accelerate_configs/zero2_npu.json b/scripts/accelerate_configs/zero2_npu.json
new file mode 100644
index 000000000..6f1ecdf2c
--- /dev/null
+++ b/scripts/accelerate_configs/zero2_npu.json
@@ -0,0 +1,25 @@
+{
+ "fp16": {
+ "enabled": false,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": "auto"
+ },
+ "communication_data_type": "fp32",
+ "gradient_clipping": 1.0,
+ "train_micro_batch_size_per_gpu": "auto",
+ "train_batch_size": "auto",
+ "gradient_accumulation_steps": "auto",
+ "zero_optimization": {
+ "stage": 2,
+ "overlap_comm": true,
+ "allgather_bucket_size": 536870912,
+ "contiguous_gradients": true,
+ "reduce_bucket_size": 536870912
+ }
+}
\ No newline at end of file
diff --git a/scripts/causalvae/eval.sh b/scripts/causalvae/eval.sh
index ed8bd3eda..0b7b909cc 100644
--- a/scripts/causalvae/eval.sh
+++ b/scripts/causalvae/eval.sh
@@ -1,12 +1,21 @@
-python opensora/eval/eval_common_metric.py \
- --batch_size 2 \
- --real_video_dir ..//test_eval/release/origin \
- --generated_video_dir ../test_eval/release \
- --device cuda \
- --sample_fps 10 \
- --crop_size 256 \
- --resolution 256 \
- --num_frames 17 \
- --sample_rate 1 \
- --subset_size 100 \
- --metric ssim
\ No newline at end of file
+# REAL_DATASET_DIR=/remote-home1/dataset/OpenMMLab___Kinetics-400/raw/Kinetics-400/videos_val/
+REAL_DATASET_DIR=../dataset/webvid/videos
+EXP_NAME=decoder
+SAMPLE_RATE=3
+NUM_FRAMES=33
+RESOLUTION=256
+SUBSET_SIZE=50
+METRIC=ssim
+
+python opensora/models/causalvideovae/eval/eval_common_metric.py \
+ --batch_size 1 \
+ --real_video_dir ${REAL_DATASET_DIR} \
+ --generated_video_dir /remote-home1/lzj/dataset/gen/${EXP_NAME}_sr${SAMPLE_RATE}_nf${NUM_FRAMES}_res${RESOLUTION}_subset${SUBSET_SIZE} \
+ --device cuda:0 \
+ --sample_fps 3 \
+ --sample_rate ${SAMPLE_RATE} \
+ --num_frames ${NUM_FRAMES} \
+ --resolution ${RESOLUTION} \
+ --subset_size ${SUBSET_SIZE} \
+ --crop_size ${RESOLUTION} \
+ --metric ${METRIC}
\ No newline at end of file
diff --git a/scripts/causalvae/gen_video.sh b/scripts/causalvae/gen_video.sh
deleted file mode 100644
index 2f340d62e..000000000
--- a/scripts/causalvae/gen_video.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-python examples/rec_video_vae.py \
- --batch_size 1 \
- --real_video_dir ../test_eval/eyes_test \
- --generated_video_dir ../test_eval/eyes_gen \
- --device cuda \
- --sample_fps 10 \
- --sample_rate 1 \
- --num_frames 17 \
- --resolution 512 \
- --crop_size 512 \
- --num_workers 8 \
- --ckpt results/pretrained_488 \
- --enable_tiling
\ No newline at end of file
diff --git a/scripts/causalvae/prepare_eval.sh b/scripts/causalvae/prepare_eval.sh
new file mode 100644
index 000000000..8ce0b5ba8
--- /dev/null
+++ b/scripts/causalvae/prepare_eval.sh
@@ -0,0 +1,24 @@
+export CUDA_VISIBLE_DEVICES=0
+REAL_DATASET_DIR=valid/
+EXP_NAME=decoder
+SAMPLE_RATE=1
+NUM_FRAMES=33
+RESOLUTION=512
+SUBSET_SIZE=1
+CKPT=/storage/lcm/Causal-Video-VAE/results/488dim8
+
+python opensora/models/causalvideovae/sample/rec_video_vae.py \
+ --batch_size 1 \
+ --real_video_dir ${REAL_DATASET_DIR} \
+ --generated_video_dir valid_gen/${EXP_NAME}_sr${SAMPLE_RATE}_nf${NUM_FRAMES}_res${RESOLUTION}_subset${SUBSET_SIZE} \
+ --device cuda \
+ --sample_fps 24 \
+ --sample_rate ${SAMPLE_RATE} \
+ --num_frames ${NUM_FRAMES} \
+ --resolution ${RESOLUTION} \
+ --subset_size ${SUBSET_SIZE} \
+ --crop_size ${RESOLUTION} \
+ --num_workers 8 \
+ --ckpt ${CKPT} \
+ --output_origin \
+ --enable_tiling
\ No newline at end of file
diff --git a/scripts/causalvae/rec_image.sh b/scripts/causalvae/rec_image.sh
new file mode 100644
index 000000000..3c65c54b3
--- /dev/null
+++ b/scripts/causalvae/rec_image.sh
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=0 python examples/rec_image.py \
+ --ae_path "/storage/dataset/test140k" \
+ --image_path /storage/dataset/image/anytext3m/ocr_data/Art/images/gt_5544.jpg \
+ --rec_path rec.jpg \
+ --device cuda \
+ --short_size 512 \
+ --ae CausalVAEModel_4x8x8 \
+ --enable_tiling
\ No newline at end of file
diff --git a/scripts/causalvae/rec_video.sh b/scripts/causalvae/rec_video.sh
new file mode 100644
index 000000000..a317fc3bb
--- /dev/null
+++ b/scripts/causalvae/rec_video.sh
@@ -0,0 +1,12 @@
+CUDA_VISIBLE_DEVICES=1 python examples/rec_video.py \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --video_path /storage/dataset/mixkit-train-passing-the-rails-4462_resize1080p.mp4 \
+ --rec_path rec_tile.mp4 \
+ --device cpu \
+ --sample_rate 1 \
+ --num_frames 17 \
+ --height 256 \
+ --width 256 \
+ --fps 30 \
+ --enable_tiling
\ No newline at end of file
diff --git a/scripts/causalvae/reconstruction.sh b/scripts/causalvae/reconstruction.sh
deleted file mode 100644
index 3e89d2c28..000000000
--- a/scripts/causalvae/reconstruction.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-CUDA_VISIBLE_DEVICES=0 python examples/rec_imvi_vae.py \
- --ae_path "../../Open-Sora-Plan/CausalVAEModel_4x8x8/" \
- --video_path origin_tiger.mp4 \
- --rec_path rec.mp4 \
- --device cuda \
- --sample_rate 1 \
- --num_frames 65 \
- --resolution 512 \
- --crop_size 512 \
- --ae CausalVAEModel_4x8x8 \
- --enable_tiling
\ No newline at end of file
diff --git a/scripts/causalvae/release.json b/scripts/causalvae/release.json
deleted file mode 100644
index f572f3207..000000000
--- a/scripts/causalvae/release.json
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "_class_name": "CausalVAEModel",
- "_diffusers_version": "0.27.2",
- "attn_resolutions": [],
- "decoder_attention": "AttnBlock3D",
- "decoder_conv_in": "CausalConv3d",
- "decoder_conv_out": "CausalConv3d",
- "decoder_mid_resnet": "ResnetBlock3D",
- "decoder_resnet_blocks": [
- "ResnetBlock3D",
- "ResnetBlock3D",
- "ResnetBlock3D",
- "ResnetBlock3D"
- ],
- "decoder_spatial_upsample": [
- "",
- "SpatialUpsample2x",
- "SpatialUpsample2x",
- "SpatialUpsample2x"
- ],
- "decoder_temporal_upsample": [
- "",
- "",
- "TimeUpsample2x",
- "TimeUpsample2x"
- ],
- "double_z": true,
- "dropout": 0.0,
- "embed_dim": 4,
- "encoder_attention": "AttnBlock3D",
- "encoder_conv_in": "CausalConv3d",
- "encoder_conv_out": "CausalConv3d",
- "encoder_mid_resnet": "ResnetBlock3D",
- "encoder_resnet_blocks": [
- "ResnetBlock3D",
- "ResnetBlock3D",
- "ResnetBlock3D",
- "ResnetBlock3D"
- ],
- "encoder_spatial_downsample": [
- "SpatialDownsample2x",
- "SpatialDownsample2x",
- "SpatialDownsample2x",
- ""
- ],
- "encoder_temporal_downsample": [
- "TimeDownsample2x",
- "TimeDownsample2x",
- "",
- ""
- ],
- "hidden_size": 128,
- "hidden_size_mult": [
- 1,
- 2,
- 4,
- 4
- ],
- "loss_params": {
- "disc_start": 2001,
- "disc_weight": 0.5,
- "kl_weight": 1e-06,
- "logvar_init": 0.0
- },
- "loss_type": "opensora.models.ae.videobase.losses.LPIPSWithDiscriminator",
- "lr": 1e-05,
- "num_res_blocks": 2,
- "q_conv": "CausalConv3d",
- "resolution": 256,
- "z_channels": 4
-}
diff --git a/scripts/causalvae/train.sh b/scripts/causalvae/train.sh
index 1a49bf50f..d97ba141f 100644
--- a/scripts/causalvae/train.sh
+++ b/scripts/causalvae/train.sh
@@ -1,15 +1,45 @@
-python opensora/train/train_causalvae.py \
- --exp_name "exp_name" \
+# export https_proxy=http://127.0.0.1:8998
+# export http_proxy=http://127.0.0.1:8998
+unset https_proxy
+unset http_proxy
+export WANDB_PROJECT=causalvideovae_2.0
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+
+EXP_NAME=latent4_resume_2
+
+torchrun \
+ --nnodes=1 --nproc_per_node=8 \
+ --master_addr=localhost \
+ --master_port=29600 \
+ scripts/causalvae/train_causalvae.py \
+ --exp_name ${EXP_NAME} \
+ --model_config scripts/config.json \
+ --video_path /storage/dataset/pexels/ \
+ --eval_video_path /storage/dataset/pixabay_v2/folder_01 \
+ --resolution 320 \
+ --epochs 1000 \
+ --num_frames 25 \
--batch_size 1 \
- --precision bf16 \
- --max_steps 40000 \
- --save_steps 100 \
- --output_dir results/causalvae_ \
- --video_path /remote-home1/dataset/data_split_tt \
- --video_num_frames 17 \
- --resolution 256 \
- --sample_rate 1 \
- --n_nodes 1 \
- --devices 1 \
- --num_workers 8 \
- --load_from_checkpoint ./results/pretrained_488/
\ No newline at end of file
+ --disc_start 2000 \
+ --save_ckpt_step 2000 \
+ --eval_steps 500 \
+ --eval_batch_size 1 \
+ --eval_num_frames 33 \
+ --eval_sample_rate 3 \
+ --eval_subset_size 50 \
+ --eval_lpips \
+ --ema \
+ --ema_decay 0.999 \
+ --perceptual_weight 1.0 \
+ --loss_type l1 \
+ --disc_cls causalvideovae.model.losses.LPIPSWithDiscriminator3D \
+ --not_resume_training_process \
+ --resume_from_checkpoint /storage/lcm/Causal-Video-VAE/results/latent8_3d-lr1.00e-05-bs1-rs320-sr2-fr25/checkpoint-14000.ckpt
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_image.sh b/scripts/text_condition/gpu/sample_image.sh
new file mode 100644
index 000000000..22ff081f4
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_image.sh
@@ -0,0 +1,21 @@
+export TASK_QUEUE_ENABLE=0
+
+CUDA_VISIBLE_DEVICES=1 python opensora/sample/sample_t2v.py \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x8_vae8_anyx320x320_lr5e-5_snr5_noioff0.02_ema9999_sparse1d4_newdit_l_122_rope_mt5xxl_mj/checkpoint-218000/model_ema \
+ --version 65x512x512 \
+ --num_frames 1 \
+ --height 320 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt examples/prompt_list_2.txt \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "WFVAE_DISTILL_FORMAL" \
+ --save_img_path "sample_image_vae8_newdit_218k_320x320_test_sam" \
+ --fps 24 \
+ --guidance_scale 4.5 \
+ --num_sampling_steps 28 \
+ --enable_tiling \
+ --max_sequence_length 512 \
+ --sample_method DPMSolverMultistep \
+ --model_type sparsedit
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_image_sparse.sh b/scripts/text_condition/gpu/sample_image_sparse.sh
new file mode 100644
index 000000000..9c2bb39b8
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_image_sparse.sh
@@ -0,0 +1,19 @@
+CUDA_VISIBLE_DEVICES=3 python opensora/sample/sample_t2v.py \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs1x8x16_1x720p_lr1e-4_snr5_noioff0.02_ema9999_sparse1d16_dit_l_122_rope_mt5xxl_mj_fromhw480p/checkpoint-4000/model_ema \
+ --version 65x512x512 \
+ --num_frames 1 \
+ --height 720 \
+ --width 1280 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt examples/prompt_list_1.txt \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --save_img_path "sample_images_sparse1d16_dit_9k" \
+ --fps 24 \
+ --guidance_scale 4.5 \
+ --num_sampling_steps 20 \
+ --enable_tiling \
+ --max_sequence_length 512 \
+ --sample_method DPMSolverMultistep \
+ --model_type dit
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_inpaint_ddp.sh b/scripts/text_condition/gpu/sample_inpaint_ddp.sh
new file mode 100644
index 000000000..c8aaadb1d
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_inpaint_ddp.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nnodes=1 --nproc_per_node 2 --master_port 29502 \
+ -m opensora.sample.sample_inpaint_ddp \
+ --model_path /storage/gyy/hw/Open-Sora-Plan/test_sparse_inpaint/checkpoint-84000/model \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/gyy/hw/Open-Sora-Plan/test_prompt.txt \
+ --conditional_images_path /storage/gyy/hw/Open-Sora-Plan/test_cond_imgs_path.txt \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "./sample_test_inpaint_sparse" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 1234
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_t2v.sh b/scripts/text_condition/gpu/sample_t2v.sh
new file mode 100644
index 000000000..47b425897
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v.sh
@@ -0,0 +1,21 @@
+CUDA_VISIBLE_DEVICES=5 python opensora/sample/sample_t2v.py \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr5e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-20000/model \
+ --version 65x512x512 \
+ --num_frames 1 \
+ --height 320 \
+ --width 160 \
+ --cache_dir "cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt examples/prompt_list_0.txt \
+ --ae CausalVAEModel_D8_4x8x8 \
+ --ae_path "/storage/dataset/new488dim8/last" \
+ --save_img_path "./sample_video_dit_vae8_newmodel_anyx93x320x160_img" \
+ --fps 24 \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 50 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 1.0
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp.sh b/scripts/text_condition/gpu/sample_t2v_ddp.sh
new file mode 100644
index 000000000..7d1fac080
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp.sh
@@ -0,0 +1,26 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=3,4,6,7 torchrun --nnodes=1 --nproc_per_node 4 --master_port 29502 \
+ -m opensora.sample.sample_t2v_ddp \
+ --model_path bs32x8x2_anyx93x320x320_fps16_lr2e-6_snr5_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m_vpred_zerosnr/checkpoint-49000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/new/7.19anyres/Open-Sora-Plan/examples/sora_refine.txt \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "./sample_video_dit_vae8_newmodel_anyx93x160x320_sora_m0.9_49k_s100_wf_xformer_refine_vpred_zerosnr_trailing_lowlr" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 1234 \
+ --num_samples_per_prompt 1 \
+ --prediction_type "v_prediction" \
+ --rescale_betas_zero_snr
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp1.sh b/scripts/text_condition/gpu/sample_t2v_ddp1.sh
new file mode 100644
index 000000000..2ebff1f17
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp1.sh
@@ -0,0 +1,26 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nnodes=1 --nproc_per_node 8 --master_port 29502 \
+ -m opensora.sample.sample_t2v_ddp \
+ --model_path bs32x8x2_anyx93x320x320_fps16_lr1e-5_snr5_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m_vpred_zerosnr/checkpoint-3000/model \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/new/7.19anyres/Open-Sora-Plan/examples/sora_refine.txt \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "./sample_video_dit_vae8_newmodel_anyx93x160x320_sora_m0.9_3k_s100_wf_xformer_refine_vpred_zerosnr_trailing" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 1234 \
+ --num_samples_per_prompt 1 \
+ --prediction_type "v_prediction" \
+ --rescale_betas_zero_snr
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy0.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy0.sh
new file mode 100644
index 000000000..e820621d4
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy0.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=0 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29502 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/overall_consistency.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/overall_consistency_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy1.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy1.sh
new file mode 100644
index 000000000..b15f935a2
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy1.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=1 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29503 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/multiple_objects.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/multiple_objects_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy2.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy2.sh
new file mode 100644
index 000000000..7fc4e499c
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy2.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=2 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29504 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/object_class.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/object_class_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy3.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy3.sh
new file mode 100644
index 000000000..f1f5c7975
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy3.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=3 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29505 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/human_action.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/human_action_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy4.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy4.sh
new file mode 100644
index 000000000..868da3592
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy4.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=4 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29506 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/appearance_style.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/appearance_style_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy5.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy5.sh
new file mode 100644
index 000000000..eecb258b1
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy5.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=5 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29507 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/spatial_relationship.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/spatial_relationship_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy6.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy6.sh
new file mode 100644
index 000000000..e0096dcda
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy6.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=6 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29508 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/temporal_style.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/temporal_style_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_ddp_yy7.sh b/scripts/text_condition/gpu/sample_t2v_ddp_yy7.sh
new file mode 100644
index 000000000..b986b3fc3
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_ddp_yy7.sh
@@ -0,0 +1,23 @@
+# --save_img_path "./sample_video_26500ema_61x480p_k333_s122_cfg5.0_step50" \
+# CUDA_VISIBLE_DEVICES=7 python opensora/sample/sample_t2v.py \
+CUDA_VISIBLE_DEVICES=7 torchrun --nnodes=1 --nproc_per_node 1 --master_port 29509 \
+ -m opensora.sample.sample_t2v_ddp_vbench_gpt \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-415000/model_ema \
+ --version 65x512x512 \
+ --num_frames 93 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt /storage/ongoing/refine_model/prompts_per_dimension/csv_gpt/scene.csv \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/node-user/scene_gpt" \
+ --fps 18 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "sparsedit" \
+ --motion_score 0.9 \
+ --seed 42 43 44 45 46
diff --git a/scripts/text_condition/gpu/sample_t2v_sp.sh b/scripts/text_condition/gpu/sample_t2v_sp.sh
new file mode 100644
index 000000000..2ffc0ee5c
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_t2v_sp.sh
@@ -0,0 +1,22 @@
+
+torchrun --nnodes=1 --nproc_per_node 8 --master_port 29503 \
+ -m opensora.sample.sample_t2v_sp \
+ --model_path /storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs8x8x1_29x480p_lr1e-5_snr5_noioff0.02_ema999_sparse1d4_dit_l_122_rope_t5xxl_movie_panda \
+ --version 65x512x512 \
+ --num_frames 29 \
+ --height 480 \
+ --width 640 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt examples/prompt_list_2.txt \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --save_img_path "./sample_video_test" \
+ --fps 24 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "dit"
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_v2v.sh b/scripts/text_condition/gpu/sample_v2v.sh
new file mode 100644
index 000000000..1a5d37a21
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_v2v.sh
@@ -0,0 +1,33 @@
+export MASTER_ADDR='localhost'
+export TOKENIZERS_PARALLELISM=false
+MODEL_PATH=/storage/gyy/hw/Open-Sora-Plan/runs/inpaint_only_480p_f93_bs4x8x1_lr1e-5_snrgamma5_0_noiseoffset0_02_ema0_999/checkpoint-14400/model_ema
+# export HF_DATASETS_OFFLINE=1
+# export TRANSFORMERS_OFFLINE=1
+
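+# Video-to-video sampler: pairs the frozen base transformer with a separately
+# trained VIPNet (--pretrained_vipnet_path) and a DINOv2 image encoder;
+# --validation_dir is assumed to hold the conditioning inputs.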
+torchrun --nproc_per_node=8 --master_port=29501 opensora/sample/sample_v2v.py \
+ --model_path $MODEL_PATH \
+ --model_type 'inpaint_only' \
+ --num_frames 93 \
+ --height 480 \
+ --width 640 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "/storage/dataset/test140k" \
+ --save_img_path "./samples/md" \
+ --fps 24 \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 50 \
+ --enable_tiling \
+ --max_sequence_length 512 \
+ --sample_method PNDM \
+ --validation_dir "./md_validation" \
+ --pretrained_transformer_model_path /storage/ongoing/new/Open-Sora-Plan-bak/7.14bak/bs16x8x1_93x480p_lr1e-4_snr5_ema999_opensora122_rope_mt5xxl_high_pandamovie_speed1.0/checkpoint-3500/model_ema \
+ --pretrained_vipnet_path /storage/gyy/hw/Open-Sora-Plan/runs/videoip_3d_480p_f29_bs2x16_lr1e-5_snrgamma5_0_noiseoffset0_02_dino518_ema0_999/checkpoint-14000/model \
+ --image_encoder_name vit_giant_patch14_reg4_dinov2.lvd142m \
+ --image_encoder_path /storage/cache_dir/hub/models--timm--vit_giant_patch14_reg4_dinov2.lvd142m/snapshots/a2208b21b069f6b2e45999870fcce4b7e43d1a2c/model.safetensors \
+ --max_sample_num 24 \
+ --seed 42
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/sample_video_ddp.sh b/scripts/text_condition/gpu/sample_video_ddp.sh
new file mode 100644
index 000000000..a6d82f217
--- /dev/null
+++ b/scripts/text_condition/gpu/sample_video_ddp.sh
@@ -0,0 +1,23 @@
+
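+# Template for 8-GPU DDP sampling; replace the /path/to placeholders with a
+# trained checkpoint and the matching causal video VAE before running.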
+torchrun --nnodes=1 --nproc_per_node 8 --master_port 29503 \
+ -m opensora.sample.sample_t2v_ddp \
+ --model_path /path/to/checkpoint-xxx/model_ema \
+ --num_frames 93 \
+ --height 720 \
+ --width 1280 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name google/mt5-xxl \
+ --text_prompt examples/prompt_list_0.txt \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/path/to/causalvideovae" \
+ --save_img_path "./sample_93x480p_cfg7.5_step100_eulera" \
+ --fps 24 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --max_sequence_length 512 \
+ --sample_method EulerAncestralDiscrete \
+ --model_type "dit"
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/train_inpaint_all_in_one.sh b/scripts/text_condition/gpu/train_inpaint_all_in_one.sh
new file mode 100644
index 000000000..b13bb626d
--- /dev/null
+++ b/scripts/text_condition/gpu/train_inpaint_all_in_one.sh
@@ -0,0 +1,94 @@
+# PROJECT="video_test"
+PROJECT="inpaint_3d_720p_f93_bs16x8x1_lr1e-5_snrgamma5_0_noiseoffset0_02_dino518_ema0_999"
+export WANDB_API_KEY="720d886d8c437c2142c88056a1eab8ef78d64a1f"
+export WANDB_MODE="online"
+export ENTITY="yunyangge"
+export PROJECT=$PROJECT
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+
+export PDSH_RCMD_TYPE=ssh
+
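+# Conditioning-task mixture: --i2v_ratio 0.5, --transition_ratio 0.4 and
+# --v2v_ratio 0.1 sum to 1.0, so every sample trains a conditioned task;
+# --cfg 0.1 is presumably the text-dropout rate for classifier-free guidance.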
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_inpaint_all_in_one.py \
+ --model OpenSoraInpaint-ROPE-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --image_encoder_name vit_giant_patch14_reg4_dinov2.lvd142m \
+ --image_encoder_path /storage/cache_dir/hub/models--timm--vit_giant_patch14_reg4_dinov2.lvd142m/snapshots/a2208b21b069f6b2e45999870fcce4b7e43d1a2c/model.safetensors \
+ --cache_dir "/storage/cache_dir" \
+ --dataset inpaint \
+ --model_type inpaint_only \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "/storage/dataset/test140k" \
+ --data "scripts/train_data/video_data.txt" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --use_image_num 0 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --gradient_checkpointing \
+ --max_train_steps=500000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --enable_tracker \
+ --checkpointing_steps=200 \
+ --output_dir runs/$PROJECT \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --validation_dir "validation_dir" \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 50 \
+ --ema_start_step 0 \
+ --use_ema \
+ --cfg 0.1 \
+ --i2v_ratio 0.5 \
+ --transition_ratio 0.4 \
+ --v2v_ratio 0.1 \
+ --clear_video_ratio 0.0 \
+ --default_text_ratio 0.5 \
+ --seed 42 \
+ --snr_gamma 5.0 \
+ --noise_offset 0.02 \
+ --vip_num_attention_heads 16 \
+ --ema_decay 0.999 \
+ --use_rope \
+ --speed_factor 1.5 \
+ --group_frame \
+ --pretrained_transformer_model_path "/storage/ongoing/new/Open-Sora-Plan/bs32x8x1_93x720p_lr2e-5_snr5_ema999_opensora122_rope_fp32_mt5xxl_sucai_aes5.5_speed1.5/checkpoint-400/model_ema" \
+  --pretrained_vip_adapter_path "/storage/gyy/hw/Open-Sora-Plan/runs/videoip_3d_480p_f93_bs1x16_lr1e-5_snrgamma5_0_noiseoffset0_02_dino518_ema0_999/checkpoint-3000/model"
+ # --sp_size 8 \
+ # --train_sp_batch_size 2 \
+ # --train_vip \
+ # --need_validation \
+ # --resume_from_checkpoint "latest" \
+ # --zero_terminal_snr \
+  # The base model weights are not updated during training, so they must always be loaded here.
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/train_inpaint_only.sh b/scripts/text_condition/gpu/train_inpaint_only.sh
new file mode 100644
index 000000000..4dfe8ca58
--- /dev/null
+++ b/scripts/text_condition/gpu/train_inpaint_only.sh
@@ -0,0 +1,88 @@
+# PROJECT="video_test"
+PROJECT="inpaint_only_480p_f93_bs4x8x1_lr1e-5_snrgamma5_0_noiseoffset0_02_ema0_999"
+export WANDB_API_KEY="720d886d8c437c2142c88056a1eab8ef78d64a1f"
+export WANDB_MODE="online"
+export ENTITY="yunyangge"
+export PROJECT=$PROJECT
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+
+export PDSH_RCMD_TYPE=ssh
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_inpaint_all_in_one.py \
+ --model OpenSoraInpaint-ROPE-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --image_encoder_name vit_giant_patch14_reg4_dinov2.lvd142m \
+ --image_encoder_path /storage/cache_dir/hub/models--timm--vit_giant_patch14_reg4_dinov2.lvd142m/snapshots/a2208b21b069f6b2e45999870fcce4b7e43d1a2c/model.safetensors \
+ --cache_dir "/storage/cache_dir" \
+ --dataset inpaint \
+ --model_type inpaint_only \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "/storage/dataset/test140k" \
+ --data "scripts/train_data/video_data_sucai.txt" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --use_image_num 0 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --gradient_checkpointing \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --enable_tracker \
+ --checkpointing_steps=200 \
+ --output_dir runs/$PROJECT \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --validation_dir "validation_dir" \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 50 \
+ --ema_start_step 0 \
+ --use_ema \
+ --cfg 0.05 \
+ --i2v_ratio 0.4 \
+ --transition_ratio 0.4 \
+ --v2v_ratio 0.1 \
+ --clear_video_ratio 0.0 \
+ --default_text_ratio 0.5 \
+ --seed 42 \
+ --snr_gamma 5.0 \
+ --noise_offset 0.02 \
+ --ema_decay 0.999 \
+ --use_rope \
+ --group_frame \
+  --resume_from_checkpoint "latest"
+ # --pretrained_transformer_model_path "/storage/gyy/hw/Open-Sora-Plan/runs/inpaint_only_480p_f93_bs4x8x1_lr1e-5_snrgamma5_0_noiseoffset0_02_ema0_999_old_script/checkpoint-25800/model_ema" \
+ # --pretrained_vip_adapter_path "/storage/gyy/hw/Open-Sora-Plan/pretrained_models/pretrained_vip_9000.pth"
+ # --speed_factor 1.5 \
+ # --vip_num_attention_heads 16 \
+ # --train_vip \
+ # --zero_terminal_snr \
+  # The base model weights are not updated during training, so they must always be loaded here.
\ No newline at end of file
diff --git a/scripts/text_condition/gpu/train_inpaint_sparse1d_newmodel_motion.sh b/scripts/text_condition/gpu/train_inpaint_sparse1d_newmodel_motion.sh
new file mode 100644
index 000000000..9a650d736
--- /dev/null
+++ b/scripts/text_condition/gpu/train_inpaint_sparse1d_newmodel_motion.sh
@@ -0,0 +1,83 @@
+export WANDB_KEY="720d886d8c437c2142c88056a1eab8ef78d64a1f"
+export WANDB_MODE="online"
+export ENTITY="yunyang"
+export PROJECT="test_sparse_inpaint"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=25
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export NCCL_IB_RETRY_CNT=32
+# export NCCL_ALGO=Tree
+
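+# Inpainting fine-tune initialized from a sparse-attention t2v checkpoint
+# (--pretrained_transformer_model_path); --t2v_ratio 0.1 keeps 10% plain t2v
+# steps, with the rest presumably masked-video inpainting set by --min_clear_ratio.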
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_inpaint.py \
+ --model OpenSoraInpaint-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset inpaint \
+ --data "scripts/train_data/merge_data_debug.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 320 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --use_motion \
+ --train_fps 16 \
+ --seed 1234 \
+ --group_data \
+ --t2v_ratio 0.1 \
+ --i2v_ratio 0.0 \
+ --transition_ratio 0.0 \
+ --v2v_ratio 0.0 \
+ --clear_video_ratio 0.0 \
+ --min_clear_ratio 0.5 \
+ --default_text_ratio 0.5 \
+ --pretrained_transformer_model_path "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-526000/model_ema" \
+ --output_dir="test_sparse_inpaint" > training_log_new.txt
+ # --resume_from_checkpoint="latest" \
diff --git a/scripts/text_condition/gpu/train_t2v.sh b/scripts/text_condition/gpu/train_t2v.sh
new file mode 100644
index 000000000..c719c9eb7
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v.sh
@@ -0,0 +1,73 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
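+# 29x480p t2v fine-tune initialized from an image checkpoint (--pretrained);
+# --force_resolution fixes the training resolution, and --drop_short_ratio 1.0
+# presumably drops clips shorter than --num_frames instead of padding them.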
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_panda_movie_mj.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 29 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 1.0 \
+ --force_resolution \
+ --pretrained "/storage/dataset/hw29/image/model_ema/diffusion_pytorch_model.safetensors" \
+ --output_dir="bs16x8x1_img_29x480p_lr2e-5_snr5_noioff0.02_ema9999_dit_l_122_rope_mt5xxl_movie_panda_mj"
diff --git a/scripts/text_condition/gpu/train_t2v_img.sh b/scripts/text_condition/gpu/train_t2v_img.sh
new file mode 100644
index 000000000..880f95148
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_img.sh
@@ -0,0 +1,72 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
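+# Image-only stage (--num_frames 1) at 320x320, presumably used to adapt the
+# sparse-attention DiT to the dim-8 VAE before the video stages.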
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D8_4x8x8 \
+ --ae_path "/storage/dataset/new488dim8/last" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 320 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=8 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --enable_tracker \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs16x8x8_vae8_any320x320_lr1e-4_snr5_noioff0.02_ema9999_dit_l_122_rope_mt5xxl_mj/checkpoint-150000/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --output_dir="bs16x8x8_vae8_any320x320_lr2e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_122_rope_mt5xxl_mj"
diff --git a/scripts/text_condition/gpu/train_t2v_img_lora.sh b/scripts/text_condition/gpu/train_t2v_img_lora.sh
new file mode 100644
index 000000000..ccfb4d717
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_img_lora.sh
@@ -0,0 +1,72 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
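+# LoRA fine-tune at 1x720p: --enable_lora --rank 64 trains rank-64 adapters on
+# top of the frozen --model_base weights instead of updating the full transformer.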
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_t2v_diffusers_lora.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 720 \
+ --max_width 1280 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.5 \
+ --interpolation_scale_w 2.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --enable_tracker \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --force_resolution \
+ --model_base "/storage/dataset/hw29/image/model_ema" \
+ --output_dir="bs1x8x16_1x720p_lr1e-5_snr5_noioff0.02_ema9999_lora64_dit_l_122_rope_mt5xxl_mj_fromhw480p" \
+ --enable_lora --rank 64
diff --git a/scripts/text_condition/gpu/train_t2v_img_sparse1d.sh b/scripts/text_condition/gpu/train_t2v_img_sparse1d.sh
new file mode 100644
index 000000000..ad6b7a6e2
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_img_sparse1d.sh
@@ -0,0 +1,71 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example2.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D8_4x8x8 \
+ --ae_path "/storage/dataset/new488dim8/last" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --enable_tracker \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs16x8x8_vae8_any320x320_lr2e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_122_rope_mt5xxl_mj/checkpoint-84000/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --force_resolution \
+ --sparse1d --sparse_n 4 \
+ --output_dir="bs4x8x16_vae8_1x480p_lr2e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_122_rope_mt5xxl_mj"
diff --git a/scripts/text_condition/gpu/train_t2v_img_sparse1d_newmodel.sh b/scripts/text_condition/gpu/train_t2v_img_sparse1d_newmodel.sh
new file mode 100644
index 000000000..7f5a48abf
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_img_sparse1d_newmodel.sh
@@ -0,0 +1,70 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V1-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/image_data.txt" \
+ --ae CausalVAEModel_D8_4x8x8 \
+ --ae_path "/storage/dataset/new488dim8/last" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 320 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=4 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=5e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --enable_tracker \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs16x8x8_vae8_anyx320x320_lr2e-5_snr5_noioff0.02_ema9999_sparse1d4_newdit_l_122_rope_mt5xxl_mj/checkpoint-57000/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --output_dir="bs32x8x8_vae8_anyx320x320_lr5e-5_snr5_noioff0.02_ema9999_sparse1d4_newdit_l_122_rope_mt5xxl_mj"
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d.sh b/scripts/text_condition/gpu/train_t2v_sparse1d.sh
new file mode 100644
index 000000000..091d32ccf
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d.sh
@@ -0,0 +1,77 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
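+# 93-frame run with sparse 1-D attention (--sparse1d --sparse_n 4), assumed to
+# interleave full attention with attention over every 4th token group to fit long
+# sequences; --tile_sample_min_size/_t raise the VAE tiling thresholds.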
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example1.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_panda_movie_mj.txt" \
+ --ae CausalVAEModel_D8_4x8x8 \
+ --ae_path "/storage/dataset/new488dim8/last" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 176 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=500 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --force_resolution \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs16x8x8_vae8_any320x320_lr1e-4_snr5_noioff0.02_ema9999_dit_l_122_rope_mt5xxl_mj/checkpoint-150000/model_ema/diffusion_pytorch_model.safetensors" \
+ --output_dir="bs8x8x1_93x176x320_fps16_lr2e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_sucaiaes5_fromimgsparse1d4" \
+ --tile_sample_min_size 512 \
+ --tile_sample_min_size_t 125 \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 --train_fps 16
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d16_image.sh b/scripts/text_condition/gpu/train_t2v_sparse1d16_image.sh
new file mode 100644
index 000000000..19f6e77dd
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d16_image.sh
@@ -0,0 +1,73 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 720 \
+ --max_width 1280 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.5 \
+ --interpolation_scale_w 2.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --sparse1d \
+ --sparse_n 16 \
+ --force_resolution \
+ --enable_tracker \
+ --pretrained "/storage/dataset/hw29/image/model_ema/diffusion_pytorch_model.safetensors" \
+ --output_dir="bs1x8x16_1x720p_lr1e-4_snr5_noioff0.02_ema9999_sparse1d16_dit_l_122_rope_mt5xxl_mj_fromhw480p"
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d_debug.sh b/scripts/text_condition/gpu/train_t2v_sparse1d_debug.sh
new file mode 100644
index 000000000..1bb41112c
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d_debug.sh
@@ -0,0 +1,76 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
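+# Debug config for sequence parallelism: --sp_size 8 shards each sample across all
+# 8 ranks, with --train_sp_batch_size 4 samples per parallel step (assumed semantics).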
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V1-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/video_data_debug.txt" \
+ --ae CausalVAEModel_D8_4x8x8 \
+ --ae_path "/storage/dataset/new488dim8/last" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 720 \
+ --max_width 1280 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=500 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 1.0 \
+ --force_resolution \
+ --output_dir="debug" \
+ --tile_sample_min_size 512 \
+ --tile_sample_min_size_t 33 \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 --train_fps 16 \
+ --train_sp_batch_size 4 --sp_size 8
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d_image.sh b/scripts/text_condition/gpu/train_t2v_sparse1d_image.sh
new file mode 100644
index 000000000..63a84c467
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d_image.sh
@@ -0,0 +1,73 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example1.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=8 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --sparse1d \
+ --sparse_n 4 \
+ --force_resolution \
+ --enable_tracker \
+ --pretrained "/storage/dataset/hw29/image/model_ema/diffusion_pytorch_model.safetensors" \
+ --output_dir="bs8x8x8_1x480p_lr2e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_122_rope_mt5xxl_mj_fromhw480p"
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion.sh b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion.sh
new file mode 100644
index 000000000..ca5883318
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion.sh
@@ -0,0 +1,78 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=25
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export NCCL_IB_RETRY_CNT=32
+# export NCCL_ALGO=Tree
+
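+# Adds motion conditioning: --use_motion presumably injects a per-clip motion score
+# (matched by --motion_score at sampling time); --trained_data_global_step 42000
+# skips data already consumed before this resume point.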
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V2-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 320 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr5e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-118000/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --use_motion \
+ --train_fps 16 \
+ --seed 1234 \
+ --trained_data_global_step 42000 \
+ --group_data \
+ --use_decord \
+ --output_dir="bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m" > training_log_new.txt
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_.sh b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_.sh
new file mode 100644
index 000000000..b3639f5fe
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_.sh
@@ -0,0 +1,74 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=25
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export NCCL_IB_RETRY_CNT=32
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V2-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_debug.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.5 \
+ --interpolation_scale_w 2.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --force_resolution \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr5e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-118000/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --use_motion \
+ --train_fps 16 \
+ --seed 1234 \
+ --trained_data_global_step 0 \
+ --output_dir="debug"
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_480p.sh b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_480p.sh
new file mode 100644
index 000000000..b1fc43c84
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_480p.sh
@@ -0,0 +1,76 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=25
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export NCCL_IB_RETRY_CNT=32
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V2-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 640 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=500 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/checkpoint-526000/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --use_motion \
+ --train_fps 16 \
+ --seed 1234 \
+ --trained_data_global_step 0 \
+ --group_data \
+ --use_decord \
+ --force_resolution \
+ --output_dir="bs32x8x1_anyx93x640x640_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_aesimg18m" > training_log_new.txt
diff --git a/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_v_pred.sh b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_v_pred.sh
new file mode 100644
index 000000000..f2679686d
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse1d_newmodel_motion_v_pred.sh
@@ -0,0 +1,80 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=25
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export NCCL_IB_RETRY_CNT=32
+# export NCCL_ALGO=Tree
+
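+# v-prediction variant: --prediction_type "v_prediction" with
+# --rescale_betas_zero_snr enforces a zero-terminal-SNR schedule; --noise_offset
+# is 0.0 here, presumably because zero-SNR rescaling makes the offset redundant.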
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V2-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/storage/lcm/Causal-Video-VAE/results/WFVAE_DISTILL_FORMAL" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 320 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 8 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.0 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --pretrained "/storage/ongoing/new/7.19anyres/Open-Sora-Plan/bs32x8x2_anyx93x320x320_fps16_lr2e-6_snr5_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m_vpred_zerosnr/checkpoint-45100/model_ema/diffusion_pytorch_model.safetensors" \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --use_motion \
+ --train_fps 16 \
+ --seed 1234 \
+ --trained_data_global_step 0 \
+ --group_data \
+ --use_decord \
+ --prediction_type "v_prediction" \
+ --rescale_betas_zero_snr \
+ --output_dir="bs32x8x1_anyx93x320x320_fps16_lr2e-6_snr5_ema9999_sparse1d4_dit_l_mt5xxl_40m_vpred_zerosnr"
diff --git a/scripts/text_condition/gpu/train_t2v_sparse2d.sh b/scripts/text_condition/gpu/train_t2v_sparse2d.sh
new file mode 100644
index 000000000..be229b422
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse2d.sh
@@ -0,0 +1,73 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example1.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/video_data_sucai_aes5.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 720 \
+ --max_width 1280 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=2e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=250 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 1.0 \
+ --force_resolution \
+ --pretrained "/storage/dataset/Open-Sora-Plan-v1.2.0/93x720p/diffusion_pytorch_model.safetensors" \
+ --output_dir="bs8x8x1_93x720p_lr2e-5_snr5_noioff0.02_ema9999_sparse2d4_dit_l_122_rope_mt5xxl_sucaiaes5" \
+ --tile_sample_min_size 512 \
+ --tile_sample_min_size_t 33 \
+ --sparse2d --sparse_n 4
diff --git a/scripts/text_condition/gpu/train_t2v_sparse2d_image.sh b/scripts/text_condition/gpu/train_t2v_sparse2d_image.sh
new file mode 100644
index 000000000..e8a2d6d10
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparse2d_image.sh
@@ -0,0 +1,73 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 720 \
+ --max_width 1280 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.5 \
+ --interpolation_scale_w 2.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --sparse2d \
+ --sparse_n 4 \
+ --force_resolution \
+ --enable_tracker \
+ --pretrained "/storage/dataset/hw29/image/model_ema/diffusion_pytorch_model.safetensors" \
+ --output_dir="bs1x8x16_1x720p_lr1e-4_snr5_noioff0.02_ema9999_sparse2d4_dit_l_122_rope_mt5xxl_mj_fromhw480p"
diff --git a/scripts/text_condition/gpu/train_t2v_sparsedebug_image.sh b/scripts/text_condition/gpu/train_t2v_sparsedebug_image.sh
new file mode 100644
index 000000000..fed165ba4
--- /dev/null
+++ b/scripts/text_condition/gpu/train_t2v_sparsedebug_image.sh
@@ -0,0 +1,71 @@
+export WANDB_KEY="953e958793b218efb850fa194e85843e2c3bd88b"
+# export WANDB_MODE="offline"
+export ENTITY="linbin"
+export PROJECT="bs32x8x2_61x480p_lr1e-4_snr5_noioff0.02_opensora122_rope_mt5xxl_pandamovie_aes_mo_sucai_mo_speed1.2"
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export PDSH_RCMD_TYPE=ssh
+# NCCL setting
+export GLOO_SOCKET_IFNAME=bond0
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_10:1,mlx5_11:1,mlx5_12:1,mlx5_13:1
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TC=162
+export NCCL_IB_TIMEOUT=22
+export NCCL_PXN_DISABLE=0
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_ALGO=Ring
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+# export NCCL_ALGO=Tree
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset t2v \
+ --data "scripts/train_data/merge_data_mj.txt" \
+ --ae CausalVAEModel_D4_4x8x8 \
+ --ae_path "/storage/dataset/488dim4_plus" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 720 \
+ --max_width 1280 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.5 \
+ --interpolation_scale_w 2.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --tile_overlap_factor 0.125 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --use_rope \
+ --resume_from_checkpoint="latest" \
+ --group_data \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --force_resolution \
+ --enable_tracker \
+ --pretrained "/storage/dataset/hw29/image/model_ema/diffusion_pytorch_model.safetensors" \
+ --output_dir="debug"
diff --git a/scripts/text_condition/npu/sample_image.sh b/scripts/text_condition/npu/sample_image.sh
new file mode 100644
index 000000000..e44aeaf53
--- /dev/null
+++ b/scripts/text_condition/npu/sample_image.sh
@@ -0,0 +1,18 @@
+export TASK_QUEUE_ENABLE=0
+torchrun --nproc_per_node=1 opensora/sample/sample_t2v_on_npu.py \
+ --model_path bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/model_ema \
+ --num_frames 1 \
+ --height 320 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name /home/image_data/mt5-xxl \
+ --text_prompt examples/prompt_list_0.txt \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "WFVAE_DISTILL_FORMAL" \
+ --save_img_path "./test_image" \
+ --fps 24 \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 24 \
+ --sample_method PNDM \
+ --model_type "sparsedit" \
+  --motion_score 0.9
\ No newline at end of file
diff --git a/scripts/text_condition/npu/sample_inpaint.sh b/scripts/text_condition/npu/sample_inpaint.sh
new file mode 100644
index 000000000..1037f512d
--- /dev/null
+++ b/scripts/text_condition/npu/sample_inpaint.sh
@@ -0,0 +1,25 @@
+export TASK_QUEUE_ENABLE=0
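+# NPU inpainting sampler: conditions on a source clip (--video_path) and a mask
+# clip (--mask_path), using the v-prediction/zero-SNR settings from training.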
+torchrun --nnodes=1 --nproc_per_node=1 --master_port 29502 \
+ -m opensora.sample.sample_inpaint_ddp \
+ --model_path /home/save_dir/runs/allinpaint_stage1/checkpoint-18000/model \
+ --num_frames 93 \
+ --height 320 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name /home/image_data/mt5-xxl \
+ --text_prompt /home/image_data/test_prompt.txt \
+ --conditional_images_path "/home/image_data/test_cond_imgs_path.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL" \
+ --save_img_path "/home/image_data/hxy/data/video" \
+ --fps 24 \
+ --guidance_scale 7.5 \
+ --num_sampling_steps 100 \
+ --sample_method EulerAncestralDiscrete \
+ --motion_score 0.9 \
+ --video_path "/home/image_data/hxy/data/video/000184_cut.mp4" \
+ --mask_path "/home/image_data/hxy/data/video/000001_bbox_cut.mp4" \
+ --prediction_type "v_prediction" \
+ --rescale_betas_zero_snr
\ No newline at end of file
diff --git a/scripts/text_condition/npu/sample_inpaint_bak.sh b/scripts/text_condition/npu/sample_inpaint_bak.sh
new file mode 100644
index 000000000..005be4739
--- /dev/null
+++ b/scripts/text_condition/npu/sample_inpaint_bak.sh
@@ -0,0 +1,34 @@
+WEIGHT_PATH="/home/opensora/pre_weights/"
+
+export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(pwd)"
+export MASTER_PORT=12359
+
+if [ -z "$SAMPLE_SAVE_PATH" ]; then
+ export SAMPLE_SAVE_PATH="/home/image_data/sample_videos"
+fi
+
+if [ -z "$SAMPLE_HEIGHT" ] || [ -z "$SAMPLE_WIDTH" ]; then
+    echo "You should set both envs of SAMPLE_HEIGHT and SAMPLE_WIDTH"
+    exit 1
+fi
+
+torchrun --nproc_per_node=8 opensora/sample/sample_inpaint_on_npu.py \
+ --model_path /home/image_data/checkpoints/${PROJECT_NAME} \
+ --num_frames ${NUM_FRAME} \
+ --height $SAMPLE_HEIGHT \
+ --width $SAMPLE_WIDTH \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --text_prompt "/home/image_data/checkpoints/i2v_validation_dir/prompt.txt" \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --save_img_path "${SAMPLE_SAVE_PATH}/${PROJECT_NAME}" \
+ --fps 24 \
+ --max_sequence_length 512 \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 50 \
+ --sample_method PNDM \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --model_3d \
+ --validation_dir "/home/image_data/checkpoints/i2v_validation_dir"
\ No newline at end of file
diff --git a/scripts/text_condition/npu/sample_video.sh b/scripts/text_condition/npu/sample_video.sh
new file mode 100644
index 000000000..24f03cd25
--- /dev/null
+++ b/scripts/text_condition/npu/sample_video.sh
@@ -0,0 +1,19 @@
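+# Single-NPU video sampling: 29 frames at 160x320 from the same sparse-DiT checkpoint as sample_image.sh.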
+export TASK_QUEUE_ENABLE=0
+torchrun --nproc_per_node=1 opensora/sample/sample_t2v_on_npu.py \
+ --model_path bs32x8x1_anyx93x320x320_fps16_lr1e-5_snr5_noioff0.02_ema9999_sparse1d4_dit_l_mt5xxl_alldata100m/model_ema \
+ --num_frames 29 \
+ --height 160 \
+ --width 320 \
+ --cache_dir "../cache_dir" \
+ --text_encoder_name /home/image_data/mt5-xxl \
+ --text_prompt examples/prompt_list_0.txt \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "WFVAE_DISTILL_FORMAL" \
+ --save_img_path "./test_video" \
+ --fps 24 \
+ --guidance_scale 5.0 \
+ --num_sampling_steps 24 \
+ --sample_method PNDM \
+ --model_type "sparsedit" \
+ --motion_score 0.9
\ No newline at end of file
diff --git a/scripts/text_condition/npu/train_image3d_240p.sh b/scripts/text_condition/npu/train_image3d_240p.sh
new file mode 100644
index 000000000..a85907a82
--- /dev/null
+++ b/scripts/text_condition/npu/train_image3d_240p.sh
@@ -0,0 +1,51 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+
+
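+# --video_data/--image_data take txt lists; each non-empty line appears to pair a data root with an annotation file as "<data_root>,<annotation_json>" (see scripts/train_data/*.txt).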
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 240 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --seed=10 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_image3d_240p_rope.sh b/scripts/text_condition/npu/train_image3d_240p_rope.sh
new file mode 100644
index 000000000..18e938642
--- /dev/null
+++ b/scripts/text_condition/npu/train_image3d_240p_rope.sh
@@ -0,0 +1,52 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 240 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/image3d_rope_256p_zp_umt5/checkpoint-146000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_image3d_256p.sh b/scripts/text_condition/npu/train_image3d_256p.sh
new file mode 100644
index 000000000..375bfd201
--- /dev/null
+++ b/scripts/text_condition/npu/train_image3d_256p.sh
@@ -0,0 +1,50 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 256 \
+ --max_width 256 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --seed=10 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_image3d_256p_rope.sh b/scripts/text_condition/npu/train_image3d_256p_rope.sh
new file mode 100644
index 000000000..34792dcfc
--- /dev/null
+++ b/scripts/text_condition/npu/train_image3d_256p_rope.sh
@@ -0,0 +1,51 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 256 \
+ --max_width 256 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=16 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --seed=10 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_image3d_480p.sh b/scripts/text_condition/npu/train_image3d_480p.sh
new file mode 100644
index 000000000..67e51a1dc
--- /dev/null
+++ b/scripts/text_condition/npu/train_image3d_480p.sh
@@ -0,0 +1,55 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
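+# 480p image stage: --pretrained warm-starts from a 256p EMA checkpoint; --resume_from_checkpoint="latest" only picks up an interrupted run of this stage.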
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=8 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/image3d_256p_zp_umt5_from_initial_layer_40_head_16/checkpoint-176000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest"
+
diff --git a/scripts/text_condition/npu/train_image3d_480p_rope.sh b/scripts/text_condition/npu/train_image3d_480p_rope.sh
new file mode 100644
index 000000000..138d6d011
--- /dev/null
+++ b/scripts/text_condition/npu/train_image3d_480p_rope.sh
@@ -0,0 +1,54 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=8 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/image3d_rope_256p_zp_umt5/checkpoint-146000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_imageudit_480p.sh b/scripts/text_condition/npu/train_imageudit_480p.sh
new file mode 100644
index 000000000..398c49b0f
--- /dev/null
+++ b/scripts/text_condition/npu/train_imageudit_480p.sh
@@ -0,0 +1,51 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model UDiTT2V-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/DeepFloyd/t5-v1_1-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=4 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=250 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --downsampler "k33_s22" \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_imageuditultra_480p_new.sh b/scripts/text_condition/npu/train_imageuditultra_480p_new.sh
new file mode 100644
index 000000000..26c2bd087
--- /dev/null
+++ b/scripts/text_condition/npu/train_imageuditultra_480p_new.sh
@@ -0,0 +1,54 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model UDiTUltraT2V-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames 1 \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 20 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="constant" \
+ --seed=10 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=250 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --enable_tracker \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --downsampler "k33_s22" \
+ --resume_from_checkpoint="latest" \
+ --checkpoints_total_limit 3
\ No newline at end of file
diff --git a/scripts/text_condition/npu/train_inpaint_sparse1d_newmodel_motion.sh b/scripts/text_condition/npu/train_inpaint_sparse1d_newmodel_motion.sh
new file mode 100644
index 000000000..1d45767f4
--- /dev/null
+++ b/scripts/text_condition/npu/train_inpaint_sparse1d_newmodel_motion.sh
@@ -0,0 +1,84 @@
+export WANDB_KEY="c54943d667ed1abb58ed994e739462e66bc1aee2"
+export WANDB_MODE="online"
+export ENTITY="hexianyi"
+export PROJECT=$PROJECT_NAME
+# export PROJECT='test'
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+# export HCCL_ALGO="level0:NA;level1:H-D_R"
+
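+# The *_ratio flags below appear to set the sampling probabilities of the different inpainting mask types (semantic, bbox, background, fixed, ...).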
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_inpaint.py \
+ --model OpenSoraInpaint-L/122 \
+ --text_encoder_name google/mt5-xxl \
+ --cache_dir "../../cache_dir/" \
+ --dataset inpaint \
+ --data "scripts/train_data/video_data_debug.txt" \
+ --ae WFVAEModel_D8_4x8x8 \
+ --ae_path "/home/image_data/lb/Open-Sora-Plan/WFVAE_DISTILL_FORMAL" \
+ --sample_rate 1 \
+ --num_frames 93 \
+ --max_height 320 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 0 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.0 \
+ --use_rope \
+ --skip_low_resolution \
+ --speed_factor 1.0 \
+ --ema_decay 0.9999 \
+ --drop_short_ratio 0.0 \
+ --hw_stride 32 \
+ --sparse1d --sparse_n 4 \
+ --use_motion \
+ --train_fps 16 \
+ --seed 1234 \
+ --trained_data_global_step 0 \
+ --group_data \
+ --use_decord \
+ --prediction_type "v_prediction" \
+ --rescale_betas_zero_snr \
+ --t2v_ratio 0.0 \
+ --i2v_ratio 0.0 \
+ --transition_ratio 0.0 \
+ --v2v_ratio 0.0 \
+ --Semantic_ratio 0.2 \
+ --bbox_ratio 0.2 \
+ --background_ratio 0.2 \
+ --fixed_ratio 0.1 \
+ --Semantic_expansion_ratio 0.1 \
+ --fixed_bg_ratio 0.1 \
+ --clear_video_ratio 0.0 \
+ --min_clear_ratio 0.25 \
+ --default_text_ratio 0.0 \
+ --output_dir /home/save_dir/runs/$PROJECT \
+ --pretrained_transformer_model_path "/home/image_data/captions/vpre_latest_134k/model_ema" \
+ --yolomodel_pathorname "/home/image_data/hxy/Open-Sora-Plan/opensora/dataset/yolov9c-seg.pt"
+ # --resume_from_checkpoint="/home/save_dir/runs/allinpaint_stage1/checkpoint-13000"
+ # When switching to a new data part, use resume_from_checkpoint, not pretrained.
+ # --snr_gamma 5.0 \
diff --git a/scripts/text_condition/npu/train_inpaint_video3d_nx480p.sh b/scripts/text_condition/npu/train_inpaint_video3d_nx480p.sh
new file mode 100644
index 000000000..b7f751312
--- /dev/null
+++ b/scripts/text_condition/npu/train_inpaint_video3d_nx480p.sh
@@ -0,0 +1,62 @@
+export WANDB_KEY="720d886d8c437c2142c88056a1eab8ef78d64a1f"
+# # export WANDB_MODE="offline"
+export ENTITY="yunyangge"
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+# export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
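+# i2v_ratio/transition_ratio appear to control how often samples are trained with image-to-video vs. transition (first/last frame) conditioning.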
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_inpaint.py \
+ --model OpenSoraInpaint-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset i2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 8 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-5 \
+ --lr_scheduler="constant" \
+ --seed=42 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=200 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/gyy_pretrained_f125/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest" \
+ --i2v_ratio 0.5 \
+ --transition_ratio 0.4 \
+ --default_text_ratio 0.5 \
+ --group_frame \
+ --speed_factor 1.1
diff --git a/scripts/text_condition/npu/train_video21d_nx240p.sh b/scripts/text_condition/npu/train_video21d_nx240p.sh
new file mode 100644
index 000000000..6e14288ab
--- /dev/null
+++ b/scripts/text_condition/npu/train_video21d_nx240p.sh
@@ -0,0 +1,50 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model LatteT2V-S/122 \
+ --text_encoder_name ${WEIGHT_PATH}/DeepFloyd/t5-v1_1-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 240 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 15 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --seed=10 \
+ --lr_scheduler="cosine" \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --pretrained "${WEIGHT_PATH}/t2v.pt" \
+ --cfg 0.1 \
+ --resume_from_checkpoint="latest"
\ No newline at end of file
diff --git a/scripts/text_condition/npu/train_video3d_nx240p.sh b/scripts/text_condition/npu/train_video3d_nx240p.sh
new file mode 100644
index 000000000..7a0244e9e
--- /dev/null
+++ b/scripts/text_condition/npu/train_video3d_nx240p.sh
@@ -0,0 +1,55 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 240 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 8 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="constant" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/yc_video3d_rope_240p/checkpoint-98000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest" \
+ --enable_stable_fp32
diff --git a/scripts/text_condition/npu/train_video3d_nx480p.sh b/scripts/text_condition/npu/train_video3d_nx480p.sh
new file mode 100644
index 000000000..a691fff7c
--- /dev/null
+++ b/scripts/text_condition/npu/train_video3d_nx480p.sh
@@ -0,0 +1,55 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
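+# Per-NPU batch size 1 with --gradient_accumulation_steps=8 gives an effective batch of 8 samples per device per optimizer step.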
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 8 \
+ --gradient_accumulation_steps=8 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/image3d_rope_480p_from_pretrain/checkpoint-14000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/npu/train_video3d_sp_nx240p.sh b/scripts/text_condition/npu/train_video3d_sp_nx240p.sh
new file mode 100644
index 000000000..3511f2d60
--- /dev/null
+++ b/scripts/text_condition/npu/train_video3d_sp_nx240p.sh
@@ -0,0 +1,56 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
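+# Sequence parallelism: --sp_size 8 appears to shard each sample's sequence across 8 NPUs; --train_sp_batch_size is the batch size per SP group.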
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 240 \
+ --max_width 320 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 0.5 \
+ --interpolation_scale_w 0.5 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 8 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="constant" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/yc_image3d_rope_240p_pretrain_seed_data/checkpoint-18000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest" \
+ --sp_size 8 \
+ --train_sp_batch_size 4 \
+ --enable_stable_fp32
diff --git a/scripts/text_condition/npu/train_video3d_sp_nx480p.sh b/scripts/text_condition/npu/train_video3d_sp_nx480p.sh
new file mode 100644
index 000000000..f5b7c9777
--- /dev/null
+++ b/scripts/text_condition/npu/train_video3d_sp_nx480p.sh
@@ -0,0 +1,56 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model OpenSoraT2V-ROPE-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/google/mt5-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 8 \
+ --gradient_accumulation_steps=8 \
+ --max_train_steps=1000000 \
+ --learning_rate=4e-5 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=500 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=2000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --enable_tiling \
+ --tile_overlap_factor 0.125 \
+ --use_rope \
+ --noise_offset 0.02 \
+ --pretrained "/home/image_data/checkpoints/image3d_rope_480p_from_pretrain/checkpoint-14000/model_ema/diffusion_pytorch_model.safetensors" \
+ --resume_from_checkpoint="latest" \
+ --sp_size 8 \
+ --train_sp_batch_size 1
diff --git a/scripts/text_condition/npu/train_videoudit_nx480p.sh b/scripts/text_condition/npu/train_videoudit_nx480p.sh
new file mode 100644
index 000000000..167f3cb45
--- /dev/null
+++ b/scripts/text_condition/npu/train_videoudit_nx480p.sh
@@ -0,0 +1,53 @@
+export PROJECT=$PROJECT_NAME
+WEIGHT_PATH="/home/opensora/pre_weights/"
+env
+export WANDB_MODE='offline'
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export HCCL_ALGO="level0:NA;level1:H-D_R"
+
+accelerate launch \
+ --config_file scripts/accelerate_configs/multi_node_example_by_deepspeed.yaml \
+ --machine_rank=${MACHINE_RANK} \
+ --main_process_ip=${MAIN_PROCESS_IP_VALUE} \
+ opensora/train/train_t2v_diffusers.py \
+ --model UDiTT2V-L/122 \
+ --text_encoder_name ${WEIGHT_PATH}/DeepFloyd/t5-v1_1-xxl \
+ --cache_dir "../cache_dir" \
+ --dataset t2v \
+ --ae CausalVAEModel_4x8x8 \
+ --ae_path "${WEIGHT_PATH}/test140k/" \
+ --video_data "./scripts/train_data/video_data_on_npu.txt" \
+ --image_data "./scripts/train_data/image_data_on_npu.txt" \
+ --sample_rate 1 \
+ --num_frames ${NUM_FRAME} \
+ --max_height 480 \
+ --max_width 640 \
+ --interpolation_scale_t 1.0 \
+ --interpolation_scale_h 1.0 \
+ --interpolation_scale_w 1.0 \
+ --attention_mode xformers \
+ --gradient_checkpointing \
+ --train_batch_size=1 \
+ --dataloader_num_workers 10 \
+ --gradient_accumulation_steps=1 \
+ --max_train_steps=1000000 \
+ --learning_rate=1e-4 \
+ --lr_scheduler="cosine" \
+ --seed=10 \
+ --lr_warmup_steps=0 \
+ --mixed_precision="bf16" \
+ --report_to="wandb" \
+ --checkpointing_steps=1000 \
+ --output_dir="/home/image_data/checkpoints/${PROJECT}/" \
+ --allow_tf32 \
+ --model_max_length 512 \
+ --use_image_num 0 \
+ --enable_tiling \
+ --snr_gamma 5.0 \
+ --use_ema \
+ --ema_start_step 0 \
+ --cfg 0.1 \
+ --noise_offset 0.02 \
+ --downsampler "k333_s222" \
+
+ --resume_from_checkpoint="latest"
diff --git a/scripts/text_condition/sample_image.sh b/scripts/text_condition/sample_image.sh
deleted file mode 100644
index 303dac6d7..000000000
--- a/scripts/text_condition/sample_image.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-CUDA_VISIBLE_DEVICES=0 python opensora/sample/sample_t2v.py \
- --model_path LanguageBind/Open-Sora-Plan-v1.0.0 \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --text_prompt examples/prompt_list_0.txt \
- --ae CausalVAEModel_4x8x8 \
- --version 65x512x512 \
- --save_img_path "./sample_images/prompt_list_0" \
- --fps 24 \
- --guidance_scale 7.5 \
- --num_sampling_steps 250 \
- --enable_tiling \
- --force_images
diff --git a/scripts/text_condition/sample_video.sh b/scripts/text_condition/sample_video.sh
deleted file mode 100644
index fb6b7bd10..000000000
--- a/scripts/text_condition/sample_video.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-CUDA_VISIBLE_DEVICES=6 python opensora/sample/sample_t2v.py \
- --model_path /remote-home1/yeyang/dev/Open-Sora-Plan/debug_mask_loss/checkpoint-500/model \
- --version 65x512x512 \
- --image_size 512 \
- --cache_dir "/remote-home1/yeyang/sora4.8/Open-Sora-Plan/cache_dir" \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --text_prompt examples/prompt_list_0.txt \
- --ae CausalVAEModel_4x8x8 \
- --ae_path "/remote-home1/yeyang/sora4.8/Open-Sora-Plan/CausalVAEModel_4x8x8/" \
- --save_img_path "./sample_videos" \
- --fps 24 \
- --guidance_scale 10.0 \
- --num_sampling_steps 50 \
- --enable_tiling
diff --git a/scripts/text_condition/train_imageae.sh b/scripts/text_condition/train_imageae.sh
deleted file mode 100644
index fff9c8afe..000000000
--- a/scripts/text_condition/train_imageae.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-export WANDB_KEY=""
-export ENTITY=""
-export PROJECT="t2v-f16s3-img4-128-imgvae188-bf16-gc-xformers"
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
- opensora/train/train_t2v.py \
- --model LatteT2V-XL/122 \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --dataset t2v \
- --ae stabilityai/sd-vae-ft-mse \
- --data_path /remote-home1/dataset/sharegpt4v_path_cap_.json \
- --video_folder /remote-home1/dataset/data_split \
- --sample_rate 1 \
- --num_frames 17 \
- --max_image_size 256 \
- --gradient_checkpointing \
- --attention_mode xformers \
- --train_batch_size=4 \
- --dataloader_num_workers 10 \
- --gradient_accumulation_steps=1 \
- --max_train_steps=1000000 \
- --learning_rate=2e-05 \
- --lr_scheduler="constant" \
- --lr_warmup_steps=0 \
- --mixed_precision="bf16" \
- --report_to="wandb" \
- --checkpointing_steps=500 \
- --output_dir="t2v-f17-256-img4-imagevae488-bf16-ckpt-xformers-bs4-lr2e-5-t5" \
- --allow_tf32 \
- --pretrained t2v.pt \
- --use_deepspeed \
- --model_max_length 300 \
- --use_image_num 4 \
- --use_img_from_vid
diff --git a/scripts/text_condition/train_videoae_65x1024x1024.sh b/scripts/text_condition/train_videoae_65x1024x1024.sh
deleted file mode 100644
index 6b5d4fb09..000000000
--- a/scripts/text_condition/train_videoae_65x1024x1024.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-export WANDB_KEY=""
-export ENTITY=""
-export PROJECT="1024"
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
- opensora/train/train_t2v.py \
- --model LatteT2V-XL/122 \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --cache_dir "../../Open-Sora-Plan/cache_dir" \
- --dataset t2v \
- --ae CausalVAEModel_4x8x8 \
- --ae_path "../../Open-Sora-Plan/CausalVAEModel_4x8x8/" \
- --video_data_path "../../Open-Sora-Plan/sharegpt4v_path_cap_64x512x512_mixkit.json" \
- --video_folder /remote-home1/dataset/data_split_tt \
- --image_data_path "../../../dataset/image_114054.json" \
- --image_folder "../../../dataset/picture" \
- --sample_rate 1 \
- --num_frames 65 \
- --max_image_size 1024 \
- --gradient_checkpointing \
- --attention_mode xformers \
- --train_batch_size=2 \
- --dataloader_num_workers 10 \
- --gradient_accumulation_steps=1 \
- --max_train_steps=1000000 \
- --learning_rate=2e-05 \
- --lr_scheduler="constant" \
- --lr_warmup_steps=0 \
- --mixed_precision="bf16" \
- --report_to="tensorboard" \
- --checkpointing_steps=100 \
- --output_dir="1024" \
- --allow_tf32 \
- --pretrained ../../65x512x512/diffusion_pytorch_model.safetensors \
- --use_deepspeed \
- --model_max_length 300 \
- --use_image_num 4 \
- --use_img_from_vid \
- --enable_tiling
diff --git a/scripts/text_condition/train_videoae_65x512x512.sh b/scripts/text_condition/train_videoae_65x512x512.sh
deleted file mode 100644
index 30414c65e..000000000
--- a/scripts/text_condition/train_videoae_65x512x512.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-export WANDB_KEY=""
-export ENTITY=""
-export PROJECT="512_useimg"
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
- opensora/train/train_t2v.py \
- --model LatteT2V-XL/122 \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --cache_dir "../../Open-Sora-Plan/cache_dir" \
- --dataset t2v \
- --ae CausalVAEModel_4x8x8 \
- --ae_path "../../Open-Sora-Plan/CausalVAEModel_4x8x8/" \
- --video_data_path "../../Open-Sora-Plan/sharegpt4v_path_cap_64x512x512_mixkit.json" \
- --video_folder /remote-home1/dataset/data_split_tt \
- --image_data_path "../../../dataset/image_114054.json" \
- --image_folder "../../../dataset/picture" \
- --sample_rate 1 \
- --num_frames 65 \
- --max_image_size 512 \
- --gradient_checkpointing \
- --attention_mode xformers \
- --train_batch_size=4 \
- --dataloader_num_workers 10 \
- --gradient_accumulation_steps=1 \
- --max_train_steps=1000000 \
- --learning_rate=2e-05 \
- --lr_scheduler="constant" \
- --lr_warmup_steps=0 \
- --mixed_precision="bf16" \
- --report_to="tensorboard" \
- --checkpointing_steps=500 \
- --output_dir="512_useimg" \
- --allow_tf32 \
- --pretrained t2v.pt \
- --use_deepspeed \
- --model_max_length 300 \
- --use_image_num 4 \
- --enable_tiling
diff --git a/scripts/text_condition/train_videoae_65x512x512_d64.sh b/scripts/text_condition/train_videoae_65x512x512_d64.sh
deleted file mode 100644
index d93ea8773..000000000
--- a/scripts/text_condition/train_videoae_65x512x512_d64.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-export WANDB_KEY=""
-export ENTITY=""
-export PROJECT="512_d64"
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
- opensora/train/train_t2v.py \
- --model LatteT2V-D64-XL/122 \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --cache_dir "../../Open-Sora-Plan/cache_dir" \
- --dataset t2v \
- --ae CausalVAEModel_4x8x8 \
- --ae_path "../../Open-Sora-Plan/CausalVAEModel_4x8x8/" \
- --video_data_path "../../Open-Sora-Plan/sharegpt4v_path_cap_64x512x512_mixkit.json" \
- --video_folder /remote-home1/dataset/data_split_tt \
- --image_data_path "../../../dataset/image_114054.json" \
- --image_folder "../../../dataset/picture" \
- --sample_rate 1 \
- --num_frames 65 \
- --max_image_size 512 \
- --gradient_checkpointing \
- --attention_mode xformers \
- --train_batch_size=4 \
- --dataloader_num_workers 10 \
- --gradient_accumulation_steps=1 \
- --max_train_steps=1000000 \
- --learning_rate=2e-05 \
- --lr_scheduler="constant" \
- --lr_warmup_steps=0 \
- --mixed_precision="bf16" \
- --report_to="tensorboard" \
- --checkpointing_steps=200 \
- --output_dir="512_d64" \
- --allow_tf32 \
- --pretrained t2v.pt \
- --use_deepspeed \
- --model_max_length 300 \
- --use_image_num 4 \
- --use_img_from_vid \
- --enable_tiling
diff --git a/scripts/text_condition/train_videoae_65x512x512_rope.sh b/scripts/text_condition/train_videoae_65x512x512_rope.sh
deleted file mode 100644
index dcd9a87b2..000000000
--- a/scripts/text_condition/train_videoae_65x512x512_rope.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-export WANDB_KEY=""
-export ENTITY=""
-export PROJECT="512_rope_abs"
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
- opensora/train/train_t2v.py \
- --model LatteT2V-XL/122 \
- --text_encoder_name DeepFloyd/t5-v1_1-xxl \
- --cache_dir "../../Open-Sora-Plan/cache_dir" \
- --dataset t2v \
- --ae CausalVAEModel_4x8x8 \
- --ae_path "../../Open-Sora-Plan/CausalVAEModel_4x8x8/" \
- --video_data_path "../../Open-Sora-Plan/sharegpt4v_path_cap_64x512x512_mixkit.json" \
- --video_folder /remote-home1/dataset/data_split_tt \
- --image_data_path "../../../dataset/image_114054.json" \
- --image_folder "../../../dataset/picture" \
- --sample_rate 1 \
- --num_frames 65 \
- --max_image_size 512 \
- --gradient_checkpointing \
- --attention_mode xformers \
- --train_batch_size=4 \
- --dataloader_num_workers 10 \
- --gradient_accumulation_steps=1 \
- --max_train_steps=1000000 \
- --learning_rate=2e-05 \
- --lr_scheduler="constant" \
- --lr_warmup_steps=0 \
- --mixed_precision="bf16" \
- --report_to="tensorboard" \
- --checkpointing_steps=200 \
- --output_dir="512_rope_abs" \
- --allow_tf32 \
- --pretrained 512_useimg/checkpoint-5500/model/diffusion_pytorch_model.safetensors \
- --use_deepspeed \
- --model_max_length 300 \
- --use_image_num 4 \
- --use_img_from_vid \
- --enable_tiling \
- --use_rope
diff --git a/scripts/train_data/current_data_on_npu.txt b/scripts/train_data/current_data_on_npu.txt
new file mode 100644
index 000000000..a8a14d937
--- /dev/null
+++ b/scripts/train_data/current_data_on_npu.txt
@@ -0,0 +1 @@
+/home/obs_data,/home/image_data/captions/2_16_4764771.json
diff --git a/scripts/train_data/image_data.txt b/scripts/train_data/image_data.txt
new file mode 100644
index 000000000..b19a5ff40
--- /dev/null
+++ b/scripts/train_data/image_data.txt
@@ -0,0 +1,6 @@
+/storage/dataset/image/sam,/storage/anno_jsons/sam_image_11185255_resolution.json
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_1712571_resolution.json
+/storage/dataset/image/human_images,/storage/anno_jsons/human_images_162094_resolution.json
+/storage/dataset/image/anytext3m,/storage/anno_jsons/anytext_3509994_resolution.json
+/storage/dataset/civitai/Images_civitai_v1,/storage/anno_jsons/civitai_v1_1940032_resolution.json
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.json
diff --git a/scripts/train_data/image_data_debug.txt b/scripts/train_data/image_data_debug.txt
new file mode 100644
index 000000000..0e85c866c
--- /dev/null
+++ b/scripts/train_data/image_data_debug.txt
@@ -0,0 +1 @@
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.json
\ No newline at end of file
diff --git a/scripts/train_data/image_data_notext.txt b/scripts/train_data/image_data_notext.txt
new file mode 100644
index 000000000..4e3a08d17
--- /dev/null
+++ b/scripts/train_data/image_data_notext.txt
@@ -0,0 +1,3 @@
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_4615265.json
+/storage/dataset/image/human_images,/storage/anno_jsons/human_images_162094.json
+/storage/dataset/image/sam,/storage/anno_jsons/sam_image_11185255.json
\ No newline at end of file
diff --git a/scripts/train_data/image_data_on_npu.txt b/scripts/train_data/image_data_on_npu.txt
new file mode 100644
index 000000000..a65548d77
--- /dev/null
+++ b/scripts/train_data/image_data_on_npu.txt
@@ -0,0 +1,4 @@
+/home/local_dataset_6t/image_data_obs/mj_unzip_files,/home/opensora/captions/linbin_captions/tuzhan_mj_4615265.json
+/home/local_dataset_6t/image_data_obs/mj_unzip_files,/home/opensora/captions/tuzhan_mj_4615265_cn.json
+/home/local_dataset_6t/image_data_obs/sa_unzip_files,/home/opensora/captions/linbin_captions/sam_image_11185255.json
+/home/local_dataset_6t/image_data_obs/images/,/home/opensora/captions/linbin_captions/human_images_162094.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data.txt b/scripts/train_data/merge_data.txt
new file mode 100644
index 000000000..05244e1ab
--- /dev/null
+++ b/scripts/train_data/merge_data.txt
@@ -0,0 +1,5 @@
+/storage,/storage/anno_jsons/stage2_pandamovie34m_aes4.5_sucai5m_sam3m_c1m_vidal3m_1_16_3045439_shuffle.json
+/storage,/storage/anno_jsons/stage2_pandamovie34m_aes4.5_sucai5m_sam3m_c1m_vidal3m_2_16_3045439_shuffle.json
+/storage,/storage/anno_jsons/stage2_pandamovie34m_aes4.5_sucai5m_sam3m_c1m_vidal3m_3_16_3045439_shuffle.json
+/storage,/storage/anno_jsons/stage2_pandamovie34m_aes4.5_sucai5m_sam3m_c1m_vidal3m_4_16_3045439_shuffle.json
+/storage,/storage/anno_jsons/stage2_pandamovie34m_aes4.5_sucai5m_sam3m_c1m_vidal3m_5_16_3045439_shuffle.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_debug.txt b/scripts/train_data/merge_data_debug.txt
new file mode 100644
index 000000000..3450a2574
--- /dev/null
+++ b/scripts/train_data/merge_data_debug.txt
@@ -0,0 +1 @@
+/storage/dataset/movie,/storage/dataset/movie/TV01_clips_final_478625_llavanext_217405.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_jsonl.txt b/scripts/train_data/merge_data_jsonl.txt
new file mode 100644
index 000000000..5b7c2a780
--- /dev/null
+++ b/scripts/train_data/merge_data_jsonl.txt
@@ -0,0 +1 @@
+/storage,/storage/anno_jsons/stage1_nomj_sam5m_1_4_22673408.jsonl
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_mj.txt b/scripts/train_data/merge_data_mj.txt
new file mode 100644
index 000000000..53e0412b9
--- /dev/null
+++ b/scripts/train_data/merge_data_mj.txt
@@ -0,0 +1,4 @@
+/storage/dataset/civitai/Images_civitai_v1,/storage/anno_jsons/civitai_v1_1940032_resolution.json
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.json
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_4615265_resolution.json
+/storage/dataset/image/human_images,/storage/anno_jsons/human_images_162094_resolution.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_movie.txt b/scripts/train_data/merge_data_movie.txt
new file mode 100644
index 000000000..fc0c33ddf
--- /dev/null
+++ b/scripts/train_data/merge_data_movie.txt
@@ -0,0 +1,11 @@
+
+/storage/dataset/movie,/storage/dataset/movie/bbc01_clips_final_1610998_llavanext_246875.json
+/storage/dataset/movie,/storage/dataset/movie/bbc02_clips_final_1441415_llavanext_289761.json
+/storage/dataset/movie,/storage/dataset/movie/bbc03_clips_final_1905074_llavanext_519162.json
+/storage/dataset/movie,/storage/dataset/movie/bbc04_clips_final_1718543_llavanext_249487.json
+/storage/dataset/movie,/storage/dataset/movie/bbc05_clips_final_2684676_llavanext_416525.json
+/storage/dataset/movie,/storage/dataset/movie/TV01_clips_final_478625_llavanext_217405.json
+/storage/dataset/image/sam,/storage/anno_jsons/sam_image_11185255_resolution.json
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_1712571_resolution.json
+/storage/dataset/image/human_images,/storage/anno_jsons/human_images_162094_resolution.json
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_on_npu.txt b/scripts/train_data/merge_data_on_npu.txt
new file mode 100644
index 000000000..46cfd4222
--- /dev/null
+++ b/scripts/train_data/merge_data_on_npu.txt
@@ -0,0 +1,16 @@
+/home/image_data/captions/1_16_4441840.json
+/home/image_data/captions/2_16_4441840.json
+/home/image_data/captions/3_16_4441840.json
+/home/image_data/captions/4_16_4441840.json
+/home/image_data/captions/5_16_4441840.json
+/home/image_data/captions/6_16_4441840.json
+/home/image_data/captions/7_16_4441840.json
+/home/image_data/captions/8_16_4441840.json
+/home/image_data/captions/9_16_4441840.json
+/home/image_data/captions/10_16_4441840.json
+/home/image_data/captions/11_16_4441839.json
+/home/image_data/captions/12_16_4441839.json
+/home/image_data/captions/13_16_4441839.json
+/home/image_data/captions/14_16_4441839.json
+/home/image_data/captions/15_16_4441839.json
+/home/image_data/captions/16_16_4441839.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_panda_movie.txt b/scripts/train_data/merge_data_panda_movie.txt
new file mode 100644
index 000000000..bbe859653
--- /dev/null
+++ b/scripts/train_data/merge_data_panda_movie.txt
@@ -0,0 +1,7 @@
+/storage/dataset/panda70m,/storage/dataset/filter_aes/filter_panda70m_json/panda_53473351_18923406_4.75plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc01_clips_final_1610998_566004_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc02_clips_final_1441415_663717_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc03_clips_final_1905075_1178482_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc04_clips_final_1718543_639395_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc05_clips_final_2684681_1060985_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/TV01_clips_final_478625_364795_4.5plus.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_panda_movie_mj.txt b/scripts/train_data/merge_data_panda_movie_mj.txt
new file mode 100644
index 000000000..8a52a3a27
--- /dev/null
+++ b/scripts/train_data/merge_data_panda_movie_mj.txt
@@ -0,0 +1,10 @@
+/storage/dataset/panda70m,/storage/dataset/filter_aes/filter_panda70m_json/panda_53473351_18923406_4.75plus.json
+/storage/dataset/civitai/Images_civitai_v1,/storage/anno_jsons/civitai_v1_1940032_resolution.json
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.json
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_4615265_resolution.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc01_clips_final_1610998_566004_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc02_clips_final_1441415_663717_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc03_clips_final_1905075_1178482_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc04_clips_final_1718543_639395_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/bbc05_clips_final_2684681_1060985_4.5plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_all_4.5_json/TV01_clips_final_478625_364795_4.5plus.json
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_parquet.txt b/scripts/train_data/merge_data_parquet.txt
new file mode 100644
index 000000000..23ad123f4
--- /dev/null
+++ b/scripts/train_data/merge_data_parquet.txt
@@ -0,0 +1,13 @@
+/storage/dataset,/storage/zhubin/liuyihang/add_motion/output/Webvid_output_rela_path.parquet
+/storage/dataset,/storage/zhubin/liuyihang/add_motion/output/VIDAL_output_new_rela_path.parquet
+/storage/dataset,/storage/zhubin/liuyihang/add_aes/output/sucai_aes.parquet
+/storage/dataset/movie,/storage/dataset/movie/bbc01_clips_final_1610998_llavanext_246875.parquet
+/storage/dataset/movie,/storage/dataset/movie/bbc02_clips_final_1441415_llavanext_289761.parquet
+/storage/dataset/movie,/storage/dataset/movie/bbc03_clips_final_1905074_llavanext_519162.parquet
+/storage/dataset/movie,/storage/dataset/movie/bbc04_clips_final_1718543_llavanext_249487.parquet
+/storage/dataset/movie,/storage/dataset/movie/bbc05_clips_final_2684676_llavanext_416525.parquet
+/storage/dataset/movie,/storage/dataset/movie/TV01_clips_final_478625_llavanext_217405.parquet
+/storage/dataset/image/sam,/storage/anno_jsons/sam_image_11185255_resolution.parquet
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_1712571_resolution.parquet
+/storage/dataset/image/human_images,/storage/anno_jsons/human_images_162094_resolution.parquet
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.parquet
\ No newline at end of file
diff --git a/scripts/train_data/merge_data_zb.txt b/scripts/train_data/merge_data_zb.txt
new file mode 100644
index 000000000..ffe6dea43
--- /dev/null
+++ b/scripts/train_data/merge_data_zb.txt
@@ -0,0 +1,16 @@
+/storage/dataset,/storage/zhubin/liuyihang/add_aes/output/panda_aes.json
+/storage/dataset,/storage/zhubin/liuyihang/add_motion/output/Webvid_output_rela_path.json
+/storage/dataset,/storage/zhubin/liuyihang/add_motion/output/VIDAL_output_new_rela_path.json
+/storage/dataset,/storage/zhubin/liuyihang/add_aes/output/sucai_aes.json
+
+/storage/dataset/movie,/storage/dataset/movie/bbc01_clips_final_1610998_llavanext_246875.json
+/storage/dataset/movie,/storage/dataset/movie/bbc02_clips_final_1441415_llavanext_289761.json
+/storage/dataset/movie,/storage/dataset/movie/bbc03_clips_final_1905074_llavanext_519162.json
+/storage/dataset/movie,/storage/dataset/movie/bbc04_clips_final_1718543_llavanext_249487.json
+/storage/dataset/movie,/storage/dataset/movie/bbc05_clips_final_2684676_llavanext_416525.json
+/storage/dataset/movie,/storage/dataset/movie/TV01_clips_final_478625_llavanext_217405.json
+/storage/dataset/image/sam,/storage/anno_jsons/sam_image_11185255_resolution.json
+/storage/dataset/image/tuzhan_mj,/storage/anno_jsons/tuzhan_mj_1712571_resolution.json
+/storage/dataset/image/human_images,/storage/anno_jsons/human_images_162094_resolution.json
+/storage/dataset/civitai/Images_civitai_v1,/storage/anno_jsons/civitai_v1_1940032_resolution.json
+/storage/dataset/ideogram/Images_ideogram_v1,/storage/anno_jsons/ideogram_v1_71637_resolution.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data.txt b/scripts/train_data/video_data.txt
new file mode 100644
index 000000000..e0bffc577
--- /dev/null
+++ b/scripts/train_data/video_data.txt
@@ -0,0 +1,7 @@
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_json/bbc01_250508_117513_5.0plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_json/bbc02_289778_128670_5.0plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_json/bbc03_519184_281100_5.0plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_json/bbc04_249497_102768_5.0plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_json/bbc05_416548_207570_5.0plus.json
+/storage/dataset/movie,/storage/dataset/filter_aes/filter_movie_json/TV01_217419_141366_5.0plus.json
+/storage/dataset/panda70m,/storage/dataset/filter_aes/filter_panda70m_json/panda70m_last_6268414_flowValue_940432_5.4plus.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data_513.txt b/scripts/train_data/video_data_513.txt
new file mode 100644
index 000000000..796c7d1f3
--- /dev/null
+++ b/scripts/train_data/video_data_513.txt
@@ -0,0 +1 @@
+/storage/dataset/mixkit,/storage/anno_jsons/video_mixkit_65f_54735.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data_aesmovie_panda.txt b/scripts/train_data/video_data_aesmovie_panda.txt
new file mode 100644
index 000000000..b26ad56a8
--- /dev/null
+++ b/scripts/train_data/video_data_aesmovie_panda.txt
@@ -0,0 +1,7 @@
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc01_250508.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc02_289778.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc03_519184.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc04_249497.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc05_416548.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/TV01_217419.json
+/storage/dataset/panda70m,/storage/dataset/panda70m/panda70m_last_6268414_flowValue.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data_aesmovie_sucai_panda.txt b/scripts/train_data/video_data_aesmovie_sucai_panda.txt
new file mode 100644
index 000000000..f583df7f6
--- /dev/null
+++ b/scripts/train_data/video_data_aesmovie_sucai_panda.txt
@@ -0,0 +1,16 @@
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc01_250508.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc02_289778.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc03_519184.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc04_249497.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/bbc05_416548.json
+/storage/dataset/movie,/storage/dataset/movie/merge_cap_lb_7_11/TV01_217419.json
+/storage/dataset/panda70m,/storage/dataset/panda70m/panda70m_last_6268414_flowValue.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_canva_final_95441.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_clipchamp_final_452264.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_coverr_final_3002.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_istock_final_815070.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_kapwing_final_68473.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_mixkit_final_4490.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_pexels_final_267395.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_pixabay_v2_final_21608.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_storyblocks_final_1270947.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data_debug.txt b/scripts/train_data/video_data_debug.txt
new file mode 100644
index 000000000..4c9bd7bd1
--- /dev/null
+++ b/scripts/train_data/video_data_debug.txt
@@ -0,0 +1 @@
+/home/obs_data/20240426/20240704-02-bbc-01,/home/image_data/captions/TV01_clips_final_478625_llavanext_217405_aes478625.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data_on_npu.txt b/scripts/train_data/video_data_on_npu.txt
new file mode 100644
index 000000000..eac812cee
--- /dev/null
+++ b/scripts/train_data/video_data_on_npu.txt
@@ -0,0 +1,16 @@
+/home/obs_data,/home/image_data/captions/1_16_4764771.json
+/home/obs_data,/home/image_data/captions/2_16_4764771.json
+/home/obs_data,/home/image_data/captions/3_16_4764771.json
+/home/obs_data,/home/image_data/captions/4_16_4764771.json
+/home/obs_data,/home/image_data/captions/5_16_4764770.json
+/home/obs_data,/home/image_data/captions/6_16_4764770.json
+/home/obs_data,/home/image_data/captions/7_16_4764770.json
+/home/obs_data,/home/image_data/captions/8_16_4764770.json
+/home/obs_data,/home/image_data/captions/9_16_4764770.json
+/home/obs_data,/home/image_data/captions/10_16_4764770.json
+/home/obs_data,/home/image_data/captions/11_16_4764770.json
+/home/obs_data,/home/image_data/captions/12_16_4764770.json
+/home/obs_data,/home/image_data/captions/13_16_4764770.json
+/home/obs_data,/home/image_data/captions/14_16_4764770.json
+/home/obs_data,/home/image_data/captions/15_16_4764770.json
+/home/obs_data,/home/image_data/captions/16_16_4764770.json
diff --git a/scripts/train_data/video_data_sucai.txt b/scripts/train_data/video_data_sucai.txt
new file mode 100644
index 000000000..31537b50c
--- /dev/null
+++ b/scripts/train_data/video_data_sucai.txt
@@ -0,0 +1,9 @@
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_canva_final_95441.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_clipchamp_final_452264.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_coverr_final_3002.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_istock_final_815070.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_kapwing_final_68473.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_mixkit_final_4490.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_pexels_final_267395.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_pixabay_v2_final_21608.json
+/storage/dataset,/storage/zhubin/video_statistics_data/task1.5/Final_format_dataset_data_v2/step1.5_storyblocks_final_1270947.json
\ No newline at end of file
diff --git a/scripts/train_data/video_data_sucai_aes5.txt b/scripts/train_data/video_data_sucai_aes5.txt
new file mode 100644
index 000000000..fc4e028c0
--- /dev/null
+++ b/scripts/train_data/video_data_sucai_aes5.txt
@@ -0,0 +1,9 @@
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_canva_final_95441_65422_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_clipchamp_final_452264_231178_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_coverr_final_3002_1274_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_istock_final_815070_405120_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_kapwing_final_68473_27757_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_mixkit_final_4490_2634_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_pexels_final_267395_148323_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_pixabay_v2_final_21608_11400_5.0plus.json
+/storage/dataset,/storage/dataset/filter_aes/filter_sucai_json/step1.5_storyblocks_final_1270947_825948_5.0plus.json
\ No newline at end of file
diff --git a/scripts/videogpt/train_videogpt.sh b/scripts/videogpt/train_videogpt.sh
deleted file mode 100644
index 6c06e0a64..000000000
--- a/scripts/videogpt/train_videogpt.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-
-accelerate launch \
- --config_file scripts/accelerate_configs/ddp_config.yaml \
- opensora/train/train_videogpt.py \
- --do_train \
- --seed 1234 \
- --data_path "/remote-home/yeyang/UCF-101/" \
- --per_device_train_batch_size 1 \
- --gradient_accumulation_steps 1 \
- --learning_rate 7e-4 \
- --weight_decay 0. \
- --max_steps 20000 \
- --lr_scheduler_type cosine \
- --max_grad_norm 1.0 \
- --save_strategy steps \
- --save_total_limit 5 \
- --logging_steps 5 \
- --save_steps 1000 \
- --n_codes 2048 \
- --n_hiddens 240 \
- --embedding_dim 4 \
- --n_res_layers 4 \
- --downsample "4,4,4" \
- --resolution 240 \
- --sequence_length 16 \
- --output_dir results/videogpt_488_256_16 \
- --bf16 True \
- --fp16 False \
- --report_to tensorboard \
- --dataloader_num_workers 10
diff --git a/scripts/videogpt/train_videogpt_dsz2.sh b/scripts/videogpt/train_videogpt_dsz2.sh
deleted file mode 100644
index 7875333eb..000000000
--- a/scripts/videogpt/train_videogpt_dsz2.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-export ACCELERATE_GRADIENT_ACCUMULATION_STEPS=1
-
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \
- opensora/train/train_videogpt.py \
- --do_train \
- --seed 1234 \
- --data_path "datasets/UCF-101/" \
- --per_device_train_batch_size 32 \
- --gradient_accumulation_steps $ACCELERATE_GRADIENT_ACCUMULATION_STEPS \
- --learning_rate 7e-4 \
- --weight_decay 0. \
- --num_train_epochs 2 \
- --lr_scheduler_type cosine \
- --max_grad_norm 1.0 \
- --save_strategy steps \
- --save_total_limit 5 \
- --logging_steps 5 \
- --save_steps 10000 \
- --n_codes 1024 \
- --n_hiddens 240 \
- --embedding_dim 4 \
- --n_res_layers 4 \
- --downsample "4,4,4" \
- --resolution 128 \
- --sequence_length 16 \
- --output_dir results/videogpt_444_128 \
- --bf16 True \
- --fp16 False \
- --report_to tensorboard
diff --git a/scripts/videogpt/train_videogpt_dsz3.sh b/scripts/videogpt/train_videogpt_dsz3.sh
deleted file mode 100644
index 484084236..000000000
--- a/scripts/videogpt/train_videogpt_dsz3.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-export ACCELERATE_GRADIENT_ACCUMULATION_STEPS=1
-
-accelerate launch \
- --config_file scripts/accelerate_configs/deepspeed_zero3_config.yaml \
- opensora/train/train_videogpt.py \
- --do_train \
- --seed 1234 \
- --data_path "datasets/UCF-101/" \
- --per_device_train_batch_size 32 \
- --gradient_accumulation_steps $ACCELERATE_GRADIENT_ACCUMULATION_STEPS \
- --learning_rate 7e-4 \
- --weight_decay 0. \
- --num_train_epochs 2 \
- --lr_scheduler_type cosine \
- --max_grad_norm 1.0 \
- --save_strategy steps \
- --save_total_limit 5 \
- --logging_steps 5 \
- --save_steps 10000 \
- --n_codes 1024 \
- --n_hiddens 240 \
- --embedding_dim 4 \
- --n_res_layers 4 \
- --downsample "4,4,4" \
- --resolution 128 \
- --sequence_length 16 \
- --output_dir results/videogpt_444_128 \
- --bf16 True \
- --fp16 False \
- --report_to tensorboard
diff --git a/seed.txt b/seed.txt
new file mode 100644
index 000000000..71abb8505
--- /dev/null
+++ b/seed.txt
@@ -0,0 +1,200 @@
+54772192
+1818559
+96453589
+31562712
+57322344
+69472081
+111012
+21408319
+92640719
+61294860
+21327469
+53263210
+40582092
+28394045
+71159969
+2157594
+58200604
+23406796
+76254691
+21916831
+62388905
+63738670
+26204979
+7771881
+59943270
+80583057
+95793202
+52859755
+2075209
+99525631
+33755453
+35407358
+27398776
+18898781
+74552050
+61167570
+35845404
+33774409
+79535361
+89984239
+57416385
+83451617
+93069601
+7626451
+90670347
+12456687
+88103643
+36406985
+53298385
+79583560
+31475157
+69403517
+10853392
+79612930
+57941600
+64597087
+3837970
+55989364
+30518141
+30458584
+90178174
+78102020
+85241869
+56602460
+4287298
+69650713
+84917682
+22260866
+86060151
+50690452
+79466300
+15142491
+16189759
+48108189
+40868115
+26720024
+57424386
+92772187
+66605653
+82892525
+43330140
+47838312
+54852636
+91021440
+38201699
+88828952
+22709517
+40070762
+13020663
+83479581
+12206776
+88705935
+51064471
+70264569
+62293291
+67887552
+91338327
+56759295
+42384628
+19062215
+89549704
+36762545
+65980816
+80073026
+49304259
+84459031
+46163305
+92297335
+91470376
+4243674
+54249696
+94095222
+79354938
+27136429
+38474371
+43628631
+72990601
+48425792
+71070649
+62709171
+20040550
+46032149
+40764114
+50477866
+79805183
+35062693
+15657913
+72197366
+10639508
+53191828
+74447311
+71914086
+59133527
+98084299
+91062387
+83689168
+41848719
+21283911
+15325792
+81328486
+77584362
+52674907
+9253255
+89806702
+67348822
+68776016
+42791133
+93708934
+47683245
+43970228
+21994327
+57504058
+20830613
+31554532
+76251166
+34808236
+59340023
+47416276
+13518519
+80865077
+82164806
+95435324
+41236744
+64021281
+78588359
+5529302
+63957306
+60796342
+91025348
+57426714
+77145225
+22724514
+33351724
+28468671
+68301472
+25527807
+27943666
+52461789
+38671842
+81392557
+25561570
+46263168
+89589577
+77038019
+71423560
+67019718
+56009735
+68932922
+98988290
+26565463
+60536315
+13782449
+32549829
+5696904
+56946718
+17171986
+6197126
+1463826
+84204677
+12099676
\ No newline at end of file
diff --git a/seed_list.py b/seed_list.py
new file mode 100644
index 000000000..de4bee0f1
--- /dev/null
+++ b/seed_list.py
@@ -0,0 +1,8 @@
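+# Draw 200 random integer seeds in [0, 1e8) and write them to seed.txt, one per line.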
+import numpy as np
+
+
+randnum = np.random.randint(0, int(1e8), size=200)
+
+with open('seed.txt', 'w') as f:
+ f.write('\n'.join([str(i) for i in randnum]))
\ No newline at end of file
diff --git a/tools/get_img_info.py b/tools/get_img_info.py
new file mode 100644
index 000000000..f09375f8e
--- /dev/null
+++ b/tools/get_img_info.py
@@ -0,0 +1,51 @@
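+# Collect (height, width) for every image found recursively under image_root and
+# dump them to JSON as a list of {path, resolution} records.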
+# Note: an earlier single-process variant of this script produced
+# human_images_{N}_resolution.json from /storage/dataset/image/human_images/;
+# the multiprocessing version below supersedes it.
+
+import cv2
+from tqdm import tqdm
+from glob import glob
+import json
+import os
+from multiprocessing import Pool
+
+def get_image_size(image_path):
+    """
+    Given an image path, return (path, height, width); (path, None, None) on failure.
+    """
+    try:
+        image = cv2.imread(image_path)
+        height, width = image.shape[:2]
+        return image_path, height, width
+    except Exception:
+        return image_path, None, None
+
+def process_image_paths(results):
+    # Convert (path, height, width) tuples into annotation items, making each
+    # path relative to image_root (set at module level inside __main__ below).
+    items = []
+    for image_path, height, width in results:
+        path = image_path.replace(image_root if image_root.endswith('/') else image_root + '/', '')
+        item = dict(path=path, resolution=dict(height=height, width=width))
+        items.append(item)
+    return items
+
+if __name__ == '__main__':
+ image_root = '/storage/dataset/image/tuzhan_mj'
+ save_root = '/storage/dataset/image'
+ os.makedirs(save_root, exist_ok=True)
+ save_name = 'tuzhan_mj_{}_resolution.json'
+ all_paths = glob(os.path.join(image_root, '**', '*.jpg'), recursive=True)
+
+    # Fixed pool size used here; set num_processes = os.cpu_count() to match the host.
+    num_processes = 128
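+    # Probe image sizes in parallel; imap yields results in the same order as all_paths.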
+ with Pool(num_processes) as pool:
+ results = list(tqdm(pool.imap(get_image_size, all_paths), total=len(all_paths)))
+
+ items = process_image_paths(results)
+
+ with open(os.path.join(save_root, save_name.format(len(items))), 'w') as f:
+ json.dump(items, f, indent=2)
diff --git a/tools/merge_imginfo_to_anno.py b/tools/merge_imginfo_to_anno.py
new file mode 100644
index 000000000..067b7b9ee
--- /dev/null
+++ b/tools/merge_imginfo_to_anno.py
@@ -0,0 +1,43 @@
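+# Merge per-image resolution records (as produced by tools/get_img_info.py) into
+# an annotation JSON, matching entries by their path relative to the image root.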
+
+from tqdm import tqdm
+import json
+import os
+
+# anno_path = '/storage/anno_jsons/human_images_162094.json'
+# img_info_path = '/storage/dataset/image/human_images_162094_resolution.json'
+# save_root = '/storage/anno_jsons'
+# save_name = 'human_images_{}_resolution.json'
+
+
+# anno_path = '/storage/anno_jsons/tuzhan_mj_4615265.json'
+# img_info_path = '/storage/dataset/image/tuzhan_mj_4615530_resolution.json'
+# save_root = '/storage/anno_jsons'
+# save_name = 'tuzhan_mj_{}_resolution.json'
+
+
+anno_path = '/storage/anno_jsons/sam_image_11185255.json'
+img_info_path = '/storage/dataset/image/sam_image_11185362_resolution.json'
+save_root = '/storage/anno_jsons'
+save_name = 'sam_image_{}_resolution.json'
+
+with open(anno_path, 'r') as f:
+ anno = json.load(f)
+with open(img_info_path, 'r') as f:
+ img_info = json.load(f)
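+# Index resolutions by relative path for O(1) lookups during the merge.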
+img_info = {i['path']: i['resolution'] for i in img_info}
+
+items = []
+cnt = 0  # annotations with no matching resolution record
+for i in tqdm(anno):
+    resolution = img_info.get(i['path'])
+    if resolution is None:
+        cnt += 1
+        continue
+    i['resolution'] = resolution
+    items.append(i)
+print(f'merged {len(items)} items, skipped {cnt} without resolution info')
+with open(os.path.join(save_root, save_name.format(len(items))), 'w') as f:
+ json.dump(items, f, indent=2)
\ No newline at end of file
diff --git a/validation_dir/prompt.txt b/validation_dir/prompt.txt
new file mode 100644
index 000000000..592bd5a4d
--- /dev/null
+++ b/validation_dir/prompt.txt
@@ -0,0 +1,27 @@
+A rocket ascends slowly into the sky.
+A coffee cup with "anytext" foam floating on it.
+A beautiful girl is blinking her eyes.
+A flower and bird painting. This video has a silky transition effect.
+The scenery of snow capped mountains. This video has a silky transition effect.
+The night view of the city. This video has a silky transition effect.
+A rocket ascends slowly into the sky.
+Along the coast, variously sized boats float on the lake.
+The landscape at sunset is profound and expansive.
+A beautiful girl is blinking her eyes.
+A beautiful girl is blinking her eyes.
+A coffee cup with "anytext" foam floating on it.
+A person is eating pizza on a plate.
+A group of people are cheering and celebrating.
+A green dinosaur is playing tennis.
+A female elf is smiling.
+The girl with red hair is turning her head and smiling.
+A racing car is speeding on the road. Environmental perspective.
+A family photo of four people smiling and playing around with each other.
+The old lady hugged her granddaughter and they laughed happily.
+Drones fly from snow capped mountains to cliffs, with waterfalls flowing. This video has a silky transition effect.
+A video with no transition from the cliff to the bottom, with the waterfall flowing.
+The scene from the seabed to the cliff. This video has a silky transition effect.
+The boy in the video abandoned his girlfriend and instead embraced the girl in red clothes.
+A beautiful girl is stroking her hair and blinking her eyes.
+A fashionable female model strolls on the streets of Tokyo.
+A panda playing guitar on the grass.
\ No newline at end of file